1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
/// Because this also performs alloca promotion, it can be thought of as
/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
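///
/// For illustration only (hypothetical IR, not taken from a test), a
/// promotable aggregate such as
///   %pair = alloca { i32, i32 }
///   %a = getelementptr inbounds { i32, i32 }, ptr %pair, i32 0, i32 0
///   store i32 %x, ptr %a
///   %v = load i32, ptr %a
/// is split into per-field allocas and then promoted, so %v simply becomes %x
/// and the memory traffic disappears.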
22///
23//===----------------------------------------------------------------------===//
24
25#include "llvm/Transforms/Scalar/SROA.h"
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
30#include "llvm/ADT/PointerIntPair.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
33#include "llvm/ADT/SmallBitVector.h"
34#include "llvm/ADT/SmallPtrSet.h"
35#include "llvm/ADT/SmallVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
40#include "llvm/ADT/iterator_range.h"
41#include "llvm/Analysis/AssumptionCache.h"
42#include "llvm/Analysis/DomTreeUpdater.h"
43#include "llvm/Analysis/GlobalsModRef.h"
44#include "llvm/Analysis/Loads.h"
45#include "llvm/Analysis/PtrUseVisitor.h"
46#include "llvm/Analysis/ValueTracking.h"
47#include "llvm/Config/llvm-config.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
50#include "llvm/IR/ConstantFolder.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
55#include "llvm/IR/DebugInfoMetadata.h"
56#include "llvm/IR/DerivedTypes.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Instructions.h"
64#include "llvm/IR/IntrinsicInst.h"
65#include "llvm/IR/LLVMContext.h"
66#include "llvm/IR/Metadata.h"
67#include "llvm/IR/Module.h"
68#include "llvm/IR/Operator.h"
69#include "llvm/IR/PassManager.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
75#include "llvm/InitializePasses.h"
76#include "llvm/Pass.h"
77#include "llvm/Support/Casting.h"
78#include "llvm/Support/CommandLine.h"
79#include "llvm/Support/Compiler.h"
80#include "llvm/Support/Debug.h"
81#include "llvm/Support/ErrorHandling.h"
82#include "llvm/Support/raw_ostream.h"
83#include "llvm/Transforms/Scalar.h"
84#include "llvm/Transforms/Utils/BasicBlockUtils.h"
85#include "llvm/Transforms/Utils/Local.h"
86#include "llvm/Transforms/Utils/PromoteMemToReg.h"
87#include "llvm/Transforms/Utils/SSAUpdater.h"
88#include <algorithm>
89#include <cassert>
90#include <cstddef>
91#include <cstdint>
92#include <cstring>
93#include <iterator>
94#include <string>
95#include <tuple>
96#include <utility>
97#include <variant>
98#include <vector>
99
100using namespace llvm;
101
102#define DEBUG_TYPE "sroa"
103
104STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
105STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
106STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
107STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
108STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
109STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
110STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
111STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
112STATISTIC(NumLoadsPredicated,
113 "Number of loads rewritten into predicated loads to allow promotion");
114STATISTIC(
115 NumStoresPredicated,
    "Number of stores rewritten into predicated stores to allow promotion");
117STATISTIC(NumDeleted, "Number of instructions deleted");
118STATISTIC(NumVectorized, "Number of vectorized aggregates");
119
120/// Disable running mem2reg during SROA in order to test or debug SROA.
121static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(Val: false),
122 cl::Hidden);
123namespace {
124
125class AllocaSliceRewriter;
126class AllocaSlices;
127class Partition;
128
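/// Tracks, for a load through a select, whether each hand of the select (its
/// true and false values) is known to be safe to speculate. A sketch of the
/// intended use: speculatability is recorded per hand via setAsSpeculatable()
/// and queried via isSpeculatable()/areAllSpeculatable() when deciding how to
/// rewrite the load.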
129class SelectHandSpeculativity {
130 unsigned char Storage = 0; // None are speculatable by default.
131 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
132 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
133public:
134 SelectHandSpeculativity() = default;
135 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
136 bool isSpeculatable(bool isTrueVal) const;
137 bool areAllSpeculatable() const;
138 bool areAnySpeculatable() const;
139 bool areNoneSpeculatable() const;
140 // For interop as int half of PointerIntPair.
141 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
142 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
143};
144static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
145
146using PossiblySpeculatableLoad =
147 PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
148using UnspeculatableStore = StoreInst *;
149using RewriteableMemOp =
150 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
151using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
152
153/// An optimization pass providing Scalar Replacement of Aggregates.
154///
155/// This pass takes allocations which can be completely analyzed (that is, they
156/// don't escape) and tries to turn them into scalar SSA values. There are
157/// a few steps to this process.
158///
159/// 1) It takes allocations of aggregates and analyzes the ways in which they
160/// are used to try to split them into smaller allocations, ideally of
161/// a single scalar data type. It will split up memcpy and memset accesses
162/// as necessary and try to isolate individual scalar accesses.
163/// 2) It will transform accesses into forms which are suitable for SSA value
164/// promotion. This can be replacing a memset with a scalar store of an
165/// integer value, or it can involve speculating operations on a PHI or
166/// select to be a PHI or select of the results.
167/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
168/// onto insert and extract operations on a vector value, and convert them to
169/// this form. By doing so, it will enable promotion of vector aggregates to
170/// SSA vector values.
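///
/// As a hypothetical illustration of step 3, accesses of the form
///   %v = alloca <4 x float>
///   %p = getelementptr inbounds <4 x float>, ptr %v, i64 0, i64 2
///   store float %f, ptr %p
/// can be rewritten as insertelement/extractelement operations on an SSA value
/// of type <4 x float>, allowing the whole alloca to be promoted.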
171class SROA {
172 LLVMContext *const C;
173 DomTreeUpdater *const DTU;
174 AssumptionCache *const AC;
175 const bool PreserveCFG;
176
177 /// Worklist of alloca instructions to simplify.
178 ///
179 /// Each alloca in the function is added to this. Each new alloca formed gets
180 /// added to it as well to recursively simplify unless that alloca can be
181 /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if
  /// not already present to ensure it is re-visited.
184 SmallSetVector<AllocaInst *, 16> Worklist;
185
186 /// A collection of instructions to delete.
187 /// We try to batch deletions to simplify code and make things a bit more
  /// efficient. We also make sure there are no dangling pointers.
189 SmallVector<WeakVH, 8> DeadInsts;
190
191 /// Post-promotion worklist.
192 ///
193 /// Sometimes we discover an alloca which has a high probability of becoming
194 /// viable for SROA after a round of promotion takes place. In those cases,
195 /// the alloca is enqueued here for re-processing.
196 ///
197 /// Note that we have to be very careful to clear allocas out of this list in
198 /// the event they are deleted.
199 SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;
200
201 /// A collection of alloca instructions we can directly promote.
202 SetVector<AllocaInst *, SmallVector<AllocaInst *>,
203 SmallPtrSet<AllocaInst *, 16>, 16>
204 PromotableAllocas;
205
206 /// A worklist of PHIs to speculate prior to promoting allocas.
207 ///
208 /// All of these PHIs have been checked for the safety of speculation and by
209 /// being speculated will allow promoting allocas currently in the promotable
210 /// queue.
211 SmallSetVector<PHINode *, 8> SpeculatablePHIs;
212
213 /// A worklist of select instructions to rewrite prior to promoting
214 /// allocas.
215 SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
216
217 /// Select instructions that use an alloca and are subsequently loaded can be
  /// rewritten to load both input pointers and then select between the results,
219 /// allowing the load of the alloca to be promoted.
220 /// From this:
221 /// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
222 /// %V = load <type>, ptr %P2
223 /// to:
224 /// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
225 /// %V2 = load <type>, ptr %Other
226 /// %V = select i1 %cond, <type> %V1, <type> %V2
227 ///
  /// We can do this to a select if its only uses are loads, and if each
  /// operand of the select either can be loaded unconditionally or we are
  /// allowed to perform CFG modifications to predicate the load.
  /// If we find an intervening bitcast whose single use is the load, the
  /// promotion is still allowed.
233 static std::optional<RewriteableMemOps>
234 isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
235
236public:
237 SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
238 SROAOptions PreserveCFG_)
239 : C(C), DTU(DTU), AC(AC),
240 PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
241
242 /// Main run method used by both the SROAPass and by the legacy pass.
243 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);
244
245private:
246 friend class AllocaSliceRewriter;
247
248 bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
249 AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
250 bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
251 bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
252 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
253 void clobberUse(Use &U);
254 bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
255 bool promoteAllocas();
256};
257
258} // end anonymous namespace
259
260/// Calculate the fragment of a variable to use when slicing a store
261/// based on the slice dimensions, existing fragment, and base storage
262/// fragment.
263/// Results:
264/// UseFrag - Use Target as the new fragment.
265/// UseNoFrag - The new slice already covers the whole variable.
266/// Skip - The new alloca slice doesn't include this variable.
267/// FIXME: Can we use calculateFragmentIntersect instead?
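/// Hypothetical example: for a 64-bit variable where neither the base storage
/// nor the existing expression carries a fragment, a new slice covering bits
/// [32, 64) produces the target fragment (offset 32, size 32) and returns
/// UseFrag, whereas a slice covering bits [0, 64) spans the whole variable and
/// returns UseNoFrag.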
268namespace {
269enum FragCalcResult { UseFrag, UseNoFrag, Skip };
270}
271static FragCalcResult
272calculateFragment(DILocalVariable *Variable,
273 uint64_t NewStorageSliceOffsetInBits,
274 uint64_t NewStorageSliceSizeInBits,
275 std::optional<DIExpression::FragmentInfo> StorageFragment,
276 std::optional<DIExpression::FragmentInfo> CurrentFragment,
277 DIExpression::FragmentInfo &Target) {
  // If the base storage describes part of the variable, apply the offset and
279 // the size constraint.
280 if (StorageFragment) {
281 Target.SizeInBits =
282 std::min(a: NewStorageSliceSizeInBits, b: StorageFragment->SizeInBits);
283 Target.OffsetInBits =
284 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
285 } else {
286 Target.SizeInBits = NewStorageSliceSizeInBits;
287 Target.OffsetInBits = NewStorageSliceOffsetInBits;
288 }
289
290 // If this slice extracts the entirety of an independent variable from a
291 // larger alloca, do not produce a fragment expression, as the variable is
292 // not fragmented.
293 if (!CurrentFragment) {
294 if (auto Size = Variable->getSizeInBits()) {
295 // Treat the current fragment as covering the whole variable.
296 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
297 if (Target == CurrentFragment)
298 return UseNoFrag;
299 }
300 }
301
302 // No additional work to do if there isn't a fragment already, or there is
303 // but it already exactly describes the new assignment.
304 if (!CurrentFragment || *CurrentFragment == Target)
305 return UseFrag;
306
307 // Reject the target fragment if it doesn't fit wholly within the current
308 // fragment. TODO: We could instead chop up the target to fit in the case of
309 // a partial overlap.
310 if (Target.startInBits() < CurrentFragment->startInBits() ||
311 Target.endInBits() > CurrentFragment->endInBits())
312 return Skip;
313
314 // Target fits within the current fragment, return it.
315 return UseFrag;
316}
317
318static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) {
319 return DebugVariable(DVI->getVariable(), std::nullopt,
320 DVI->getDebugLoc().getInlinedAt());
321}
322static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
323 return DebugVariable(DVR->getVariable(), std::nullopt,
324 DVR->getDebugLoc().getInlinedAt());
325}
326
327/// Helpers for handling new and old debug info modes in migrateDebugInfo.
328/// These overloads unwrap a DbgInstPtr {Instruction* | DbgRecord*} union based
329/// on the \p Unused parameter type.
330DbgVariableRecord *UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord *Unused) {
331 (void)Unused;
332 return static_cast<DbgVariableRecord *>(cast<DbgRecord *>(Val&: P));
333}
334DbgAssignIntrinsic *UnwrapDbgInstPtr(DbgInstPtr P, DbgAssignIntrinsic *Unused) {
335 (void)Unused;
336 return static_cast<DbgAssignIntrinsic *>(cast<Instruction *>(Val&: P));
337}
338
339/// Find linked dbg.assign and generate a new one with the correct
/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr, the
/// value component is copied from the old dbg.assign to the new one.
342/// \param OldAlloca Alloca for the variable before splitting.
343/// \param IsSplit True if the store (not necessarily alloca)
344/// is being split.
345/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
346/// \param SliceSizeInBits New number of bits being written to.
347/// \param OldInst Instruction that is being split.
348/// \param Inst New instruction performing this part of the
349/// split store.
350/// \param Dest Store destination.
351/// \param Value Stored value.
352/// \param DL Datalayout.
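/// For example, if a 128-bit store to OldAlloca is split into two 64-bit
/// stores, each new store receives its own dbg.assign (and distinct
/// DIAssignID) whose fragment, computed via calculateFragment above, covers
/// only the 64 bits that the new store writes.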
353static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
354 uint64_t OldAllocaOffsetInBits,
355 uint64_t SliceSizeInBits, Instruction *OldInst,
356 Instruction *Inst, Value *Dest, Value *Value,
357 const DataLayout &DL) {
358 auto MarkerRange = at::getAssignmentMarkers(Inst: OldInst);
359 auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(Inst: OldInst);
360 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
361 if (MarkerRange.empty() && DVRAssignMarkerRange.empty())
362 return;
363
364 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
365 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
366 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
367 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
368 << "\n");
369 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
370 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
371 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
372 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
373 if (Value)
374 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
375
376 /// Map of aggregate variables to their fragment associated with OldAlloca.
377 DenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
378 BaseFragments;
379 for (auto *DAI : at::getAssignmentMarkers(Inst: OldAlloca))
380 BaseFragments[getAggregateVariable(DVI: DAI)] =
381 DAI->getExpression()->getFragmentInfo();
382 for (auto *DVR : at::getDVRAssignmentMarkers(Inst: OldAlloca))
383 BaseFragments[getAggregateVariable(DVR)] =
384 DVR->getExpression()->getFragmentInfo();
385
386 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
387 // one). It shouldn't already have one: assert this assumption.
388 assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
389 DIAssignID *NewID = nullptr;
390 auto &Ctx = Inst->getContext();
391 DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
392 assert(OldAlloca->isStaticAlloca());
393
394 auto MigrateDbgAssign = [&](auto *DbgAssign) {
395 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
396 << "\n");
397 auto *Expr = DbgAssign->getExpression();
398 bool SetKillLocation = false;
399
400 if (IsSplit) {
401 std::optional<DIExpression::FragmentInfo> BaseFragment;
402 {
403 auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
404 if (R == BaseFragments.end())
405 return;
406 BaseFragment = R->second;
407 }
408 std::optional<DIExpression::FragmentInfo> CurrentFragment =
409 Expr->getFragmentInfo();
410 DIExpression::FragmentInfo NewFragment;
411 FragCalcResult Result = calculateFragment(
412 DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
413 BaseFragment, CurrentFragment, NewFragment);
414
415 if (Result == Skip)
416 return;
417 if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
418 if (CurrentFragment) {
419 // Rewrite NewFragment to be relative to the existing one (this is
420 // what createFragmentExpression wants). CalculateFragment has
421 // already resolved the size for us. FIXME: Should it return the
422 // relative fragment too?
423 NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
424 }
425 // Add the new fragment info to the existing expression if possible.
426 if (auto E = DIExpression::createFragmentExpression(
427 Expr, OffsetInBits: NewFragment.OffsetInBits, SizeInBits: NewFragment.SizeInBits)) {
428 Expr = *E;
429 } else {
430 // Otherwise, add the new fragment info to an empty expression and
431 // discard the value component of this dbg.assign as the value cannot
432 // be computed with the new fragment.
433 Expr = *DIExpression::createFragmentExpression(
434 Expr: DIExpression::get(Context&: Expr->getContext(), Elements: {}),
435 OffsetInBits: NewFragment.OffsetInBits, SizeInBits: NewFragment.SizeInBits);
436 SetKillLocation = true;
437 }
438 }
439 }
440
441 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
442 if (!NewID) {
443 NewID = DIAssignID::getDistinct(Context&: Ctx);
444 Inst->setMetadata(KindID: LLVMContext::MD_DIAssignID, Node: NewID);
445 }
446
447 ::Value *NewValue = Value ? Value : DbgAssign->getValue();
448 auto *NewAssign = UnwrapDbgInstPtr(
449 DIB.insertDbgAssign(LinkedInstr: Inst, Val: NewValue, SrcVar: DbgAssign->getVariable(), ValExpr: Expr,
450 Addr: Dest, AddrExpr: DIExpression::get(Context&: Expr->getContext(), Elements: {}),
451 DL: DbgAssign->getDebugLoc()),
452 DbgAssign);
453
454 // If we've updated the value but the original dbg.assign has an arglist
455 // then kill it now - we can't use the requested new value.
456 // We can't replace the DIArgList with the new value as it'd leave
457 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
458 // an arglist). And we can't keep the DIArgList in case the linked store
459 // is being split - in which case the DIArgList + expression may no longer
460 // be computing the correct value.
461 // This should be a very rare situation as it requires the value being
462 // stored to differ from the dbg.assign (i.e., the value has been
463 // represented differently in the debug intrinsic for some reason).
464 SetKillLocation |=
465 Value && (DbgAssign->hasArgList() ||
466 !DbgAssign->getExpression()->isSingleLocationExpression());
467 if (SetKillLocation)
468 NewAssign->setKillLocation();
469
470 // We could use more precision here at the cost of some additional (code)
471 // complexity - if the original dbg.assign was adjacent to its store, we
472 // could position this new dbg.assign adjacent to its store rather than the
    // old dbg.assign. That would result in interleaved dbg.assigns rather than
474 // what we get now:
475 // split store !1
476 // split store !2
477 // dbg.assign !1
478 // dbg.assign !2
    // This (current behaviour) results in debug assignments being noted as
    // slightly offset (in code) from the store. In practice this
481 // should have little effect on the debugging experience due to the fact
482 // that all the split stores should get the same line number.
483 NewAssign->moveBefore(DbgAssign->getIterator());
484
485 NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
486 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
487 };
488
489 for_each(Range&: MarkerRange, F: MigrateDbgAssign);
490 for_each(Range&: DVRAssignMarkerRange, F: MigrateDbgAssign);
491}
492
493namespace {
494
495/// A custom IRBuilder inserter which prefixes all names, but only in
496/// Assert builds.
497class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
498 std::string Prefix;
499
500 Twine getNameWithPrefix(const Twine &Name) const {
501 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
502 }
503
504public:
505 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
506
507 void InsertHelper(Instruction *I, const Twine &Name,
508 BasicBlock::iterator InsertPt) const override {
509 IRBuilderDefaultInserter::InsertHelper(I, Name: getNameWithPrefix(Name),
510 InsertPt);
511 }
512};
513
514/// Provide a type for IRBuilder that drops names in release builds.
515using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
516
517/// A used slice of an alloca.
518///
519/// This structure represents a slice of an alloca used by some instruction. It
520/// stores both the begin and end offsets of this use, a pointer to the use
521/// itself, and a flag indicating whether we can classify the use as splittable
522/// or not when forming partitions of the alloca.
523class Slice {
524 /// The beginning offset of the range.
525 uint64_t BeginOffset = 0;
526
527 /// The ending offset, not included in the range.
528 uint64_t EndOffset = 0;
529
530 /// Storage for both the use of this slice and whether it can be
531 /// split.
532 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
533
534public:
535 Slice() = default;
536
537 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
538 : BeginOffset(BeginOffset), EndOffset(EndOffset),
539 UseAndIsSplittable(U, IsSplittable) {}
540
541 uint64_t beginOffset() const { return BeginOffset; }
542 uint64_t endOffset() const { return EndOffset; }
543
544 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
545 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
546
547 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
548
549 bool isDead() const { return getUse() == nullptr; }
550 void kill() { UseAndIsSplittable.setPointer(nullptr); }
551
552 /// Support for ordering ranges.
553 ///
554 /// This provides an ordering over ranges such that start offsets are
555 /// always increasing, and within equal start offsets, the end offsets are
556 /// decreasing. Thus the spanning range comes first in a cluster with the
557 /// same start position.
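  /// For example (assuming equal splittability), slices [0,16), [0,8) and
  /// [8,12) are ordered as [0,16), [0,8), [8,12): the slice spanning a cluster
  /// of equal start offsets sorts first.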
558 bool operator<(const Slice &RHS) const {
559 if (beginOffset() < RHS.beginOffset())
560 return true;
561 if (beginOffset() > RHS.beginOffset())
562 return false;
563 if (isSplittable() != RHS.isSplittable())
564 return !isSplittable();
565 if (endOffset() > RHS.endOffset())
566 return true;
567 return false;
568 }
569
570 /// Support comparison with a single offset to allow binary searches.
571 friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
572 uint64_t RHSOffset) {
573 return LHS.beginOffset() < RHSOffset;
574 }
575 friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
576 const Slice &RHS) {
577 return LHSOffset < RHS.beginOffset();
578 }
579
580 bool operator==(const Slice &RHS) const {
581 return isSplittable() == RHS.isSplittable() &&
582 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
583 }
584 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
585};
586
587/// Representation of the alloca slices.
588///
589/// This class represents the slices of an alloca which are formed by its
590/// various uses. If a pointer escapes, we can't fully build a representation
591/// for the slices used and we reflect that in this structure. The uses are
592/// stored, sorted by increasing beginning offset and with unsplittable slices
593/// starting at a particular offset before splittable slices.
594class AllocaSlices {
595public:
596 /// Construct the slices of a particular alloca.
597 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
598
599 /// Test whether a pointer to the allocation escapes our analysis.
600 ///
601 /// If this is true, the slices are never fully built and should be
602 /// ignored.
603 bool isEscaped() const { return PointerEscapingInstr; }
604 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
605
606 /// Support for iterating over the slices.
607 /// @{
608 using iterator = SmallVectorImpl<Slice>::iterator;
609 using range = iterator_range<iterator>;
610
611 iterator begin() { return Slices.begin(); }
612 iterator end() { return Slices.end(); }
613
614 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
615 using const_range = iterator_range<const_iterator>;
616
617 const_iterator begin() const { return Slices.begin(); }
618 const_iterator end() const { return Slices.end(); }
619 /// @}
620
621 /// Erase a range of slices.
622 void erase(iterator Start, iterator Stop) { Slices.erase(CS: Start, CE: Stop); }
623
624 /// Insert new slices for this alloca.
625 ///
626 /// This moves the slices into the alloca's slices collection, and re-sorts
627 /// everything so that the usual ordering properties of the alloca's slices
628 /// hold.
629 void insert(ArrayRef<Slice> NewSlices) {
630 int OldSize = Slices.size();
631 Slices.append(in_start: NewSlices.begin(), in_end: NewSlices.end());
632 auto SliceI = Slices.begin() + OldSize;
633 std::stable_sort(first: SliceI, last: Slices.end());
634 std::inplace_merge(first: Slices.begin(), middle: SliceI, last: Slices.end());
635 }
636
637 // Forward declare the iterator and range accessor for walking the
638 // partitions.
639 class partition_iterator;
640 iterator_range<partition_iterator> partitions();
641
642 /// Access the dead users for this alloca.
643 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
644
645 /// Access Uses that should be dropped if the alloca is promotable.
646 ArrayRef<Use *> getDeadUsesIfPromotable() const {
647 return DeadUseIfPromotable;
648 }
649
650 /// Access the dead operands referring to this alloca.
651 ///
  /// These are operands which cannot actually be used to refer to the
653 /// alloca as they are outside its range and the user doesn't correct for
654 /// that. These mostly consist of PHI node inputs and the like which we just
655 /// need to replace with undef.
656 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
657
658#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
659 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
660 void printSlice(raw_ostream &OS, const_iterator I,
661 StringRef Indent = " ") const;
662 void printUse(raw_ostream &OS, const_iterator I,
663 StringRef Indent = " ") const;
664 void print(raw_ostream &OS) const;
665 void dump(const_iterator I) const;
666 void dump() const;
667#endif
668
669private:
670 template <typename DerivedT, typename RetT = void> class BuilderBase;
671 class SliceBuilder;
672
673 friend class AllocaSlices::SliceBuilder;
674
675#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
676 /// Handle to alloca instruction to simplify method interfaces.
677 AllocaInst &AI;
678#endif
679
680 /// The instruction responsible for this alloca not having a known set
681 /// of slices.
682 ///
683 /// When an instruction (potentially) escapes the pointer to the alloca, we
684 /// store a pointer to that here and abort trying to form slices of the
685 /// alloca. This will be null if the alloca slices are analyzed successfully.
686 Instruction *PointerEscapingInstr;
687 Instruction *PointerEscapingInstrReadOnly;
688
689 /// The slices of the alloca.
690 ///
691 /// We store a vector of the slices formed by uses of the alloca here. This
692 /// vector is sorted by increasing begin offset, and then the unsplittable
693 /// slices before the splittable ones. See the Slice inner class for more
694 /// details.
695 SmallVector<Slice, 8> Slices;
696
697 /// Instructions which will become dead if we rewrite the alloca.
698 ///
699 /// Note that these are not separated by slice. This is because we expect an
700 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
701 /// all these instructions can simply be removed and replaced with poison as
702 /// they come from outside of the allocated space.
703 SmallVector<Instruction *, 8> DeadUsers;
704
  /// Uses which will become dead if we can promote the alloca.
706 SmallVector<Use *, 8> DeadUseIfPromotable;
707
708 /// Operands which will become dead if we rewrite the alloca.
709 ///
710 /// These are operands that in their particular use can be replaced with
711 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
712 /// to PHI nodes and the like. They aren't entirely dead (there might be
  /// a GEP back into the bounds using it elsewhere) and neither is the PHI,
  /// but we want to swap this particular input for poison to simplify the use
  /// lists of the alloca.
716 SmallVector<Use *, 8> DeadOperands;
717};
718
719/// A partition of the slices.
720///
721/// An ephemeral representation for a range of slices which can be viewed as
722/// a partition of the alloca. This range represents a span of the alloca's
723/// memory which cannot be split, and provides access to all of the slices
724/// overlapping some part of the partition.
725///
726/// Objects of this type are produced by traversing the alloca's slices, but
727/// are only ephemeral and not persistent.
728class Partition {
729private:
730 friend class AllocaSlices;
731 friend class AllocaSlices::partition_iterator;
732
733 using iterator = AllocaSlices::iterator;
734
735 /// The beginning and ending offsets of the alloca for this
736 /// partition.
737 uint64_t BeginOffset = 0, EndOffset = 0;
738
739 /// The start and end iterators of this partition.
740 iterator SI, SJ;
741
742 /// A collection of split slice tails overlapping the partition.
743 SmallVector<Slice *, 4> SplitTails;
744
745 /// Raw constructor builds an empty partition starting and ending at
746 /// the given iterator.
747 Partition(iterator SI) : SI(SI), SJ(SI) {}
748
749public:
750 /// The start offset of this partition.
751 ///
752 /// All of the contained slices start at or after this offset.
753 uint64_t beginOffset() const { return BeginOffset; }
754
755 /// The end offset of this partition.
756 ///
757 /// All of the contained slices end at or before this offset.
758 uint64_t endOffset() const { return EndOffset; }
759
760 /// The size of the partition.
761 ///
762 /// Note that this can never be zero.
763 uint64_t size() const {
764 assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
765 return EndOffset - BeginOffset;
766 }
767
768 /// Test whether this partition contains no slices, and merely spans
769 /// a region occupied by split slices.
770 bool empty() const { return SI == SJ; }
771
772 /// \name Iterate slices that start within the partition.
773 /// These may be splittable or unsplittable. They have a begin offset >= the
774 /// partition begin offset.
775 /// @{
776 // FIXME: We should probably define a "concat_iterator" helper and use that
777 // to stitch together pointee_iterators over the split tails and the
778 // contiguous iterators of the partition. That would give a much nicer
779 // interface here. We could then additionally expose filtered iterators for
  // split, unsplit, and unsplittable slices based on the usage patterns.
781 iterator begin() const { return SI; }
782 iterator end() const { return SJ; }
783 /// @}
784
785 /// Get the sequence of split slice tails.
786 ///
787 /// These tails are of slices which start before this partition but are
788 /// split and overlap into the partition. We accumulate these while forming
789 /// partitions.
790 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
791};
792
793} // end anonymous namespace
794
795/// An iterator over partitions of the alloca's slices.
796///
797/// This iterator implements the core algorithm for partitioning the alloca's
798/// slices. It is a forward iterator as we don't support backtracking for
799/// efficiency reasons, and re-use a single storage area to maintain the
800/// current set of split slices.
801///
802/// It is templated on the slice iterator type to use so that it can operate
803/// with either const or non-const slice iterators.
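/// As a hypothetical illustration: for slices [0,8) (unsplittable), [0,16)
/// (splittable) and [12,16) (unsplittable), the partitions visited are [0,8),
/// then the empty partition [8,12) covered only by the split tail of [0,16),
/// and finally [12,16).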
804class AllocaSlices::partition_iterator
805 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
806 Partition> {
807 friend class AllocaSlices;
808
809 /// Most of the state for walking the partitions is held in a class
810 /// with a nice interface for examining them.
811 Partition P;
812
813 /// We need to keep the end of the slices to know when to stop.
814 AllocaSlices::iterator SE;
815
816 /// We also need to keep track of the maximum split end offset seen.
817 /// FIXME: Do we really?
818 uint64_t MaxSplitSliceEndOffset = 0;
819
  /// Sets the partition to be empty at the given iterator, and sets the
821 /// end iterator.
822 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
823 : P(SI), SE(SE) {
824 // If not already at the end, advance our state to form the initial
825 // partition.
826 if (SI != SE)
827 advance();
828 }
829
830 /// Advance the iterator to the next partition.
831 ///
832 /// Requires that the iterator not be at the end of the slices.
833 void advance() {
834 assert((P.SI != SE || !P.SplitTails.empty()) &&
835 "Cannot advance past the end of the slices!");
836
837 // Clear out any split uses which have ended.
838 if (!P.SplitTails.empty()) {
839 if (P.EndOffset >= MaxSplitSliceEndOffset) {
840 // If we've finished all splits, this is easy.
841 P.SplitTails.clear();
842 MaxSplitSliceEndOffset = 0;
843 } else {
844 // Remove the uses which have ended in the prior partition. This
845 // cannot change the max split slice end because we just checked that
846 // the prior partition ended prior to that max.
847 llvm::erase_if(C&: P.SplitTails,
848 P: [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
849 assert(llvm::any_of(P.SplitTails,
850 [&](Slice *S) {
851 return S->endOffset() == MaxSplitSliceEndOffset;
852 }) &&
853 "Could not find the current max split slice offset!");
854 assert(llvm::all_of(P.SplitTails,
855 [&](Slice *S) {
856 return S->endOffset() <= MaxSplitSliceEndOffset;
857 }) &&
858 "Max split slice end offset is not actually the max!");
859 }
860 }
861
862 // If P.SI is already at the end, then we've cleared the split tail and
863 // now have an end iterator.
864 if (P.SI == SE) {
865 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
866 return;
867 }
868
869 // If we had a non-empty partition previously, set up the state for
870 // subsequent partitions.
871 if (P.SI != P.SJ) {
872 // Accumulate all the splittable slices which started in the old
873 // partition into the split list.
874 for (Slice &S : P)
875 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
876 P.SplitTails.push_back(Elt: &S);
877 MaxSplitSliceEndOffset =
878 std::max(a: S.endOffset(), b: MaxSplitSliceEndOffset);
879 }
880
881 // Start from the end of the previous partition.
882 P.SI = P.SJ;
883
884 // If P.SI is now at the end, we at most have a tail of split slices.
885 if (P.SI == SE) {
886 P.BeginOffset = P.EndOffset;
887 P.EndOffset = MaxSplitSliceEndOffset;
888 return;
889 }
890
      // If we have split slices and the next slice is after a gap and is
      // not splittable, immediately form an empty partition for the split
      // slices up until the next slice begins.
894 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
895 !P.SI->isSplittable()) {
896 P.BeginOffset = P.EndOffset;
897 P.EndOffset = P.SI->beginOffset();
898 return;
899 }
900 }
901
902 // OK, we need to consume new slices. Set the end offset based on the
903 // current slice, and step SJ past it. The beginning offset of the
904 // partition is the beginning offset of the next slice unless we have
905 // pre-existing split slices that are continuing, in which case we begin
906 // at the prior end offset.
907 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
908 P.EndOffset = P.SI->endOffset();
909 ++P.SJ;
910
911 // There are two strategies to form a partition based on whether the
912 // partition starts with an unsplittable slice or a splittable slice.
913 if (!P.SI->isSplittable()) {
914 // When we're forming an unsplittable region, it must always start at
915 // the first slice and will extend through its end.
916 assert(P.BeginOffset == P.SI->beginOffset());
917
918 // Form a partition including all of the overlapping slices with this
919 // unsplittable slice.
920 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
921 if (!P.SJ->isSplittable())
922 P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
923 ++P.SJ;
924 }
925
926 // We have a partition across a set of overlapping unsplittable
927 // partitions.
928 return;
929 }
930
931 // If we're starting with a splittable slice, then we need to form
932 // a synthetic partition spanning it and any other overlapping splittable
    // slices.
934 assert(P.SI->isSplittable() && "Forming a splittable partition!");
935
936 // Collect all of the overlapping splittable slices.
937 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
938 P.SJ->isSplittable()) {
939 P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
940 ++P.SJ;
941 }
942
    // Back up P.EndOffset if we ended the span early when encountering an
944 // unsplittable slice. This synthesizes the early end offset of
945 // a partition spanning only splittable slices.
946 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
947 assert(!P.SJ->isSplittable());
948 P.EndOffset = P.SJ->beginOffset();
949 }
950 }
951
952public:
953 bool operator==(const partition_iterator &RHS) const {
954 assert(SE == RHS.SE &&
955 "End iterators don't match between compared partition iterators!");
956
    // The observed positions of partitions are marked by the P.SI iterator and
958 // the emptiness of the split slices. The latter is only relevant when
959 // P.SI == SE, as the end iterator will additionally have an empty split
960 // slices list, but the prior may have the same P.SI and a tail of split
961 // slices.
962 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
963 assert(P.SJ == RHS.P.SJ &&
964 "Same set of slices formed two different sized partitions!");
965 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
966 "Same slice position with differently sized non-empty split "
967 "slice tails!");
968 return true;
969 }
970 return false;
971 }
972
973 partition_iterator &operator++() {
974 advance();
975 return *this;
976 }
977
978 Partition &operator*() { return P; }
979};
980
981/// A forward range over the partitions of the alloca's slices.
982///
983/// This accesses an iterator range over the partitions of the alloca's
984/// slices. It computes these partitions on the fly based on the overlapping
985/// offsets of the slices and the ability to split them. It will visit "empty"
986/// partitions to cover regions of the alloca only accessed via split
987/// slices.
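/// A minimal usage sketch, where AS is an AllocaSlices:
///   for (Partition &P : AS.partitions())
///     ... examine P.beginOffset(), P.endOffset(), P.splitSliceTails() ...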
988iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
989 return make_range(x: partition_iterator(begin(), end()),
990 y: partition_iterator(end(), end()));
991}
992
993static Value *foldSelectInst(SelectInst &SI) {
994 // If the condition being selected on is a constant or the same value is
995 // being selected between, fold the select. Yes this does (rarely) happen
996 // early on.
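  // For example (hypothetical IR), "select i1 true, ptr %a, ptr %b" folds to
  // %a, as does "select i1 %c, ptr %a, ptr %a".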
997 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: SI.getCondition()))
998 return SI.getOperand(i_nocapture: 1 + CI->isZero());
999 if (SI.getOperand(i_nocapture: 1) == SI.getOperand(i_nocapture: 2))
1000 return SI.getOperand(i_nocapture: 1);
1001
1002 return nullptr;
1003}
1004
1005/// A helper that folds a PHI node or a select.
1006static Value *foldPHINodeOrSelectInst(Instruction &I) {
1007 if (PHINode *PN = dyn_cast<PHINode>(Val: &I)) {
1008 // If PN merges together the same value, return that value.
1009 return PN->hasConstantValue();
1010 }
1011 return foldSelectInst(SI&: cast<SelectInst>(Val&: I));
1012}
1013
1014/// Builder for the alloca slices.
1015///
1016/// This class builds a set of alloca slices by recursively visiting the uses
1017/// of an alloca and making a slice for each load and store at each offset.
1018class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1019 friend class PtrUseVisitor<SliceBuilder>;
1020 friend class InstVisitor<SliceBuilder>;
1021
1022 using Base = PtrUseVisitor<SliceBuilder>;
1023
1024 const uint64_t AllocSize;
1025 AllocaSlices &AS;
1026
1027 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1028 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1029
1030 /// Set to de-duplicate dead instructions found in the use walk.
1031 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1032
1033public:
1034 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1035 : PtrUseVisitor<SliceBuilder>(DL),
1036 AllocSize(DL.getTypeAllocSize(Ty: AI.getAllocatedType()).getFixedValue()),
1037 AS(AS) {}
1038
1039private:
1040 void markAsDead(Instruction &I) {
1041 if (VisitedDeadInsts.insert(Ptr: &I).second)
1042 AS.DeadUsers.push_back(Elt: &I);
1043 }
1044
1045 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1046 bool IsSplittable = false) {
1047 // Completely skip uses which have a zero size or start either before or
1048 // past the end of the allocation.
1049 if (Size == 0 || Offset.uge(RHS: AllocSize)) {
1050 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1051 << Offset
1052 << " which has zero size or starts outside of the "
1053 << AllocSize << " byte alloca:\n"
1054 << " alloca: " << AS.AI << "\n"
1055 << " use: " << I << "\n");
1056 return markAsDead(I);
1057 }
1058
1059 uint64_t BeginOffset = Offset.getZExtValue();
1060 uint64_t EndOffset = BeginOffset + Size;
1061
1062 // Clamp the end offset to the end of the allocation. Note that this is
1063 // formulated to handle even the case where "BeginOffset + Size" overflows.
1064 // This may appear superficially to be something we could ignore entirely,
1065 // but that is not so! There may be widened loads or PHI-node uses where
1066 // some instructions are dead but not others. We can't completely ignore
1067 // them, and so have to record at least the information here.
1068 assert(AllocSize >= BeginOffset); // Established above.
1069 if (Size > AllocSize - BeginOffset) {
1070 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1071 << Offset << " to remain within the " << AllocSize
1072 << " byte alloca:\n"
1073 << " alloca: " << AS.AI << "\n"
1074 << " use: " << I << "\n");
1075 EndOffset = AllocSize;
1076 }
1077
1078 AS.Slices.push_back(Elt: Slice(BeginOffset, EndOffset, U, IsSplittable));
1079 }
1080
1081 void visitBitCastInst(BitCastInst &BC) {
1082 if (BC.use_empty())
1083 return markAsDead(I&: BC);
1084
1085 return Base::visitBitCastInst(BC);
1086 }
1087
1088 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1089 if (ASC.use_empty())
1090 return markAsDead(I&: ASC);
1091
1092 return Base::visitAddrSpaceCastInst(ASC);
1093 }
1094
1095 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1096 if (GEPI.use_empty())
1097 return markAsDead(I&: GEPI);
1098
1099 return Base::visitGetElementPtrInst(GEPI);
1100 }
1101
1102 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1103 uint64_t Size, bool IsVolatile) {
1104 // We allow splitting of non-volatile loads and stores where the type is an
1105 // integer type. These may be used to implement 'memcpy' or other "transfer
1106 // of bits" patterns.
1107 bool IsSplittable =
1108 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1109
1110 insertUse(I, Offset, Size, IsSplittable);
1111 }
1112
1113 void visitLoadInst(LoadInst &LI) {
1114 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1115 "All simple FCA loads should have been pre-split");
1116
1117 // If there is a load with an unknown offset, we can still perform store
1118 // to load forwarding for other known-offset loads.
1119 if (!IsOffsetKnown)
1120 return PI.setEscapedReadOnly(&LI);
1121
1122 TypeSize Size = DL.getTypeStoreSize(Ty: LI.getType());
1123 if (Size.isScalable()) {
1124 unsigned VScale = LI.getFunction()->getVScaleValue();
1125 if (!VScale)
1126 return PI.setAborted(&LI);
1127
1128 Size = TypeSize::getFixed(ExactSize: Size.getKnownMinValue() * VScale);
1129 }
1130
1131 return handleLoadOrStore(Ty: LI.getType(), I&: LI, Offset, Size: Size.getFixedValue(),
1132 IsVolatile: LI.isVolatile());
1133 }
1134
1135 void visitStoreInst(StoreInst &SI) {
1136 Value *ValOp = SI.getValueOperand();
1137 if (ValOp == *U)
1138 return PI.setEscapedAndAborted(&SI);
1139 if (!IsOffsetKnown)
1140 return PI.setAborted(&SI);
1141
1142 TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValOp->getType());
1143 if (StoreSize.isScalable()) {
1144 unsigned VScale = SI.getFunction()->getVScaleValue();
1145 if (!VScale)
1146 return PI.setAborted(&SI);
1147
1148 StoreSize = TypeSize::getFixed(ExactSize: StoreSize.getKnownMinValue() * VScale);
1149 }
1150
1151 uint64_t Size = StoreSize.getFixedValue();
1152
1153 // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
1155 // ignore it. Note that this is more strict than the generic clamping
1156 // behavior of insertUse. We also try to handle cases which might run the
1157 // risk of overflow.
1158 // FIXME: We should instead consider the pointer to have escaped if this
1159 // function is being instrumented for addressing bugs or race conditions.
1160 if (Size > AllocSize || Offset.ugt(RHS: AllocSize - Size)) {
1161 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1162 << Offset << " which extends past the end of the "
1163 << AllocSize << " byte alloca:\n"
1164 << " alloca: " << AS.AI << "\n"
1165 << " use: " << SI << "\n");
1166 return markAsDead(I&: SI);
1167 }
1168
1169 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1170 "All simple FCA stores should have been pre-split");
1171 handleLoadOrStore(Ty: ValOp->getType(), I&: SI, Offset, Size, IsVolatile: SI.isVolatile());
1172 }
1173
1174 void visitMemSetInst(MemSetInst &II) {
1175 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1176 ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1177 if ((Length && Length->getValue() == 0) ||
1178 (IsOffsetKnown && Offset.uge(RHS: AllocSize)))
1179 // Zero-length mem transfer intrinsics can be ignored entirely.
1180 return markAsDead(I&: II);
1181
1182 if (!IsOffsetKnown)
1183 return PI.setAborted(&II);
1184
1185 insertUse(I&: II, Offset,
1186 Size: Length ? Length->getLimitedValue()
1187 : AllocSize - Offset.getLimitedValue(),
1188 IsSplittable: (bool)Length);
1189 }
1190
1191 void visitMemTransferInst(MemTransferInst &II) {
1192 ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1193 if (Length && Length->getValue() == 0)
1194 // Zero-length mem transfer intrinsics can be ignored entirely.
1195 return markAsDead(I&: II);
1196
1197 // Because we can visit these intrinsics twice, also check to see if the
1198 // first time marked this instruction as dead. If so, skip it.
1199 if (VisitedDeadInsts.count(Ptr: &II))
1200 return;
1201
1202 if (!IsOffsetKnown)
1203 return PI.setAborted(&II);
1204
1205 // This side of the transfer is completely out-of-bounds, and so we can
1206 // nuke the entire transfer. However, we also need to nuke the other side
1207 // if already added to our partitions.
1208 // FIXME: Yet another place we really should bypass this when
1209 // instrumenting for ASan.
1210 if (Offset.uge(RHS: AllocSize)) {
1211 SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
1212 MemTransferSliceMap.find(Val: &II);
1213 if (MTPI != MemTransferSliceMap.end())
1214 AS.Slices[MTPI->second].kill();
1215 return markAsDead(I&: II);
1216 }
1217
1218 uint64_t RawOffset = Offset.getLimitedValue();
1219 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1220
1221 // Check for the special case where the same exact value is used for both
1222 // source and dest.
1223 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1224 // For non-volatile transfers this is a no-op.
1225 if (!II.isVolatile())
1226 return markAsDead(I&: II);
1227
1228 return insertUse(I&: II, Offset, Size, /*IsSplittable=*/false);
1229 }
1230
1231 // If we have seen both source and destination for a mem transfer, then
1232 // they both point to the same alloca.
1233 bool Inserted;
1234 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1235 std::tie(args&: MTPI, args&: Inserted) =
1236 MemTransferSliceMap.insert(KV: std::make_pair(x: &II, y: AS.Slices.size()));
1237 unsigned PrevIdx = MTPI->second;
1238 if (!Inserted) {
1239 Slice &PrevP = AS.Slices[PrevIdx];
1240
1241 // Check if the begin offsets match and this is a non-volatile transfer.
1242 // In that case, we can completely elide the transfer.
1243 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1244 PrevP.kill();
1245 return markAsDead(I&: II);
1246 }
1247
1248 // Otherwise we have an offset transfer within the same alloca. We can't
1249 // split those.
1250 PrevP.makeUnsplittable();
1251 }
1252
1253 // Insert the use now that we've fixed up the splittable nature.
1254 insertUse(I&: II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1255
1256 // Check that we ended up with a valid index in the map.
1257 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1258 "Map index doesn't point back to a slice with this user.");
1259 }
1260
  // Disable SROA for any intrinsics except for lifetime markers and
  // invariant.group intrinsics (launder/strip).
1263 // FIXME: What about debug intrinsics? This matches old behavior, but
1264 // doesn't make sense.
1265 void visitIntrinsicInst(IntrinsicInst &II) {
1266 if (II.isDroppable()) {
1267 AS.DeadUseIfPromotable.push_back(Elt: U);
1268 return;
1269 }
1270
1271 if (!IsOffsetKnown)
1272 return PI.setAborted(&II);
1273
1274 if (II.isLifetimeStartOrEnd()) {
1275 ConstantInt *Length = cast<ConstantInt>(Val: II.getArgOperand(i: 0));
1276 uint64_t Size = std::min(a: AllocSize - Offset.getLimitedValue(),
1277 b: Length->getLimitedValue());
1278 insertUse(I&: II, Offset, Size, IsSplittable: true);
1279 return;
1280 }
1281
1282 if (II.isLaunderOrStripInvariantGroup()) {
1283 insertUse(I&: II, Offset, Size: AllocSize, IsSplittable: true);
1284 enqueueUsers(I&: II);
1285 return;
1286 }
1287
1288 Base::visitIntrinsicInst(II);
1289 }
1290
1291 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1292 // We consider any PHI or select that results in a direct load or store of
1293 // the same offset to be a viable use for slicing purposes. These uses
1294 // are considered unsplittable and the size is the maximum loaded or stored
1295 // size.
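    // For example (hypothetical IR), a select between two pointers into the
    // alloca whose only transitive uses are an i32 load and an i64 store of an
    // unrelated value yields Size = 8 and no unsafe instruction.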
1296 SmallPtrSet<Instruction *, 4> Visited;
1297 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1298 Visited.insert(Ptr: Root);
1299 Uses.push_back(Elt: std::make_pair(x: cast<Instruction>(Val&: *U), y&: Root));
1300 const DataLayout &DL = Root->getDataLayout();
1301 // If there are no loads or stores, the access is dead. We mark that as
1302 // a size zero access.
1303 Size = 0;
1304 do {
1305 Instruction *I, *UsedI;
1306 std::tie(args&: UsedI, args&: I) = Uses.pop_back_val();
1307
1308 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
1309 TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
1310 if (LoadSize.isScalable()) {
1311 PI.setAborted(LI);
1312 return nullptr;
1313 }
1314 Size = std::max(a: Size, b: LoadSize.getFixedValue());
1315 continue;
1316 }
1317 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
1318 Value *Op = SI->getOperand(i_nocapture: 0);
1319 if (Op == UsedI)
1320 return SI;
1321 TypeSize StoreSize = DL.getTypeStoreSize(Ty: Op->getType());
1322 if (StoreSize.isScalable()) {
1323 PI.setAborted(SI);
1324 return nullptr;
1325 }
1326 Size = std::max(a: Size, b: StoreSize.getFixedValue());
1327 continue;
1328 }
1329
1330 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: I)) {
1331 if (!GEP->hasAllZeroIndices())
1332 return GEP;
1333 } else if (!isa<BitCastInst>(Val: I) && !isa<PHINode>(Val: I) &&
1334 !isa<SelectInst>(Val: I) && !isa<AddrSpaceCastInst>(Val: I)) {
1335 return I;
1336 }
1337
1338 for (User *U : I->users())
1339 if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
1340 Uses.push_back(Elt: std::make_pair(x&: I, y: cast<Instruction>(Val: U)));
1341 } while (!Uses.empty());
1342
1343 return nullptr;
1344 }
1345
1346 void visitPHINodeOrSelectInst(Instruction &I) {
1347 assert(isa<PHINode>(I) || isa<SelectInst>(I));
1348 if (I.use_empty())
1349 return markAsDead(I);
1350
1351 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1352 // instructions in this BB, which may be required during rewriting. Bail out
1353 // on these cases.
1354 if (isa<PHINode>(Val: I) &&
1355 I.getParent()->getFirstInsertionPt() == I.getParent()->end())
1356 return PI.setAborted(&I);
1357
1358 // TODO: We could use simplifyInstruction here to fold PHINodes and
    // SelectInsts. However, doing so requires changing the current
1360 // dead-operand-tracking mechanism. For instance, suppose neither loading
1361 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1362 // trap either. However, if we simply replace %U with undef using the
1363 // current dead-operand-tracking mechanism, "load (select undef, undef,
1364 // %other)" may trap because the select may return the first operand
1365 // "undef".
1366 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1367 if (Result == *U)
1368 // If the result of the constant fold will be the pointer, recurse
1369 // through the PHI/select as if we had RAUW'ed it.
1370 enqueueUsers(I);
1371 else
1372 // Otherwise the operand to the PHI/select is dead, and we can replace
1373 // it with poison.
1374 AS.DeadOperands.push_back(Elt: U);
1375
1376 return;
1377 }
1378
1379 if (!IsOffsetKnown)
1380 return PI.setAborted(&I);
1381
1382 // See if we already have computed info on this node.
1383 uint64_t &Size = PHIOrSelectSizes[&I];
1384 if (!Size) {
1385 // This is a new PHI/Select, check for an unsafe use of it.
1386 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(Root: &I, Size))
1387 return PI.setAborted(UnsafeI);
1388 }
1389
1390 // For PHI and select operands outside the alloca, we can't nuke the entire
1391 // phi or select -- the other side might still be relevant, so we special
1392 // case them here and use a separate structure to track the operands
1393 // themselves which should be replaced with poison.
1394 // FIXME: This should instead be escaped in the event we're instrumenting
1395 // for address sanitization.
1396 if (Offset.uge(RHS: AllocSize)) {
1397 AS.DeadOperands.push_back(Elt: U);
1398 return;
1399 }
1400
1401 insertUse(I, Offset, Size);
1402 }
1403
1404 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(I&: PN); }
1405
1406 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(I&: SI); }
1407
1408 /// Disable SROA entirely if there are unhandled users of the alloca.
1409 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1410
1411 void visitCallBase(CallBase &CB) {
    // If the callee only reads memory through this operand and captures at
    // most its address (not the full provenance), mark the pointer as
    // EscapedReadOnly rather than escaped.
1414 if (CB.isDataOperand(U) &&
1415 !capturesFullProvenance(CC: CB.getCaptureInfo(OpNo: U->getOperandNo())) &&
1416 CB.onlyReadsMemory(OpNo: U->getOperandNo())) {
1417 PI.setEscapedReadOnly(&CB);
1418 return;
1419 }
1420
1421 Base::visitCallBase(CB);
1422 }
1423};
1424
1425AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1426 :
1427#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1428 AI(AI),
1429#endif
1430 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1431 SliceBuilder PB(DL, AI, *this);
1432 SliceBuilder::PtrInfo PtrI = PB.visitPtr(I&: AI);
1433 if (PtrI.isEscaped() || PtrI.isAborted()) {
1434 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1435 // possibly by just storing the PtrInfo in the AllocaSlices.
1436 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1437 : PtrI.getAbortingInst();
1438 assert(PointerEscapingInstr && "Did not track a bad instruction");
1439 return;
1440 }
1441 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1442
1443 llvm::erase_if(C&: Slices, P: [](const Slice &S) { return S.isDead(); });
1444
1445 // Sort the uses. This arranges for the offsets to be in ascending order,
1446 // and the sizes to be in descending order.
1447 llvm::stable_sort(Range&: Slices);
1448}
1449
1450#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1451
1452void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1453 StringRef Indent) const {
1454 printSlice(OS, I, Indent);
1455 OS << "\n";
1456 printUse(OS, I, Indent);
1457}
1458
1459void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1460 StringRef Indent) const {
1461 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1462 << " slice #" << (I - begin())
1463 << (I->isSplittable() ? " (splittable)" : "");
1464}
1465
1466void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1467 StringRef Indent) const {
1468 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1469}
1470
1471void AllocaSlices::print(raw_ostream &OS) const {
1472 if (PointerEscapingInstr) {
1473 OS << "Can't analyze slices for alloca: " << AI << "\n"
1474 << " A pointer to this alloca escaped by:\n"
1475 << " " << *PointerEscapingInstr << "\n";
1476 return;
1477 }
1478
1479 if (PointerEscapingInstrReadOnly)
1480 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1481
1482 OS << "Slices of alloca: " << AI << "\n";
1483 for (const_iterator I = begin(), E = end(); I != E; ++I)
1484 print(OS, I);
1485}
1486
1487LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1488 print(dbgs(), I);
1489}
1490LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1491
1492#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1493
1494/// Walk the range of a partitioning looking for a common type to cover this
1495/// sequence of slices.
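/// Returns the common type of those users (or null when they disagree) paired
/// with the widest viable integer type seen. For example (illustrative), a
/// partition whose whole extent is used only by i32 loads and stores yields
/// {i32, i32}, while a mix of float and i32 users yields {nullptr, i32}.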
1496static std::pair<Type *, IntegerType *>
1497findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1498 uint64_t EndOffset) {
1499 Type *Ty = nullptr;
1500 bool TyIsCommon = true;
1501 IntegerType *ITy = nullptr;
1502
1503 // Note that we need to look at *every* alloca slice's Use to ensure we
1504 // always get consistent results regardless of the order of slices.
1505 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1506 Use *U = I->getUse();
1507 if (isa<IntrinsicInst>(Val: *U->getUser()))
1508 continue;
1509 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1510 continue;
1511
1512 Type *UserTy = nullptr;
1513 if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
1514 UserTy = LI->getType();
1515 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
1516 UserTy = SI->getValueOperand()->getType();
1517 }
1518
1519 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(Val: UserTy)) {
1520 // If the type is larger than the partition, skip it. We only encounter
1521 // this for split integer operations where we want to use the type of the
1522 // entity causing the split. Also skip if the type is not a byte width
1523 // multiple.
1524 if (UserITy->getBitWidth() % 8 != 0 ||
1525 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1526 continue;
1527
1528 // Track the largest bitwidth integer type used in this way in case there
1529 // is no common type.
1530 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1531 ITy = UserITy;
1532 }
1533
1534 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1535 // depend on types skipped above.
1536 if (!UserTy || (Ty && Ty != UserTy))
1537 TyIsCommon = false; // Give up on anything but an iN type.
1538 else
1539 Ty = UserTy;
1540 }
1541
1542 return {TyIsCommon ? Ty : nullptr, ITy};
1543}
1544
1545/// PHI instructions that use an alloca and are subsequently loaded can be
1546/// rewritten to load both input pointers in the pred blocks and then PHI the
1547/// results, allowing the load of the alloca to be promoted.
1548/// From this:
1549/// %P2 = phi [i32* %Alloca, i32* %Other]
1550/// %V = load i32* %P2
1551/// to:
1552/// %V1 = load i32* %Alloca -> will be mem2reg'd
1553/// ...
1554/// %V2 = load i32* %Other
1555/// ...
1556/// %V = phi [i32 %V1, i32 %V2]
1557///
1558/// We can do this to a select if its only uses are loads and if the operands
1559/// to the select can be loaded unconditionally.
1560///
1561/// FIXME: This should be hoisted into a generic utility, likely in
1562/// Transforms/Util/Local.h
1563static bool isSafePHIToSpeculate(PHINode &PN) {
1564 const DataLayout &DL = PN.getDataLayout();
1565
1566 // For now, we can only do this promotion if the load is in the same block
1567 // as the PHI, and if there are no stores between the phi and load.
1568 // TODO: Allow recursive phi users.
1569 // TODO: Allow stores.
1570 BasicBlock *BB = PN.getParent();
1571 Align MaxAlign;
1572 uint64_t APWidth = DL.getIndexTypeSizeInBits(Ty: PN.getType());
1573 Type *LoadType = nullptr;
1574 for (User *U : PN.users()) {
1575 LoadInst *LI = dyn_cast<LoadInst>(Val: U);
1576 if (!LI || !LI->isSimple())
1577 return false;
1578
1579 // For now we only allow loads in the same block as the PHI. This is
1580 // a common case that happens when instcombine merges two loads through
1581 // a PHI.
1582 if (LI->getParent() != BB)
1583 return false;
1584
1585 if (LoadType) {
1586 if (LoadType != LI->getType())
1587 return false;
1588 } else {
1589 LoadType = LI->getType();
1590 }
1591
1592 // Ensure that there are no instructions between the PHI and the load that
1593 // could store.
1594 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1595 if (BBI->mayWriteToMemory())
1596 return false;
1597
1598 MaxAlign = std::max(a: MaxAlign, b: LI->getAlign());
1599 }
1600
1601 if (!LoadType)
1602 return false;
1603
1604 APInt LoadSize =
1605 APInt(APWidth, DL.getTypeStoreSize(Ty: LoadType).getFixedValue());
1606
  // We can only transform this if it is safe to push the loads into the
  // predecessor blocks. The only thing to watch out for is that we can't put
  // a possibly trapping load in the predecessor if the incoming edge is
  // critical.
1610 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1611 Instruction *TI = PN.getIncomingBlock(i: Idx)->getTerminator();
1612 Value *InVal = PN.getIncomingValue(i: Idx);
1613
1614 // If the value is produced by the terminator of the predecessor (an
1615 // invoke) or it has side-effects, there is no valid place to put a load
1616 // in the predecessor.
1617 if (TI == InVal || TI->mayHaveSideEffects())
1618 return false;
1619
1620 // If the predecessor has a single successor, then the edge isn't
1621 // critical.
1622 if (TI->getNumSuccessors() == 1)
1623 continue;
1624
1625 // If this pointer is always safe to load, or if we can prove that there
1626 // is already a load in the block, then we can move the load to the pred
1627 // block.
1628 if (isSafeToLoadUnconditionally(V: InVal, Alignment: MaxAlign, Size: LoadSize, DL, ScanFrom: TI))
1629 continue;
1630
1631 return false;
1632 }
1633
1634 return true;
1635}
1636
1637static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1638 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1639
1640 LoadInst *SomeLoad = cast<LoadInst>(Val: PN.user_back());
1641 Type *LoadTy = SomeLoad->getType();
1642 IRB.SetInsertPoint(&PN);
1643 PHINode *NewPN = IRB.CreatePHI(Ty: LoadTy, NumReservedValues: PN.getNumIncomingValues(),
1644 Name: PN.getName() + ".sroa.speculated");
1645
  // Get the AA tags and alignment to use from one of the loads. It does not
  // matter which one we use, even if they differ.
1648 AAMDNodes AATags = SomeLoad->getAAMetadata();
1649 Align Alignment = SomeLoad->getAlign();
1650
1651 // Rewrite all loads of the PN to use the new PHI.
1652 while (!PN.use_empty()) {
1653 LoadInst *LI = cast<LoadInst>(Val: PN.user_back());
1654 LI->replaceAllUsesWith(V: NewPN);
1655 LI->eraseFromParent();
1656 }
1657
1658 // Inject loads into all of the pred blocks.
1659 DenseMap<BasicBlock *, Value *> InjectedLoads;
1660 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1661 BasicBlock *Pred = PN.getIncomingBlock(i: Idx);
1662 Value *InVal = PN.getIncomingValue(i: Idx);
1663
1664 // A PHI node is allowed to have multiple (duplicated) entries for the same
1665 // basic block, as long as the value is the same. So if we already injected
1666 // a load in the predecessor, then we should reuse the same load for all
1667 // duplicated entries.
1668 if (Value *V = InjectedLoads.lookup(Val: Pred)) {
1669 NewPN->addIncoming(V, BB: Pred);
1670 continue;
1671 }
1672
1673 Instruction *TI = Pred->getTerminator();
1674 IRB.SetInsertPoint(TI);
1675
1676 LoadInst *Load = IRB.CreateAlignedLoad(
1677 Ty: LoadTy, Ptr: InVal, Align: Alignment,
1678 Name: (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1679 ++NumLoadsSpeculated;
1680 if (AATags)
1681 Load->setAAMetadata(AATags);
1682 NewPN->addIncoming(V: Load, BB: Pred);
1683 InjectedLoads[Pred] = Load;
1684 }
1685
1686 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1687 PN.eraseFromParent();
1688}
1689
1690SelectHandSpeculativity &
1691SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1692 if (isTrueVal)
1693 Bitfield::set<SelectHandSpeculativity::TrueVal>(Packed&: Storage, Value: true);
1694 else
1695 Bitfield::set<SelectHandSpeculativity::FalseVal>(Packed&: Storage, Value: true);
1696 return *this;
1697}
1698
1699bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1700 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Packed: Storage)
1701 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Packed: Storage);
1702}
1703
1704bool SelectHandSpeculativity::areAllSpeculatable() const {
1705 return isSpeculatable(/*isTrueVal=*/true) &&
1706 isSpeculatable(/*isTrueVal=*/false);
1707}
1708
1709bool SelectHandSpeculativity::areAnySpeculatable() const {
1710 return isSpeculatable(/*isTrueVal=*/true) ||
1711 isSpeculatable(/*isTrueVal=*/false);
1712}
1713bool SelectHandSpeculativity::areNoneSpeculatable() const {
1714 return !areAnySpeculatable();
1715}
1716
1717static SelectHandSpeculativity
1718isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1719 assert(LI.isSimple() && "Only for simple loads");
1720 SelectHandSpeculativity Spec;
1721
1722 const DataLayout &DL = SI.getDataLayout();
1723 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1724 if (isSafeToLoadUnconditionally(V: Value, Ty: LI.getType(), Alignment: LI.getAlign(), DL,
1725 ScanFrom: &LI))
1726 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1727 else if (PreserveCFG)
1728 return Spec;
1729
1730 return Spec;
1731}
1732
1733std::optional<RewriteableMemOps>
1734SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1735 RewriteableMemOps Ops;
1736
1737 for (User *U : SI.users()) {
1738 if (auto *BC = dyn_cast<BitCastInst>(Val: U); BC && BC->hasOneUse())
1739 U = *BC->user_begin();
1740
1741 if (auto *Store = dyn_cast<StoreInst>(Val: U)) {
1742 // Note that atomic stores can be transformed; atomic semantics do not
1743 // have any meaning for a local alloca. Stores are not speculatable,
1744 // however, so if we can't turn it into a predicated store, we are done.
1745 if (Store->isVolatile() || PreserveCFG)
1746 return {}; // Give up on this `select`.
1747 Ops.emplace_back(Args&: Store);
1748 continue;
1749 }
1750
1751 auto *LI = dyn_cast<LoadInst>(Val: U);
1752
    // Note that atomic loads can be transformed; atomic semantics do not have
    // any meaning for a local alloca.
1755 if (!LI || LI->isVolatile())
1756 return {}; // Give up on this `select`.
1757
1758 PossiblySpeculatableLoad Load(LI);
1759 if (!LI->isSimple()) {
1760 // If the `load` is not simple, we can't speculatively execute it,
1761 // but we could handle this via a CFG modification. But can we?
1762 if (PreserveCFG)
1763 return {}; // Give up on this `select`.
1764 Ops.emplace_back(Args&: Load);
1765 continue;
1766 }
1767
1768 SelectHandSpeculativity Spec =
1769 isSafeLoadOfSelectToSpeculate(LI&: *LI, SI, PreserveCFG);
1770 if (PreserveCFG && !Spec.areAllSpeculatable())
1771 return {}; // Give up on this `select`.
1772
1773 Load.setInt(Spec);
1774 Ops.emplace_back(Args&: Load);
1775 }
1776
1777 return Ops;
1778}
1779
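/// Speculate a load of a select whose two pointer operands are both safe to
/// load from unconditionally. For example (illustrative, value names
/// simplified):
///   %P = select i1 %cond, ptr %Alloca, ptr %Other
///   %V = load i32, ptr %P
/// becomes
///   %V.true = load i32, ptr %Alloca
///   %V.false = load i32, ptr %Other
///   %V = select i1 %cond, i32 %V.true, i32 %V.false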
1780static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1781 IRBuilderTy &IRB) {
1782 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1783
1784 Value *TV = SI.getTrueValue();
1785 Value *FV = SI.getFalseValue();
1786 // Replace the given load of the select with a select of two loads.
1787
1788 assert(LI.isSimple() && "We only speculate simple loads");
1789
1790 IRB.SetInsertPoint(&LI);
1791
1792 LoadInst *TL =
1793 IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: TV, Align: LI.getAlign(),
1794 Name: LI.getName() + ".sroa.speculate.load.true");
1795 LoadInst *FL =
1796 IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: FV, Align: LI.getAlign(),
1797 Name: LI.getName() + ".sroa.speculate.load.false");
1798 NumLoadsSpeculated += 2;
1799
1800 // Transfer alignment and AA info if present.
1801 TL->setAlignment(LI.getAlign());
1802 FL->setAlignment(LI.getAlign());
1803
1804 AAMDNodes Tags = LI.getAAMetadata();
1805 if (Tags) {
1806 TL->setAAMetadata(Tags);
1807 FL->setAAMetadata(Tags);
1808 }
1809
1810 Value *V = IRB.CreateSelect(C: SI.getCondition(), True: TL, False: FL,
1811 Name: LI.getName() + ".sroa.speculated");
1812
1813 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1814 LI.replaceAllUsesWith(V);
1815}
1816
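/// Rewrite a load or store of a select by branching on the select's condition:
/// each hand of the select gets a clone of the memory operation rewritten to
/// use that hand's pointer, placed either in a new conditional block or, when
/// that hand is speculatable, directly before the branch. For example
/// (illustrative), a store of a select with no speculatable hands becomes an
/// if/then/else with one rewritten store per arm; a rewritten load
/// additionally feeds a PHI in the tail block that replaces the original load.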
1817template <typename T>
1818static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1819 SelectHandSpeculativity Spec,
1820 DomTreeUpdater &DTU) {
1821 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1822 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1823 BasicBlock *Head = I.getParent();
1824 Instruction *ThenTerm = nullptr;
1825 Instruction *ElseTerm = nullptr;
1826 if (Spec.areNoneSpeculatable())
1827 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1828 SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU);
1829 else {
1830 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1831 SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU,
1832 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1833 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1834 cast<BranchInst>(Val: Head->getTerminator())->swapSuccessors();
1835 }
1836 auto *HeadBI = cast<BranchInst>(Val: Head->getTerminator());
1837 Spec = {}; // Do not use `Spec` beyond this point.
1838 BasicBlock *Tail = I.getParent();
1839 Tail->setName(Head->getName() + ".cont");
1840 PHINode *PN;
1841 if (isa<LoadInst>(I))
1842 PN = PHINode::Create(Ty: I.getType(), NumReservedValues: 2, NameStr: "", InsertBefore: I.getIterator());
1843 for (BasicBlock *SuccBB : successors(BB: Head)) {
1844 bool IsThen = SuccBB == HeadBI->getSuccessor(i: 0);
1845 int SuccIdx = IsThen ? 0 : 1;
1846 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1847 auto &CondMemOp = cast<T>(*I.clone());
1848 if (NewMemOpBB != Head) {
1849 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1850 if (isa<LoadInst>(I))
1851 ++NumLoadsPredicated;
1852 else
1853 ++NumStoresPredicated;
1854 } else {
1855 CondMemOp.dropUBImplyingAttrsAndMetadata();
1856 ++NumLoadsSpeculated;
1857 }
1858 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1859 Value *Ptr = SI.getOperand(i_nocapture: 1 + SuccIdx);
1860 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1861 if (isa<LoadInst>(I)) {
1862 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1863 PN->addIncoming(V: &CondMemOp, BB: NewMemOpBB);
1864 } else
1865 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1866 }
1867 if (isa<LoadInst>(I)) {
1868 PN->takeName(V: &I);
1869 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1870 I.replaceAllUsesWith(PN);
1871 }
1872}
1873
1874static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1875 SelectHandSpeculativity Spec,
1876 DomTreeUpdater &DTU) {
1877 if (auto *LI = dyn_cast<LoadInst>(Val: &I))
1878 rewriteMemOpOfSelect(SI&: SelInst, I&: *LI, Spec, DTU);
1879 else if (auto *SI = dyn_cast<StoreInst>(Val: &I))
1880 rewriteMemOpOfSelect(SI&: SelInst, I&: *SI, Spec, DTU);
1881 else
1882 llvm_unreachable_internal(msg: "Only for load and store.");
1883}
1884
1885static bool rewriteSelectInstMemOps(SelectInst &SI,
1886 const RewriteableMemOps &Ops,
1887 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1888 bool CFGChanged = false;
1889 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1890
1891 for (const RewriteableMemOp &Op : Ops) {
1892 SelectHandSpeculativity Spec;
1893 Instruction *I;
1894 if (auto *const *US = std::get_if<UnspeculatableStore>(ptr: &Op)) {
1895 I = *US;
1896 } else {
1897 auto PSL = std::get<PossiblySpeculatableLoad>(v: Op);
1898 I = PSL.getPointer();
1899 Spec = PSL.getInt();
1900 }
1901 if (Spec.areAllSpeculatable()) {
1902 speculateSelectInstLoads(SI, LI&: cast<LoadInst>(Val&: *I), IRB);
1903 } else {
1904 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1905 rewriteMemOpOfSelect(SelInst&: SI, I&: *I, Spec, DTU&: *DTU);
1906 CFGChanged = true;
1907 }
1908 I->eraseFromParent();
1909 }
1910
1911 for (User *U : make_early_inc_range(Range: SI.users()))
1912 cast<BitCastInst>(Val: U)->eraseFromParent();
1913 SI.eraseFromParent();
1914 return CFGChanged;
1915}
1916
1917/// Compute an adjusted pointer from Ptr by Offset bytes where the
1918/// resulting pointer has PointerTy.
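/// With opaque pointers this typically emits a single byte-offset GEP, e.g.
/// (illustrative)
///   %<prefix>sroa_idx = getelementptr inbounds i8, ptr %Ptr, i64 <Offset>
/// and, when the destination address space differs, an addrspacecast.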
1919static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1920 APInt Offset, Type *PointerTy,
1921 const Twine &NamePrefix) {
1922 if (Offset != 0)
1923 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, Offset: IRB.getInt(AI: Offset),
1924 Name: NamePrefix + "sroa_idx");
1925 return IRB.CreatePointerBitCastOrAddrSpaceCast(V: Ptr, DestTy: PointerTy,
1926 Name: NamePrefix + "sroa_cast");
1927}
1928
1929/// Compute the adjusted alignment for a load or store from an offset.
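/// For example, a load or store with align 16 that is rewritten to start at
/// byte offset 4 can be given at most align 4 (commonAlignment(16, 4) == 4),
/// while at offset 0 it keeps the full align 16.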
1930static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1931 return commonAlignment(A: getLoadStoreAlignment(I), Offset);
1932}
1933
1934/// Test whether we can convert a value from the old to the new type.
1935///
1936/// This predicate should be used to guard calls to convertValue in order to
1937/// ensure that we only try to convert viable values. The strategy is that we
1938/// will peel off single element struct and array wrappings to get to an
1939/// underlying value, and convert that value.
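/// For example (illustrative), i64 <-> <2 x i32> (same 64 bits) and i64 <->
/// ptr (for 64-bit pointers in an integral address space) are convertible,
/// while i32 <-> i64 (different sizes) and any aggregate type are not.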
1940static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1941 unsigned VScale = 0) {
1942 if (OldTy == NewTy)
1943 return true;
1944
1945 // For integer types, we can't handle any bit-width differences. This would
1946 // break both vector conversions with extension and introduce endianness
1947 // issues when in conjunction with loads and stores.
1948 if (isa<IntegerType>(Val: OldTy) && isa<IntegerType>(Val: NewTy)) {
1949 assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1950 cast<IntegerType>(NewTy)->getBitWidth() &&
1951 "We can't have the same bitwidth for different int types");
1952 return false;
1953 }
1954
1955 TypeSize NewSize = DL.getTypeSizeInBits(Ty: NewTy);
1956 TypeSize OldSize = DL.getTypeSizeInBits(Ty: OldTy);
1957
1958 if ((isa<ScalableVectorType>(Val: NewTy) && isa<FixedVectorType>(Val: OldTy)) ||
1959 (isa<ScalableVectorType>(Val: OldTy) && isa<FixedVectorType>(Val: NewTy))) {
1960 // Conversion is only possible when the size of scalable vectors is known.
1961 if (!VScale)
1962 return false;
1963
1964 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1965 // a single domain (either fixed or scalable). Any additional conversion
1966 // between fixed and scalable types is handled through integer types.
1967 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1968 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1969
1970 if (isa<ScalableVectorType>(Val: NewTy)) {
1971 if (!VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: NewVTy), EltTy: OldVTy))
1972 return false;
1973
1974 NewSize = TypeSize::getFixed(ExactSize: NewSize.getKnownMinValue() * VScale);
1975 } else {
1976 if (!VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: OldVTy), EltTy: NewVTy))
1977 return false;
1978
1979 OldSize = TypeSize::getFixed(ExactSize: OldSize.getKnownMinValue() * VScale);
1980 }
1981 }
1982
1983 if (NewSize != OldSize)
1984 return false;
1985 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1986 return false;
1987
1988 // We can convert pointers to integers and vice-versa. Same for vectors
1989 // of pointers and integers.
1990 OldTy = OldTy->getScalarType();
1991 NewTy = NewTy->getScalarType();
1992 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1993 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1994 unsigned OldAS = OldTy->getPointerAddressSpace();
1995 unsigned NewAS = NewTy->getPointerAddressSpace();
1996 // Convert pointers if they are pointers from the same address space or
1997 // different integral (not non-integral) address spaces with the same
1998 // pointer size.
1999 return OldAS == NewAS ||
2000 (!DL.isNonIntegralAddressSpace(AddrSpace: OldAS) &&
2001 !DL.isNonIntegralAddressSpace(AddrSpace: NewAS) &&
2002 DL.getPointerSize(AS: OldAS) == DL.getPointerSize(AS: NewAS));
2003 }
2004
2005 // We can convert integers to integral pointers, but not to non-integral
2006 // pointers.
2007 if (OldTy->isIntegerTy())
2008 return !DL.isNonIntegralPointerType(Ty: NewTy);
2009
2010 // We can convert integral pointers to integers, but non-integral pointers
2011 // need to remain pointers.
2012 if (!DL.isNonIntegralPointerType(Ty: OldTy))
2013 return NewTy->isIntegerTy();
2014
2015 return false;
2016 }
2017
2018 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
2019 return false;
2020
2021 return true;
2022}
2023
2024/// Generic routine to convert an SSA value to a value of a different
2025/// type.
2026///
2027/// This will try various different casting techniques, such as bitcasts,
2028/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
2029/// two types for viability with this routine.
2030static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2031 Type *NewTy) {
2032 Type *OldTy = V->getType();
2033
2034#ifndef NDEBUG
2035 BasicBlock *BB = IRB.GetInsertBlock();
2036 assert(BB && BB->getParent() && "VScale unknown!");
2037 unsigned VScale = BB->getParent()->getVScaleValue();
2038 assert(canConvertValue(DL, OldTy, NewTy, VScale) &&
2039 "Value not convertable to type");
2040#endif
2041
2042 if (OldTy == NewTy)
2043 return V;
2044
2045 assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
2046 "Integer types must be the exact same to convert.");
2047
  // A variant of bitcast that supports a mixture of fixed and scalable types
  // that are known to have the same size.
2050 auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * {
2051 Type *InTy = In->getType();
2052 if (InTy == Ty)
2053 return In;
2054
2055 if (isa<FixedVectorType>(Val: InTy) && isa<ScalableVectorType>(Val: Ty)) {
2056 // For vscale_range(2) expand <4 x i32> to <vscale x 4 x i16> -->
2057 // <4 x i32> to <vscale x 2 x i32> to <vscale x 4 x i16>
2058 auto *VTy = VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: Ty), EltTy: InTy);
2059 return IRB.CreateBitCast(V: IRB.CreateInsertVector(DstType: VTy,
2060 SrcVec: PoisonValue::get(T: VTy), SubVec: In,
2061 Idx: IRB.getInt64(C: 0)),
2062 DestTy: Ty);
2063 }
2064
2065 if (isa<ScalableVectorType>(Val: InTy) && isa<FixedVectorType>(Val: Ty)) {
2066 // For vscale_range(2) expand <vscale x 4 x i16> to <4 x i32> -->
2067 // <vscale x 4 x i16> to <vscale x 2 x i32> to <4 x i32>
2068 auto *VTy = VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: InTy), EltTy: Ty);
2069 return IRB.CreateExtractVector(DstType: Ty, SrcVec: IRB.CreateBitCast(V: In, DestTy: VTy),
2070 Idx: IRB.getInt64(C: 0));
2071 }
2072
2073 return IRB.CreateBitCast(V: In, DestTy: Ty);
2074 };
2075
2076 // See if we need inttoptr for this type pair. May require additional bitcast.
2077 if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2078 // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
2079 // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
2080 // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
2081 // Directly handle i64 to i8*
2082 return IRB.CreateIntToPtr(V: CreateBitCastLike(V, DL.getIntPtrType(NewTy)),
2083 DestTy: NewTy);
2084 }
2085
2086 // See if we need ptrtoint for this type pair. May require additional bitcast.
2087 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
2088 // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
2089 // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
2090 // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
2091 // Expand i8* to i64 --> i8* to i64 to i64
2092 return CreateBitCastLike(IRB.CreatePtrToInt(V, DestTy: DL.getIntPtrType(OldTy)),
2093 NewTy);
2094 }
2095
2096 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2097 unsigned OldAS = OldTy->getPointerAddressSpace();
2098 unsigned NewAS = NewTy->getPointerAddressSpace();
    // To convert pointers with different address spaces (already checked to be
    // convertible, i.e. they have the same pointer size), we cannot currently
    // use `bitcast` (which requires the source and destination address spaces
    // to match) or `addrspacecast` (which is not always a no-op cast). Instead,
    // use a pair of no-op `ptrtoint`/`inttoptr` casts through an integer with
    // the same bit width.
2105 if (OldAS != NewAS) {
2106 assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
2107 return IRB.CreateIntToPtr(
2108 V: CreateBitCastLike(IRB.CreatePtrToInt(V, DestTy: DL.getIntPtrType(OldTy)),
2109 DL.getIntPtrType(NewTy)),
2110 DestTy: NewTy);
2111 }
2112 }
2113
2114 return CreateBitCastLike(V, NewTy);
2115}
2116
2117/// Test whether the given slice use can be promoted to a vector.
2118///
2119/// This function is called to test each entry in a partition which is slated
2120/// for a single slice.
2121static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2122 VectorType *Ty,
2123 uint64_t ElementSize,
2124 const DataLayout &DL,
2125 unsigned VScale) {
2126 // First validate the slice offsets.
2127 uint64_t BeginOffset =
2128 std::max(a: S.beginOffset(), b: P.beginOffset()) - P.beginOffset();
2129 uint64_t BeginIndex = BeginOffset / ElementSize;
2130 if (BeginIndex * ElementSize != BeginOffset ||
2131 BeginIndex >= cast<FixedVectorType>(Val: Ty)->getNumElements())
2132 return false;
2133 uint64_t EndOffset = std::min(a: S.endOffset(), b: P.endOffset()) - P.beginOffset();
2134 uint64_t EndIndex = EndOffset / ElementSize;
2135 if (EndIndex * ElementSize != EndOffset ||
2136 EndIndex > cast<FixedVectorType>(Val: Ty)->getNumElements())
2137 return false;
2138
2139 assert(EndIndex > BeginIndex && "Empty vector!");
2140 uint64_t NumElements = EndIndex - BeginIndex;
2141 Type *SliceTy = (NumElements == 1)
2142 ? Ty->getElementType()
2143 : FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElements);
2144
2145 Type *SplitIntTy =
2146 Type::getIntNTy(C&: Ty->getContext(), N: NumElements * ElementSize * 8);
2147
2148 Use *U = S.getUse();
2149
2150 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
2151 if (MI->isVolatile())
2152 return false;
2153 if (!S.isSplittable())
2154 return false; // Skip any unsplittable intrinsics.
2155 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
2156 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2157 return false;
2158 } else if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
2159 if (LI->isVolatile())
2160 return false;
2161 Type *LTy = LI->getType();
2162 // Disable vector promotion when there are loads or stores of an FCA.
2163 if (LTy->isStructTy())
2164 return false;
2165 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2166 assert(LTy->isIntegerTy());
2167 LTy = SplitIntTy;
2168 }
2169 if (!canConvertValue(DL, OldTy: SliceTy, NewTy: LTy, VScale))
2170 return false;
2171 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
2172 if (SI->isVolatile())
2173 return false;
2174 Type *STy = SI->getValueOperand()->getType();
2175 // Disable vector promotion when there are loads or stores of an FCA.
2176 if (STy->isStructTy())
2177 return false;
2178 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2179 assert(STy->isIntegerTy());
2180 STy = SplitIntTy;
2181 }
2182 if (!canConvertValue(DL, OldTy: STy, NewTy: SliceTy, VScale))
2183 return false;
2184 } else {
2185 return false;
2186 }
2187
2188 return true;
2189}
2190
2191/// Test whether a vector type is viable for promotion.
2192///
2193/// This implements the necessary checking for \c checkVectorTypesForPromotion
2194/// (and thus isVectorPromotionViable) over all slices of the alloca for the
2195/// given VectorType.
2196static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
2197 const DataLayout &DL, unsigned VScale) {
2198 uint64_t ElementSize =
2199 DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
2200
  // While LLVM vectors are defined as bit-packed, we don't support element
  // sizes that aren't a whole number of bytes.
2203 if (ElementSize % 8)
2204 return false;
2205 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2206 "vector size not a multiple of element size?");
2207 ElementSize /= 8;
2208
2209 for (const Slice &S : P)
2210 if (!isVectorPromotionViableForSlice(P, S, Ty: VTy, ElementSize, DL, VScale))
2211 return false;
2212
2213 for (const Slice *S : P.splitSliceTails())
2214 if (!isVectorPromotionViableForSlice(P, S: *S, Ty: VTy, ElementSize, DL, VScale))
2215 return false;
2216
2217 return true;
2218}
2219
2220/// Test whether any vector type in \p CandidateTys is viable for promotion.
2221///
2222/// This implements the necessary checking for \c isVectorPromotionViable over
2223/// all slices of the alloca for the given VectorType.
2224static VectorType *
2225checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2226 SmallVectorImpl<VectorType *> &CandidateTys,
2227 bool HaveCommonEltTy, Type *CommonEltTy,
2228 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2229 VectorType *CommonVecPtrTy, unsigned VScale) {
2230 // If we didn't find a vector type, nothing to do here.
2231 if (CandidateTys.empty())
2232 return nullptr;
2233
2234 // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
2235 // then we should choose it, not some other alternative.
2236 // But, we can't perform a no-op pointer address space change via bitcast,
2237 // so if we didn't have a common pointer element type, bail.
2238 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2239 return nullptr;
2240
2241 // Try to pick the "best" element type out of the choices.
2242 if (!HaveCommonEltTy && HaveVecPtrTy) {
2243 // If there was a pointer element type, there's really only one choice.
2244 CandidateTys.clear();
2245 CandidateTys.push_back(Elt: CommonVecPtrTy);
2246 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2247 // Integer-ify vector types.
2248 for (VectorType *&VTy : CandidateTys) {
2249 if (!VTy->getElementType()->isIntegerTy())
2250 VTy = cast<VectorType>(Val: VTy->getWithNewType(EltTy: IntegerType::getIntNTy(
2251 C&: VTy->getContext(), N: VTy->getScalarSizeInBits())));
2252 }
2253
2254 // Rank the remaining candidate vector types. This is easy because we know
2255 // they're all integer vectors. We sort by ascending number of elements.
2256 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2257 (void)DL;
2258 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2259 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2260 "Cannot have vector types of different sizes!");
2261 assert(RHSTy->getElementType()->isIntegerTy() &&
2262 "All non-integer types eliminated!");
2263 assert(LHSTy->getElementType()->isIntegerTy() &&
2264 "All non-integer types eliminated!");
2265 return cast<FixedVectorType>(Val: RHSTy)->getNumElements() <
2266 cast<FixedVectorType>(Val: LHSTy)->getNumElements();
2267 };
2268 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2269 (void)DL;
2270 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2271 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2272 "Cannot have vector types of different sizes!");
2273 assert(RHSTy->getElementType()->isIntegerTy() &&
2274 "All non-integer types eliminated!");
2275 assert(LHSTy->getElementType()->isIntegerTy() &&
2276 "All non-integer types eliminated!");
2277 return cast<FixedVectorType>(Val: RHSTy)->getNumElements() ==
2278 cast<FixedVectorType>(Val: LHSTy)->getNumElements();
2279 };
2280 llvm::sort(C&: CandidateTys, Comp: RankVectorTypesComp);
2281 CandidateTys.erase(CS: llvm::unique(R&: CandidateTys, P: RankVectorTypesEq),
2282 CE: CandidateTys.end());
2283 } else {
2284// The only way to have the same element type in every vector type is to
2285// have the same vector type. Check that and remove all but one.
2286#ifndef NDEBUG
2287 for (VectorType *VTy : CandidateTys) {
2288 assert(VTy->getElementType() == CommonEltTy &&
2289 "Unaccounted for element type!");
2290 assert(VTy == CandidateTys[0] &&
2291 "Different vector types with the same element type!");
2292 }
2293#endif
2294 CandidateTys.resize(N: 1);
2295 }
2296
2297 // FIXME: hack. Do we have a named constant for this?
2298 // SDAG SDNode can't have more than 65535 operands.
2299 llvm::erase_if(C&: CandidateTys, P: [](VectorType *VTy) {
2300 return cast<FixedVectorType>(Val: VTy)->getNumElements() >
2301 std::numeric_limits<unsigned short>::max();
2302 });
2303
2304 for (VectorType *VTy : CandidateTys)
2305 if (checkVectorTypeForPromotion(P, VTy, DL, VScale))
2306 return VTy;
2307
2308 return nullptr;
2309}
2310
2311static VectorType *createAndCheckVectorTypesForPromotion(
2312 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2313 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2314 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2315 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2316 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2317 [[maybe_unused]] VectorType *OriginalElt =
2318 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2319 // Consider additional vector types where the element type size is a
2320 // multiple of load/store element size.
2321 for (Type *Ty : OtherTys) {
2322 if (!VectorType::isValidElementType(ElemTy: Ty))
2323 continue;
2324 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2325 // Make a copy of CandidateTys and iterate through it, because we
2326 // might append to CandidateTys in the loop.
2327 for (VectorType *const VTy : CandidateTysCopy) {
2328 // The elements in the copy should remain invariant throughout the loop
2329 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2330 unsigned VectorSize = DL.getTypeSizeInBits(Ty: VTy).getFixedValue();
2331 unsigned ElementSize =
2332 DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
2333 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2334 VectorSize % TypeSize == 0) {
2335 VectorType *NewVTy = VectorType::get(ElementType: Ty, NumElements: VectorSize / TypeSize, Scalable: false);
2336 CheckCandidateType(NewVTy);
2337 }
2338 }
2339 }
2340
2341 return checkVectorTypesForPromotion(
2342 P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2343 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2344}
2345
2346/// Test whether the given alloca partitioning and range of slices can be
2347/// promoted to a vector.
2348///
2349/// This is a quick test to check whether we can rewrite a particular alloca
2350/// partition (and its newly formed alloca) into a vector alloca with only
2351/// whole-vector loads and stores such that it could be promoted to a vector
2352/// SSA value. We only can ensure this for a limited set of operations, and we
2353/// don't want to do the rewrites unless we are confident that the result will
2354/// be promotable, so we have an early test here.
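/// For example (illustrative), a 16-byte partition accessed by whole-partition
/// <4 x float> loads and stores plus single-element float accesses at offsets
/// 0, 4, 8 and 12 can be rewritten as a promotable <4 x float> alloca.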
2355static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2356 unsigned VScale) {
2357 // Collect the candidate types for vector-based promotion. Also track whether
2358 // we have different element types.
2359 SmallVector<VectorType *, 4> CandidateTys;
2360 SetVector<Type *> LoadStoreTys;
2361 SetVector<Type *> DeferredTys;
2362 Type *CommonEltTy = nullptr;
2363 VectorType *CommonVecPtrTy = nullptr;
2364 bool HaveVecPtrTy = false;
2365 bool HaveCommonEltTy = true;
2366 bool HaveCommonVecPtrTy = true;
2367 auto CheckCandidateType = [&](Type *Ty) {
2368 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
      // If this candidate's total size in bits differs from the candidates
      // seen so far, no single bitcast-compatible vector type exists; drop
      // all candidates.
2370 if (!CandidateTys.empty()) {
2371 VectorType *V = CandidateTys[0];
2372 if (DL.getTypeSizeInBits(Ty: VTy).getFixedValue() !=
2373 DL.getTypeSizeInBits(Ty: V).getFixedValue()) {
2374 CandidateTys.clear();
2375 return;
2376 }
2377 }
2378 CandidateTys.push_back(Elt: VTy);
2379 Type *EltTy = VTy->getElementType();
2380
2381 if (!CommonEltTy)
2382 CommonEltTy = EltTy;
2383 else if (CommonEltTy != EltTy)
2384 HaveCommonEltTy = false;
2385
2386 if (EltTy->isPointerTy()) {
2387 HaveVecPtrTy = true;
2388 if (!CommonVecPtrTy)
2389 CommonVecPtrTy = VTy;
2390 else if (CommonVecPtrTy != VTy)
2391 HaveCommonVecPtrTy = false;
2392 }
2393 }
2394 };
2395
2396 // Put load and store types into a set for de-duplication.
2397 for (const Slice &S : P) {
2398 Type *Ty;
2399 if (auto *LI = dyn_cast<LoadInst>(Val: S.getUse()->getUser()))
2400 Ty = LI->getType();
2401 else if (auto *SI = dyn_cast<StoreInst>(Val: S.getUse()->getUser()))
2402 Ty = SI->getValueOperand()->getType();
2403 else
2404 continue;
2405
2406 auto CandTy = Ty->getScalarType();
2407 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2408 S.endOffset() != P.endOffset())) {
2409 DeferredTys.insert(X: Ty);
2410 continue;
2411 }
2412
2413 LoadStoreTys.insert(X: Ty);
    // Only loads and stores that cover the exact extent of the partition can
    // seed candidate vector types.
2415 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2416 CheckCandidateType(Ty);
2417 }
2418
2419 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2420 if (auto *VTy = createAndCheckVectorTypesForPromotion(
2421 OtherTys&: LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2422 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2423 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2424 return VTy;
2425
2426 CandidateTys.clear();
2427 return createAndCheckVectorTypesForPromotion(
2428 OtherTys&: DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2429 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2430 CommonVecPtrTy, VScale);
2431}
2432
2433/// Test whether a slice of an alloca is valid for integer widening.
2434///
2435/// This implements the necessary checking for the \c isIntegerWideningViable
2436/// test below on a single slice of the alloca.
2437static bool isIntegerWideningViableForSlice(const Slice &S,
2438 uint64_t AllocBeginOffset,
2439 Type *AllocaTy,
2440 const DataLayout &DL,
2441 bool &WholeAllocaOp) {
2442 uint64_t Size = DL.getTypeStoreSize(Ty: AllocaTy).getFixedValue();
2443
2444 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2445 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2446
2447 Use *U = S.getUse();
2448
  // Lifetime intrinsics operate over the whole alloca, whose size is usually
  // larger than other load/store slices (RelEnd > Size). But lifetime
  // intrinsics are always promotable and should not impact the promotability
  // of the partition's other slices.
2453 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
2454 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2455 return true;
2456 }
2457
2458 // We can't reasonably handle cases where the load or store extends past
2459 // the end of the alloca's type and into its padding.
2460 if (RelEnd > Size)
2461 return false;
2462
2463 if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
2464 if (LI->isVolatile())
2465 return false;
2466 // We can't handle loads that extend past the allocated memory.
2467 TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
2468 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2469 return false;
2470 // So far, AllocaSliceRewriter does not support widening split slice tails
2471 // in rewriteIntegerLoad.
2472 if (S.beginOffset() < AllocBeginOffset)
2473 return false;
2474 // Note that we don't count vector loads or stores as whole-alloca
2475 // operations which enable integer widening because we would prefer to use
2476 // vector widening instead.
2477 if (!isa<VectorType>(Val: LI->getType()) && RelBegin == 0 && RelEnd == Size)
2478 WholeAllocaOp = true;
2479 if (IntegerType *ITy = dyn_cast<IntegerType>(Val: LI->getType())) {
2480 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
2481 return false;
2482 } else if (RelBegin != 0 || RelEnd != Size ||
2483 !canConvertValue(DL, OldTy: AllocaTy, NewTy: LI->getType())) {
2484 // Non-integer loads need to be convertible from the alloca type so that
2485 // they are promotable.
2486 return false;
2487 }
2488 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
2489 Type *ValueTy = SI->getValueOperand()->getType();
2490 if (SI->isVolatile())
2491 return false;
2492 // We can't handle stores that extend past the allocated memory.
2493 TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValueTy);
2494 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2495 return false;
2496 // So far, AllocaSliceRewriter does not support widening split slice tails
2497 // in rewriteIntegerStore.
2498 if (S.beginOffset() < AllocBeginOffset)
2499 return false;
2500 // Note that we don't count vector loads or stores as whole-alloca
2501 // operations which enable integer widening because we would prefer to use
2502 // vector widening instead.
2503 if (!isa<VectorType>(Val: ValueTy) && RelBegin == 0 && RelEnd == Size)
2504 WholeAllocaOp = true;
2505 if (IntegerType *ITy = dyn_cast<IntegerType>(Val: ValueTy)) {
2506 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
2507 return false;
2508 } else if (RelBegin != 0 || RelEnd != Size ||
2509 !canConvertValue(DL, OldTy: ValueTy, NewTy: AllocaTy)) {
2510 // Non-integer stores need to be convertible to the alloca type so that
2511 // they are promotable.
2512 return false;
2513 }
2514 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
2515 if (MI->isVolatile() || !isa<Constant>(Val: MI->getLength()))
2516 return false;
2517 if (!S.isSplittable())
2518 return false; // Skip any unsplittable intrinsics.
2519 } else {
2520 return false;
2521 }
2522
2523 return true;
2524}
2525
2526/// Test whether the given alloca partition's integer operations can be
2527/// widened to promotable ones.
2528///
2529/// This is a quick test to check whether we can rewrite the integer loads and
2530/// stores to a particular alloca into wider loads and stores and be able to
2531/// promote the resulting alloca.
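/// For example (illustrative), an i64 alloca accessed via two i32 loads at
/// offsets 0 and 4 plus one full-width i64 store is viable: the store covers
/// the whole alloca and the narrow loads can be rewritten as shift/truncate
/// extractions from a promoted i64.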
2532static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2533 const DataLayout &DL) {
2534 uint64_t SizeInBits = DL.getTypeSizeInBits(Ty: AllocaTy).getFixedValue();
2535 // Don't create integer types larger than the maximum bitwidth.
2536 if (SizeInBits > IntegerType::MAX_INT_BITS)
2537 return false;
2538
2539 // Don't try to handle allocas with bit-padding.
2540 if (SizeInBits != DL.getTypeStoreSizeInBits(Ty: AllocaTy).getFixedValue())
2541 return false;
2542
2543 // We need to ensure that an integer type with the appropriate bitwidth can
2544 // be converted to the alloca type, whatever that is. We don't want to force
2545 // the alloca itself to have an integer type if there is a more suitable one.
2546 Type *IntTy = Type::getIntNTy(C&: AllocaTy->getContext(), N: SizeInBits);
2547 if (!canConvertValue(DL, OldTy: AllocaTy, NewTy: IntTy) ||
2548 !canConvertValue(DL, OldTy: IntTy, NewTy: AllocaTy))
2549 return false;
2550
2551 // While examining uses, we ensure that the alloca has a covering load or
2552 // store. We don't want to widen the integer operations only to fail to
2553 // promote due to some other unsplittable entry (which we may make splittable
2554 // later). However, if there are only splittable uses, go ahead and assume
2555 // that we cover the alloca.
2556 // FIXME: We shouldn't consider split slices that happen to start in the
2557 // partition here...
2558 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(Width: SizeInBits);
2559
2560 for (const Slice &S : P)
2561 if (!isIntegerWideningViableForSlice(S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2562 WholeAllocaOp))
2563 return false;
2564
2565 for (const Slice *S : P.splitSliceTails())
2566 if (!isIntegerWideningViableForSlice(S: *S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2567 WholeAllocaOp))
2568 return false;
2569
2570 return WholeAllocaOp;
2571}
2572
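/// Extract the integer value of type \p Ty that lives \p Offset bytes into the
/// wider integer \p V. For example (illustrative), extracting an i16 at byte
/// offset 2 from an i64 shifts right by 16 bits on little-endian targets (by
/// 32 bits, i.e. 8 * (8 - 2 - 2), on big-endian) and then truncates to i16.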
2573static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2574 IntegerType *Ty, uint64_t Offset,
2575 const Twine &Name) {
2576 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2577 IntegerType *IntTy = cast<IntegerType>(Val: V->getType());
2578 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2579 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2580 "Element extends past full value");
2581 uint64_t ShAmt = 8 * Offset;
2582 if (DL.isBigEndian())
2583 ShAmt = 8 * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2584 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2585 if (ShAmt) {
2586 V = IRB.CreateLShr(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2587 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2588 }
2589 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2590 "Cannot extract to a larger integer!");
2591 if (Ty != IntTy) {
2592 V = IRB.CreateTrunc(V, DestTy: Ty, Name: Name + ".trunc");
2593 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2594 }
2595 return V;
2596}
2597
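/// Insert the narrower integer \p V into \p Old at byte \p Offset: \p V is
/// zero-extended, shifted into position (mirroring extractInteger's endian
/// handling), and merged with \p Old after masking out the replaced bits. For
/// example (illustrative), inserting an i8 at offset 1 into an i32 on a
/// little-endian target shifts by 8 and masks \p Old with ~0x0000ff00.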
2598static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2599 Value *V, uint64_t Offset, const Twine &Name) {
2600 IntegerType *IntTy = cast<IntegerType>(Val: Old->getType());
2601 IntegerType *Ty = cast<IntegerType>(Val: V->getType());
2602 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2603 "Cannot insert a larger integer!");
2604 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2605 if (Ty != IntTy) {
2606 V = IRB.CreateZExt(V, DestTy: IntTy, Name: Name + ".ext");
2607 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2608 }
2609 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2610 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2611 "Element store outside of alloca store");
2612 uint64_t ShAmt = 8 * Offset;
2613 if (DL.isBigEndian())
2614 ShAmt = 8 * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2615 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2616 if (ShAmt) {
2617 V = IRB.CreateShl(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2618 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2619 }
2620
2621 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2622 APInt Mask = ~Ty->getMask().zext(width: IntTy->getBitWidth()).shl(shiftAmt: ShAmt);
2623 Old = IRB.CreateAnd(LHS: Old, RHS: Mask, Name: Name + ".mask");
2624 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2625 V = IRB.CreateOr(LHS: Old, RHS: V, Name: Name + ".insert");
2626 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2627 }
2628 return V;
2629}
2630
2631static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2632 unsigned EndIndex, const Twine &Name) {
2633 auto *VecTy = cast<FixedVectorType>(Val: V->getType());
2634 unsigned NumElements = EndIndex - BeginIndex;
2635 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2636
2637 if (NumElements == VecTy->getNumElements())
2638 return V;
2639
2640 if (NumElements == 1) {
2641 V = IRB.CreateExtractElement(Vec: V, Idx: IRB.getInt32(C: BeginIndex),
2642 Name: Name + ".extract");
2643 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2644 return V;
2645 }
2646
2647 auto Mask = llvm::to_vector<8>(Range: llvm::seq<int>(Begin: BeginIndex, End: EndIndex));
2648 V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".extract");
2649 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2650 return V;
2651}
2652
2653static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2654 unsigned BeginIndex, const Twine &Name) {
2655 VectorType *VecTy = cast<VectorType>(Val: Old->getType());
2656 assert(VecTy && "Can only insert a vector into a vector");
2657
2658 VectorType *Ty = dyn_cast<VectorType>(Val: V->getType());
2659 if (!Ty) {
2660 // Single element to insert.
2661 V = IRB.CreateInsertElement(Vec: Old, NewElt: V, Idx: IRB.getInt32(C: BeginIndex),
2662 Name: Name + ".insert");
2663 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2664 return V;
2665 }
2666
2667 assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2668 cast<FixedVectorType>(VecTy)->getNumElements() &&
2669 "Too many elements!");
2670 if (cast<FixedVectorType>(Val: Ty)->getNumElements() ==
2671 cast<FixedVectorType>(Val: VecTy)->getNumElements()) {
2672 assert(V->getType() == VecTy && "Vector type mismatch");
2673 return V;
2674 }
2675 unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Val: Ty)->getNumElements();
2676
  // When inserting a smaller vector into the larger one to store, we first
  // use a shuffle vector to widen it with poison elements, and then a select
  // to choose between the loaded vector and the incoming vector.
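  // For example (illustrative), inserting a <2 x i32> at BeginIndex 1 into a
  // <4 x i32> uses the shuffle mask <-1, 0, 1, -1> (poison, V[0], V[1],
  // poison) followed by a select with condition <0, 1, 1, 0>.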
2681 SmallVector<int, 8> Mask;
2682 Mask.reserve(N: cast<FixedVectorType>(Val: VecTy)->getNumElements());
2683 for (unsigned i = 0; i != cast<FixedVectorType>(Val: VecTy)->getNumElements(); ++i)
2684 if (i >= BeginIndex && i < EndIndex)
2685 Mask.push_back(Elt: i - BeginIndex);
2686 else
2687 Mask.push_back(Elt: -1);
2688 V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".expand");
2689 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2690
2691 SmallVector<Constant *, 8> Mask2;
2692 Mask2.reserve(N: cast<FixedVectorType>(Val: VecTy)->getNumElements());
2693 for (unsigned i = 0; i != cast<FixedVectorType>(Val: VecTy)->getNumElements(); ++i)
2694 Mask2.push_back(Elt: IRB.getInt1(V: i >= BeginIndex && i < EndIndex));
2695
2696 V = IRB.CreateSelect(C: ConstantVector::get(V: Mask2), True: V, False: Old, Name: Name + "blend");
2697
2698 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2699 return V;
2700}
2701
2702namespace {
2703
/// Visitor to rewrite instructions using a particular slice of an alloca
/// to use a new alloca.
2706///
2707/// Also implements the rewriting to vector-based accesses when the partition
2708/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2709/// lives here.
2710class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2711 // Befriend the base class so it can delegate to private visit methods.
2712 friend class InstVisitor<AllocaSliceRewriter, bool>;
2713
2714 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2715
2716 const DataLayout &DL;
2717 AllocaSlices &AS;
2718 SROA &Pass;
2719 AllocaInst &OldAI, &NewAI;
2720 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2721 Type *NewAllocaTy;
2722
2723 // This is a convenience and flag variable that will be null unless the new
2724 // alloca's integer operations should be widened to this integer type due to
2725 // passing isIntegerWideningViable above. If it is non-null, the desired
2726 // integer type will be stored here for easy access during rewriting.
2727 IntegerType *IntTy;
2728
2729 // If we are rewriting an alloca partition which can be written as pure
2730 // vector operations, we stash extra information here. When VecTy is
2731 // non-null, we have some strict guarantees about the rewritten alloca:
2732 // - The new alloca is exactly the size of the vector type here.
2733 // - The accesses all either map to the entire vector or to a single
2734 // element.
2735 // - The set of accessing instructions is only one of those handled above
2736 // in isVectorPromotionViable. Generally these are the same access kinds
2737 // which are promotable via mem2reg.
2738 VectorType *VecTy;
2739 Type *ElementTy;
2740 uint64_t ElementSize;
2741
2742 // The original offset of the slice currently being rewritten relative to
2743 // the original alloca.
2744 uint64_t BeginOffset = 0;
2745 uint64_t EndOffset = 0;
2746
2747 // The new offsets of the slice currently being rewritten relative to the
2748 // original alloca.
2749 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2750
2751 uint64_t SliceSize = 0;
2752 bool IsSplittable = false;
2753 bool IsSplit = false;
2754 Use *OldUse = nullptr;
2755 Instruction *OldPtr = nullptr;
2756
2757 // Track post-rewrite users which are PHI nodes and Selects.
2758 SmallSetVector<PHINode *, 8> &PHIUsers;
2759 SmallSetVector<SelectInst *, 8> &SelectUsers;
2760
  // Utility IR builder, whose name prefix is set up for each visited use, and
  // whose insertion point is set to point to the user.
2763 IRBuilderTy IRB;
2764
2765 // Return the new alloca, addrspacecasted if required to avoid changing the
2766 // addrspace of a volatile access.
2767 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2768 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2769 return &NewAI;
2770
2771 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2772 return IRB.CreateAddrSpaceCast(V: &NewAI, DestTy: AccessTy);
2773 }
2774
2775public:
2776 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2777 AllocaInst &OldAI, AllocaInst &NewAI,
2778 uint64_t NewAllocaBeginOffset,
2779 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2780 VectorType *PromotableVecTy,
2781 SmallSetVector<PHINode *, 8> &PHIUsers,
2782 SmallSetVector<SelectInst *, 8> &SelectUsers)
2783 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2784 NewAllocaBeginOffset(NewAllocaBeginOffset),
2785 NewAllocaEndOffset(NewAllocaEndOffset),
2786 NewAllocaTy(NewAI.getAllocatedType()),
2787 IntTy(
2788 IsIntegerPromotable
2789 ? Type::getIntNTy(C&: NewAI.getContext(),
2790 N: DL.getTypeSizeInBits(Ty: NewAI.getAllocatedType())
2791 .getFixedValue())
2792 : nullptr),
2793 VecTy(PromotableVecTy),
2794 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2795 ElementSize(VecTy ? DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / 8
2796 : 0),
2797 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2798 IRB(NewAI.getContext(), ConstantFolder()) {
2799 if (VecTy) {
2800 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2801 "Only multiple-of-8 sized vector elements are viable");
2802 ++NumVectorized;
2803 }
2804 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2805 }
2806
2807 bool visit(AllocaSlices::const_iterator I) {
2808 bool CanSROA = true;
2809 BeginOffset = I->beginOffset();
2810 EndOffset = I->endOffset();
2811 IsSplittable = I->isSplittable();
2812 IsSplit =
2813 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2814 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2815 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2816 LLVM_DEBUG(dbgs() << "\n");
2817
2818 // Compute the intersecting offset range.
2819 assert(BeginOffset < NewAllocaEndOffset);
2820 assert(EndOffset > NewAllocaBeginOffset);
2821 NewBeginOffset = std::max(a: BeginOffset, b: NewAllocaBeginOffset);
2822 NewEndOffset = std::min(a: EndOffset, b: NewAllocaEndOffset);
2823
2824 SliceSize = NewEndOffset - NewBeginOffset;
2825 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2826 << ") NewBegin:(" << NewBeginOffset << ", "
2827 << NewEndOffset << ") NewAllocaBegin:("
2828 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2829 << ")\n");
2830 assert(IsSplit || NewBeginOffset == BeginOffset);
2831 OldUse = I->getUse();
2832 OldPtr = cast<Instruction>(Val: OldUse->get());
2833
2834 Instruction *OldUserI = cast<Instruction>(Val: OldUse->getUser());
2835 IRB.SetInsertPoint(OldUserI);
2836 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2837 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2838 Twine(BeginOffset) + ".");
2839
2840 CanSROA &= visit(I: cast<Instruction>(Val: OldUse->getUser()));
2841 if (VecTy || IntTy)
2842 assert(CanSROA);
2843 return CanSROA;
2844 }
2845
2846private:
2847 // Make sure the other visit overloads are visible.
2848 using Base::visit;
2849
2850 // Every instruction which can end up as a user must have a rewrite rule.
2851 bool visitInstruction(Instruction &I) {
2852 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
2853 llvm_unreachable("No rewrite rule for this instruction!");
2854 }
2855
2856 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
2857 // Note that the offset computation can use BeginOffset or NewBeginOffset
2858 // interchangeably for unsplit slices.
2859 assert(IsSplit || BeginOffset == NewBeginOffset);
2860 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2861
2862#ifndef NDEBUG
2863 StringRef OldName = OldPtr->getName();
2864 // Skip through the last '.sroa.' component of the name.
2865 size_t LastSROAPrefix = OldName.rfind(".sroa.");
2866 if (LastSROAPrefix != StringRef::npos) {
2867 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
2868 // Look for an SROA slice index.
2869 size_t IndexEnd = OldName.find_first_not_of("0123456789");
2870 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
2871 // Strip the index and look for the offset.
2872 OldName = OldName.substr(IndexEnd + 1);
2873 size_t OffsetEnd = OldName.find_first_not_of("0123456789");
2874 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
2875 // Strip the offset.
2876 OldName = OldName.substr(OffsetEnd + 1);
2877 }
2878 }
2879 // Strip any SROA suffixes as well.
2880 OldName = OldName.substr(0, OldName.find(".sroa_"));
2881#endif
2882
2883 return getAdjustedPtr(IRB, DL, Ptr: &NewAI,
2884 Offset: APInt(DL.getIndexTypeSizeInBits(Ty: PointerTy), Offset),
2885 PointerTy,
2886#ifndef NDEBUG
2887 Twine(OldName) + "."
2888#else
2889 NamePrefix: Twine()
2890#endif
2891 );
2892 }
2893
2894 /// Compute a suitable alignment to access this slice of the *new*
2895 /// alloca.
2896 ///
2897 /// This is the alignment of the new alloca itself, adjusted for the byte
2898 /// offset of the slice within it.
2899 Align getSliceAlign() {
2900 return commonAlignment(A: NewAI.getAlign(),
2901 Offset: NewBeginOffset - NewAllocaBeginOffset);
2902 }
2903
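// Map an offset within the new alloca to the vector element index it
// addresses. For example (illustrative), with a <4 x i32> alloca and
// ElementSize == 4, an offset 8 bytes past NewAllocaBeginOffset maps to
// element index 2; offsets that do not fall on an element boundary are
// rejected by the assertion below.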
2904 unsigned getIndex(uint64_t Offset) {
2905 assert(VecTy && "Can only call getIndex when rewriting a vector");
2906 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
2907 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
2908 uint32_t Index = RelOffset / ElementSize;
2909 assert(Index * ElementSize == RelOffset);
2910 return Index;
2911 }
2912
2913 void deleteIfTriviallyDead(Value *V) {
2914 Instruction *I = cast<Instruction>(Val: V);
2915 if (isInstructionTriviallyDead(I))
2916 Pass.DeadInsts.push_back(Elt: I);
2917 }
2918
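// Rewrite a load of a slice of a vector-promotable alloca. Conceptually the
// slice load becomes a load of the whole new vector followed by an extract
// of the covered elements; e.g. (illustrative) loading bytes [4, 12) of a
// <4 x i32> alloca loads the full vector and extracts elements 1 and 2.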
2919 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
2920 unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
2921 unsigned EndIndex = getIndex(Offset: NewEndOffset);
2922 assert(EndIndex > BeginIndex && "Empty vector!");
2923
2924 LoadInst *Load = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
2925 Align: NewAI.getAlign(), Name: "load");
2926
2927 Load->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
2928 LLVMContext::MD_access_group});
2929 return extractVector(IRB, V: Load, BeginIndex, EndIndex, Name: "vec");
2930 }
2931
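// Rewrite a load of a slice of an integer-widened alloca. Conceptually this
// loads the whole wide integer and extracts the covered bits; e.g.
// (illustrative) an i16 load at byte offset 2 of an i64-widened alloca loads
// the i64 and extracts the corresponding 16 bits, zero-extending afterwards
// if the original load read past the end of the slice.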
2932 Value *rewriteIntegerLoad(LoadInst &LI) {
2933 assert(IntTy && "We cannot insert an integer to the alloca");
2934 assert(!LI.isVolatile());
2935 Value *V = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
2936 Align: NewAI.getAlign(), Name: "load");
2937 V = convertValue(DL, IRB, V, NewTy: IntTy);
2938 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2939 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2940 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
2941 IntegerType *ExtractTy = Type::getIntNTy(C&: LI.getContext(), N: SliceSize * 8);
2942 V = extractInteger(DL, IRB, V, Ty: ExtractTy, Offset, Name: "extract");
2943 }
2944 // It is possible that the extracted type is not the load type. This
2945 // happens if there is a load past the end of the alloca, and as
2946 // a consequence the slice is narrower but still a candidate for integer
2947 // lowering. To handle this case, we just zero extend the extracted
2948 // integer.
2949 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
2950 "Can only handle an extract for an overly wide load");
2951 if (cast<IntegerType>(Val: LI.getType())->getBitWidth() > SliceSize * 8)
2952 V = IRB.CreateZExt(V, DestTy: LI.getType());
2953 return V;
2954 }
2955
2956 bool visitLoadInst(LoadInst &LI) {
2957 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
2958 Value *OldOp = LI.getOperand(i_nocapture: 0);
2959 assert(OldOp == OldPtr);
2960
2961 AAMDNodes AATags = LI.getAAMetadata();
2962
2963 unsigned AS = LI.getPointerAddressSpace();
2964
2965 Type *TargetTy = IsSplit ? Type::getIntNTy(C&: LI.getContext(), N: SliceSize * 8)
2966 : LI.getType();
2967 bool IsPtrAdjusted = false;
2968 Value *V;
2969 if (VecTy) {
2970 V = rewriteVectorizedLoadInst(LI);
2971 } else if (IntTy && LI.getType()->isIntegerTy()) {
2972 V = rewriteIntegerLoad(LI);
2973 } else if (NewBeginOffset == NewAllocaBeginOffset &&
2974 NewEndOffset == NewAllocaEndOffset &&
2975 (canConvertValue(DL, OldTy: NewAllocaTy, NewTy: TargetTy) ||
2976 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
2977 DL.getTypeStoreSize(Ty: TargetTy).getFixedValue() > SliceSize &&
2978 !LI.isVolatile()))) {
2979 Value *NewPtr =
2980 getPtrToNewAI(AddrSpace: LI.getPointerAddressSpace(), IsVolatile: LI.isVolatile());
2981 LoadInst *NewLI = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: NewPtr,
2982 Align: NewAI.getAlign(), isVolatile: LI.isVolatile(),
2983 Name: LI.getName());
2984 if (LI.isVolatile())
2985 NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
2986 if (NewLI->isAtomic())
2987 NewLI->setAlignment(LI.getAlign());
2988
2989 // Copy any metadata that is valid for the new load. This may require
2990 // conversion to a different kind of metadata, e.g. !nonnull might change
2991 // to !range or vice versa.
2992 copyMetadataForLoad(Dest&: *NewLI, Source: LI);
2993
2994 // Do this after copyMetadataForLoad() to preserve the TBAA shift.
2995 if (AATags)
2996 NewLI->setAAMetadata(AATags.adjustForAccess(
2997 Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));
2998
2999 // !nonnull metadata is preserved (or converted) by copyMetadataForLoad.
3000 V = NewLI;
3001
3002 // If this is an integer load past the end of the slice (which means the
3003 // bytes outside the slice are undef or this load is dead) just forcibly
3004 // fix the integer size with correct handling of endianness.
3005 if (auto *AITy = dyn_cast<IntegerType>(Val: NewAllocaTy))
3006 if (auto *TITy = dyn_cast<IntegerType>(Val: TargetTy))
3007 if (AITy->getBitWidth() < TITy->getBitWidth()) {
3008 V = IRB.CreateZExt(V, DestTy: TITy, Name: "load.ext");
3009 if (DL.isBigEndian())
3010 V = IRB.CreateShl(LHS: V, RHS: TITy->getBitWidth() - AITy->getBitWidth(),
3011 Name: "endian_shift");
3012 }
3013 } else {
3014 Type *LTy = IRB.getPtrTy(AddrSpace: AS);
3015 LoadInst *NewLI =
3016 IRB.CreateAlignedLoad(Ty: TargetTy, Ptr: getNewAllocaSlicePtr(IRB, PointerTy: LTy),
3017 Align: getSliceAlign(), isVolatile: LI.isVolatile(), Name: LI.getName());
3018
3019 if (AATags)
3020 NewLI->setAAMetadata(AATags.adjustForAccess(
3021 Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));
3022
3023 if (LI.isVolatile())
3024 NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
3025 NewLI->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3026 LLVMContext::MD_access_group});
3027
3028 V = NewLI;
3029 IsPtrAdjusted = true;
3030 }
3031 V = convertValue(DL, IRB, V, NewTy: TargetTy);
3032
3033 if (IsSplit) {
3034 assert(!LI.isVolatile());
3035 assert(LI.getType()->isIntegerTy() &&
3036 "Only integer type loads and stores are split");
3037 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3038 "Split load isn't smaller than original load");
3039 assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3040 "Non-byte-multiple bit width");
3041 // Move the insertion point just past the load so that we can refer to it.
3042 BasicBlock::iterator LIIt = std::next(x: LI.getIterator());
3043 // Ensure the insertion point comes before any debug-info immediately
3044 // after the load, so that variable values referring to the load are
3045 // dominated by it.
3046 LIIt.setHeadBit(true);
3047 IRB.SetInsertPoint(TheBB: LI.getParent(), IP: LIIt);
3048 // Create a placeholder value with the same type as LI to use as the
3049 // basis for the new value. This allows us to replace the uses of LI with
3050 // the computed value, and then replace the placeholder with LI, leaving
3051 // LI only used for this computation.
3052 Value *Placeholder =
3053 new LoadInst(LI.getType(), PoisonValue::get(T: IRB.getPtrTy(AddrSpace: AS)), "",
3054 false, Align(1));
3055 V = insertInteger(DL, IRB, Old: Placeholder, V, Offset: NewBeginOffset - BeginOffset,
3056 Name: "insert");
3057 LI.replaceAllUsesWith(V);
3058 Placeholder->replaceAllUsesWith(V: &LI);
3059 Placeholder->deleteValue();
3060 } else {
3061 LI.replaceAllUsesWith(V);
3062 }
3063
3064 Pass.DeadInsts.push_back(Elt: &LI);
3065 deleteIfTriviallyDead(V: OldOp);
3066 LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3067 return !LI.isVolatile() && !IsPtrAdjusted;
3068 }
3069
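// Rewrite a store to a slice of a vector-promotable alloca. When the store
// covers only some of the elements this is a read-modify-write: load the
// current vector, insert the stored value over the covered lanes, and store
// the whole vector back.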
3070 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3071 AAMDNodes AATags) {
3072 // Capture V for the purpose of debug-info accounting once it's converted
3073 // to a vector store.
3074 Value *OrigV = V;
3075 if (V->getType() != VecTy) {
3076 unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3077 unsigned EndIndex = getIndex(Offset: NewEndOffset);
3078 assert(EndIndex > BeginIndex && "Empty vector!");
3079 unsigned NumElements = EndIndex - BeginIndex;
3080 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3081 "Too many elements!");
3082 Type *SliceTy = (NumElements == 1)
3083 ? ElementTy
3084 : FixedVectorType::get(ElementType: ElementTy, NumElts: NumElements);
3085 if (V->getType() != SliceTy)
3086 V = convertValue(DL, IRB, V, NewTy: SliceTy);
3087
3088 // Mix in the existing elements.
3089 Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3090 Align: NewAI.getAlign(), Name: "load");
3091 V = insertVector(IRB, Old, V, BeginIndex, Name: "vec");
3092 }
3093 StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
3094 Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3095 LLVMContext::MD_access_group});
3096 if (AATags)
3097 Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3098 AccessTy: V->getType(), DL));
3099 Pass.DeadInsts.push_back(Elt: &SI);
3100
3101 // NOTE: Careful to use OrigV rather than V.
3102 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
3103 Inst: Store, Dest: Store->getPointerOperand(), Value: OrigV, DL);
3104 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3105 return true;
3106 }
3107
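// Rewrite a store to a slice of an integer-widened alloca. A partial store
// is likewise a read-modify-write; e.g. (illustrative) an i16 store at byte
// offset 2 of an i64-widened alloca loads the i64, inserts the 16 stored
// bits at the matching bit offset, and stores the i64 back.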
3108 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3109 assert(IntTy && "We cannot extract an integer from the alloca");
3110 assert(!SI.isVolatile());
3111 if (DL.getTypeSizeInBits(Ty: V->getType()).getFixedValue() !=
3112 IntTy->getBitWidth()) {
3113 Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3114 Align: NewAI.getAlign(), Name: "oldload");
3115 Old = convertValue(DL, IRB, V: Old, NewTy: IntTy);
3116 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3117 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3118 V = insertInteger(DL, IRB, Old, V: SI.getValueOperand(), Offset, Name: "insert");
3119 }
3120 V = convertValue(DL, IRB, V, NewTy: NewAllocaTy);
3121 StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
3122 Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3123 LLVMContext::MD_access_group});
3124 if (AATags)
3125 Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3126 AccessTy: V->getType(), DL));
3127
3128 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
3129 Inst: Store, Dest: Store->getPointerOperand(),
3130 Value: Store->getValueOperand(), DL);
3131
3132 Pass.DeadInsts.push_back(Elt: &SI);
3133 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3134 return true;
3135 }
3136
3137 bool visitStoreInst(StoreInst &SI) {
3138 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3139 Value *OldOp = SI.getOperand(i_nocapture: 1);
3140 assert(OldOp == OldPtr);
3141
3142 AAMDNodes AATags = SI.getAAMetadata();
3143 Value *V = SI.getValueOperand();
3144
3145 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3146 // alloca that should be re-examined after promoting this alloca.
3147 if (V->getType()->isPointerTy())
3148 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: V->stripInBoundsOffsets()))
3149 Pass.PostPromotionWorklist.insert(X: AI);
3150
3151 TypeSize StoreSize = DL.getTypeStoreSize(Ty: V->getType());
3152 if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3153 assert(!SI.isVolatile());
3154 assert(V->getType()->isIntegerTy() &&
3155 "Only integer type loads and stores are split");
3156 assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3157 "Non-byte-multiple bit width");
3158 IntegerType *NarrowTy = Type::getIntNTy(C&: SI.getContext(), N: SliceSize * 8);
3159 V = extractInteger(DL, IRB, V, Ty: NarrowTy, Offset: NewBeginOffset - BeginOffset,
3160 Name: "extract");
3161 }
3162
3163 if (VecTy)
3164 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3165 if (IntTy && V->getType()->isIntegerTy())
3166 return rewriteIntegerStore(V, SI, AATags);
3167
3168 StoreInst *NewSI;
3169 if (NewBeginOffset == NewAllocaBeginOffset &&
3170 NewEndOffset == NewAllocaEndOffset &&
3171 canConvertValue(DL, OldTy: V->getType(), NewTy: NewAllocaTy)) {
3172 V = convertValue(DL, IRB, V, NewTy: NewAllocaTy);
3173 Value *NewPtr =
3174 getPtrToNewAI(AddrSpace: SI.getPointerAddressSpace(), IsVolatile: SI.isVolatile());
3175
3176 NewSI =
3177 IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: SI.isVolatile());
3178 } else {
3179 unsigned AS = SI.getPointerAddressSpace();
3180 Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: IRB.getPtrTy(AddrSpace: AS));
3181 NewSI =
3182 IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: getSliceAlign(), isVolatile: SI.isVolatile());
3183 }
3184 NewSI->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3185 LLVMContext::MD_access_group});
3186 if (AATags)
3187 NewSI->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3188 AccessTy: V->getType(), DL));
3189 if (SI.isVolatile())
3190 NewSI->setAtomic(Ordering: SI.getOrdering(), SSID: SI.getSyncScopeID());
3191 if (NewSI->isAtomic())
3192 NewSI->setAlignment(SI.getAlign());
3193
3194 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
3195 Inst: NewSI, Dest: NewSI->getPointerOperand(),
3196 Value: NewSI->getValueOperand(), DL);
3197
3198 Pass.DeadInsts.push_back(Elt: &SI);
3199 deleteIfTriviallyDead(V: OldOp);
3200
3201 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3202 return NewSI->getPointerOperand() == &NewAI &&
3203 NewSI->getValueOperand()->getType() == NewAllocaTy &&
3204 !SI.isVolatile();
3205 }
3206
3207 /// Compute an integer value from splatting an i8 across the given
3208 /// number of bytes.
3209 ///
3210 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3211 /// call this routine.
3212 /// FIXME: Heed the advice above.
3213 ///
3214 /// \param V The i8 value to splat.
3215 /// \param Size The number of bytes in the output (assuming i8 is one byte)
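///
/// For example (illustrative), splatting the byte 0xAB across 4 bytes
/// computes zext(0xAB) * (0xFFFFFFFF udiv 0xFF) = 0xAB * 0x01010101 =
/// 0xABABABAB.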
3216 Value *getIntegerSplat(Value *V, unsigned Size) {
3217 assert(Size > 0 && "Expected a positive number of bytes.");
3218 IntegerType *VTy = cast<IntegerType>(Val: V->getType());
3219 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3220 if (Size == 1)
3221 return V;
3222
3223 Type *SplatIntTy = Type::getIntNTy(C&: VTy->getContext(), N: Size * 8);
3224 V = IRB.CreateMul(
3225 LHS: IRB.CreateZExt(V, DestTy: SplatIntTy, Name: "zext"),
3226 RHS: IRB.CreateUDiv(LHS: Constant::getAllOnesValue(Ty: SplatIntTy),
3227 RHS: IRB.CreateZExt(V: Constant::getAllOnesValue(Ty: V->getType()),
3228 DestTy: SplatIntTy)),
3229 Name: "isplat");
3230 return V;
3231 }
3232
3233 /// Compute a vector splat for a given element value.
3234 Value *getVectorSplat(Value *V, unsigned NumElements) {
3235 V = IRB.CreateVectorSplat(NumElts: NumElements, V, Name: "vsplat");
3236 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3237 return V;
3238 }
3239
3240 bool visitMemSetInst(MemSetInst &II) {
3241 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3242 assert(II.getRawDest() == OldPtr);
3243
3244 AAMDNodes AATags = II.getAAMetadata();
3245
3246 // If the memset has a variable size, it cannot be split, just adjust the
3247 // pointer to the new alloca.
3248 if (!isa<ConstantInt>(Val: II.getLength())) {
3249 assert(!IsSplit);
3250 assert(NewBeginOffset == BeginOffset);
3251 II.setDest(getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()));
3252 II.setDestAlignment(getSliceAlign());
3253 // In theory we should call migrateDebugInfo here. However, we do not
3254 // emit dbg.assign intrinsics for mem intrinsics storing through non-
3255 // constant geps, or storing a variable number of bytes.
3256 assert(at::getAssignmentMarkers(&II).empty() &&
3257 at::getDVRAssignmentMarkers(&II).empty() &&
3258 "AT: Unexpected link to non-const GEP");
3259 deleteIfTriviallyDead(V: OldPtr);
3260 return false;
3261 }
3262
3263 // Record this instruction for deletion.
3264 Pass.DeadInsts.push_back(Elt: &II);
3265
3266 Type *AllocaTy = NewAI.getAllocatedType();
3267 Type *ScalarTy = AllocaTy->getScalarType();
3268
3269 const bool CanContinue = [&]() {
3270 if (VecTy || IntTy)
3271 return true;
3272 if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3273 return false;
3274 // Length must be in range for FixedVectorType.
3275 auto *C = cast<ConstantInt>(Val: II.getLength());
3276 const uint64_t Len = C->getLimitedValue();
3277 if (Len > std::numeric_limits<unsigned>::max())
3278 return false;
3279 auto *Int8Ty = IntegerType::getInt8Ty(C&: NewAI.getContext());
3280 auto *SrcTy = FixedVectorType::get(ElementType: Int8Ty, NumElts: Len);
3281 return canConvertValue(DL, OldTy: SrcTy, NewTy: AllocaTy) &&
3282 DL.isLegalInteger(Width: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue());
3283 }();
3284
3285 // If this doesn't map cleanly onto the alloca type, and that type isn't
3286 // a single value type, just emit a memset.
3287 if (!CanContinue) {
3288 Type *SizeTy = II.getLength()->getType();
3289 unsigned Sz = NewEndOffset - NewBeginOffset;
3290 Constant *Size = ConstantInt::get(Ty: SizeTy, V: Sz);
3291 MemIntrinsic *New = cast<MemIntrinsic>(Val: IRB.CreateMemSet(
3292 Ptr: getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()), Val: II.getValue(), Size,
3293 Align: MaybeAlign(getSliceAlign()), isVolatile: II.isVolatile()));
3294 if (AATags)
3295 New->setAAMetadata(
3296 AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset, AccessSize: Sz));
3297
3298 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
3299 Inst: New, Dest: New->getRawDest(), Value: nullptr, DL);
3300
3301 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3302 return false;
3303 }
3304
3305 // If we can represent this as a simple value, we have to build the actual
3306 // value to store, which requires expanding the byte present in memset to
3307 // a sensible representation for the alloca type. This is essentially
3308 // splatting the byte to a sufficiently wide integer, splatting it across
3309 // any desired vector width, and bitcasting to the final type.
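// For example (illustrative), a memset of the byte 0x2A over an alloca
// promoted to <2 x float> splats the byte into the i32 0x2A2A2A2A, bitcasts
// that to float, splats the float across both lanes, and stores the
// resulting vector.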
3310 Value *V;
3311
3312 if (VecTy) {
3313 // If this is a memset of a vectorized alloca, insert it.
3314 assert(ElementTy == ScalarTy);
3315
3316 unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3317 unsigned EndIndex = getIndex(Offset: NewEndOffset);
3318 assert(EndIndex > BeginIndex && "Empty vector!");
3319 unsigned NumElements = EndIndex - BeginIndex;
3320 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3321 "Too many elements!");
3322
3323 Value *Splat = getIntegerSplat(
3324 V: II.getValue(), Size: DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / 8);
3325 Splat = convertValue(DL, IRB, V: Splat, NewTy: ElementTy);
3326 if (NumElements > 1)
3327 Splat = getVectorSplat(V: Splat, NumElements);
3328
3329 Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3330 Align: NewAI.getAlign(), Name: "oldload");
3331 V = insertVector(IRB, Old, V: Splat, BeginIndex, Name: "vec");
3332 } else if (IntTy) {
3333 // If this is a memset on an alloca where we can widen stores, insert the
3334 // set integer.
3335 assert(!II.isVolatile());
3336
3337 uint64_t Size = NewEndOffset - NewBeginOffset;
3338 V = getIntegerSplat(V: II.getValue(), Size);
3339
3340 if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
3341 EndOffset != NewAllocaEndOffset)) {
3342 Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3343 Align: NewAI.getAlign(), Name: "oldload");
3344 Old = convertValue(DL, IRB, V: Old, NewTy: IntTy);
3345 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3346 V = insertInteger(DL, IRB, Old, V, Offset, Name: "insert");
3347 } else {
3348 assert(V->getType() == IntTy &&
3349 "Wrong type for an alloca wide integer!");
3350 }
3351 V = convertValue(DL, IRB, V, NewTy: AllocaTy);
3352 } else {
3353 // Established these invariants above.
3354 assert(NewBeginOffset == NewAllocaBeginOffset);
3355 assert(NewEndOffset == NewAllocaEndOffset);
3356
3357 V = getIntegerSplat(V: II.getValue(),
3358 Size: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue() / 8);
3359 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(Val: AllocaTy))
3360 V = getVectorSplat(
3361 V, NumElements: cast<FixedVectorType>(Val: AllocaVecTy)->getNumElements());
3362
3363 V = convertValue(DL, IRB, V, NewTy: AllocaTy);
3364 }
3365
3366 Value *NewPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
3367 StoreInst *New =
3368 IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: II.isVolatile());
3369 New->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3370 LLVMContext::MD_access_group});
3371 if (AATags)
3372 New->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3373 AccessTy: V->getType(), DL));
3374
3375 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
3376 Inst: New, Dest: New->getPointerOperand(), Value: V, DL);
3377
3378 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3379 return !II.isVolatile();
3380 }
3381
3382 bool visitMemTransferInst(MemTransferInst &II) {
3383 // Rewriting of memory transfer instructions can be a bit tricky. We break
3384 // them into two categories: split intrinsics and unsplit intrinsics.
3385
3386 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3387
3388 AAMDNodes AATags = II.getAAMetadata();
3389
3390 bool IsDest = &II.getRawDestUse() == OldUse;
3391 assert((IsDest && II.getRawDest() == OldPtr) ||
3392 (!IsDest && II.getRawSource() == OldPtr));
3393
3394 Align SliceAlign = getSliceAlign();
3395 // For unsplit intrinsics, we simply modify the source and destination
3396 // pointers in place. This isn't just an optimization, it is a matter of
3397 // correctness. With unsplit intrinsics we may be dealing with transfers
3398 // within a single alloca before SROA ran, or with transfers that have
3399 // a variable length. We may also be dealing with memmove instead of
3400 // memcpy, and so simply updating the pointers is all that is necessary for
3401 // us to update both source and dest of a single call.
3402 if (!IsSplittable) {
3403 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3404 if (IsDest) {
3405 // Update the address component of linked dbg.assigns.
3406 auto UpdateAssignAddress = [&](auto *DbgAssign) {
3407 if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3408 DbgAssign->getAddress() == II.getDest())
3409 DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3410 };
3411 for_each(Range: at::getAssignmentMarkers(Inst: &II), F: UpdateAssignAddress);
3412 for_each(Range: at::getDVRAssignmentMarkers(Inst: &II), F: UpdateAssignAddress);
3413 II.setDest(AdjustedPtr);
3414 II.setDestAlignment(SliceAlign);
3415 } else {
3416 II.setSource(AdjustedPtr);
3417 II.setSourceAlignment(SliceAlign);
3418 }
3419
3420 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3421 deleteIfTriviallyDead(V: OldPtr);
3422 return false;
3423 }
3424 // For split transfer intrinsics we have an incredibly useful assurance:
3425 // the source and destination do not reside within the same alloca, and at
3426 // least one of them does not escape. This means that we can replace
3427 // memmove with memcpy, and we don't need to worry about all manner of
3428 // downsides to splitting and transforming the operations.
3429
3430 // If this doesn't map cleanly onto the alloca type, and that type isn't
3431 // a single value type, just emit a memcpy.
3432 bool EmitMemCpy =
3433 !VecTy && !IntTy &&
3434 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3435 SliceSize !=
3436 DL.getTypeStoreSize(Ty: NewAI.getAllocatedType()).getFixedValue() ||
3437 !DL.typeSizeEqualsStoreSize(Ty: NewAI.getAllocatedType()) ||
3438 !NewAI.getAllocatedType()->isSingleValueType());
3439
3440 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3441 // size hasn't been shrunk based on analysis of the viable range, this is
3442 // a no-op.
3443 if (EmitMemCpy && &OldAI == &NewAI) {
3444 // Ensure the start lines up.
3445 assert(NewBeginOffset == BeginOffset);
3446
3447 // Rewrite the size as needed.
3448 if (NewEndOffset != EndOffset)
3449 II.setLength(ConstantInt::get(Ty: II.getLength()->getType(),
3450 V: NewEndOffset - NewBeginOffset));
3451 return false;
3452 }
3453 // Record this instruction for deletion.
3454 Pass.DeadInsts.push_back(Elt: &II);
3455
3456 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3457 // alloca that should be re-examined after rewriting this instruction.
3458 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3459 if (AllocaInst *AI =
3460 dyn_cast<AllocaInst>(Val: OtherPtr->stripInBoundsOffsets())) {
3461 assert(AI != &OldAI && AI != &NewAI &&
3462 "Splittable transfers cannot reach the same alloca on both ends.");
3463 Pass.Worklist.insert(X: AI);
3464 }
3465
3466 Type *OtherPtrTy = OtherPtr->getType();
3467 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3468
3469 // Compute the relative offset for the other pointer within the transfer.
3470 unsigned OffsetWidth = DL.getIndexSizeInBits(AS: OtherAS);
3471 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3472 Align OtherAlign =
3473 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3474 OtherAlign =
3475 commonAlignment(A: OtherAlign, Offset: OtherOffset.zextOrTrunc(width: 64).getZExtValue());
3476
3477 if (EmitMemCpy) {
3478 // Compute the other pointer, folding as much as possible to produce
3479 // a single, simple GEP in most cases.
3480 OtherPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
3481 NamePrefix: OtherPtr->getName() + ".");
3482
3483 Value *OurPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3484 Type *SizeTy = II.getLength()->getType();
3485 Constant *Size = ConstantInt::get(Ty: SizeTy, V: NewEndOffset - NewBeginOffset);
3486
3487 Value *DestPtr, *SrcPtr;
3488 MaybeAlign DestAlign, SrcAlign;
3489 // Note: IsDest is true iff we're copying into the new alloca slice
3490 if (IsDest) {
3491 DestPtr = OurPtr;
3492 DestAlign = SliceAlign;
3493 SrcPtr = OtherPtr;
3494 SrcAlign = OtherAlign;
3495 } else {
3496 DestPtr = OtherPtr;
3497 DestAlign = OtherAlign;
3498 SrcPtr = OurPtr;
3499 SrcAlign = SliceAlign;
3500 }
3501 CallInst *New = IRB.CreateMemCpy(Dst: DestPtr, DstAlign: DestAlign, Src: SrcPtr, SrcAlign,
3502 Size, isVolatile: II.isVolatile());
3503 if (AATags)
3504 New->setAAMetadata(AATags.shift(Offset: NewBeginOffset - BeginOffset));
3505
3506 APInt Offset(DL.getIndexTypeSizeInBits(Ty: DestPtr->getType()), 0);
3507 if (IsDest) {
3508 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8,
3509 OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
3510 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3511 Val: DestPtr->stripAndAccumulateConstantOffsets(
3512 DL, Offset, /*AllowNonInbounds*/ true))) {
3513 migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * 8,
3514 SliceSizeInBits: SliceSize * 8, OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
3515 }
3516 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3517 return false;
3518 }
3519
3520 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3521 NewEndOffset == NewAllocaEndOffset;
3522 uint64_t Size = NewEndOffset - NewBeginOffset;
3523 unsigned BeginIndex = VecTy ? getIndex(Offset: NewBeginOffset) : 0;
3524 unsigned EndIndex = VecTy ? getIndex(Offset: NewEndOffset) : 0;
3525 unsigned NumElements = EndIndex - BeginIndex;
3526 IntegerType *SubIntTy =
3527 IntTy ? Type::getIntNTy(C&: IntTy->getContext(), N: Size * 8) : nullptr;
3528
3529 // Reset the other pointer type to match the register type we're going to
3530 // use, but using the address space of the original other pointer.
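// For example (illustrative), copying 8 bytes out of a <4 x i32> promoted
// alloca loads the whole vector, extracts the covered <2 x i32> subvector,
// and stores it through the adjusted other pointer rather than emitting a
// memcpy.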
3531 Type *OtherTy;
3532 if (VecTy && !IsWholeAlloca) {
3533 if (NumElements == 1)
3534 OtherTy = VecTy->getElementType();
3535 else
3536 OtherTy = FixedVectorType::get(ElementType: VecTy->getElementType(), NumElts: NumElements);
3537 } else if (IntTy && !IsWholeAlloca) {
3538 OtherTy = SubIntTy;
3539 } else {
3540 OtherTy = NewAllocaTy;
3541 }
3542
3543 Value *AdjPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
3544 NamePrefix: OtherPtr->getName() + ".");
3545 MaybeAlign SrcAlign = OtherAlign;
3546 MaybeAlign DstAlign = SliceAlign;
3547 if (!IsDest)
3548 std::swap(a&: SrcAlign, b&: DstAlign);
3549
3550 Value *SrcPtr;
3551 Value *DstPtr;
3552
3553 if (IsDest) {
3554 DstPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
3555 SrcPtr = AdjPtr;
3556 } else {
3557 DstPtr = AdjPtr;
3558 SrcPtr = getPtrToNewAI(AddrSpace: II.getSourceAddressSpace(), IsVolatile: II.isVolatile());
3559 }
3560
3561 Value *Src;
3562 if (VecTy && !IsWholeAlloca && !IsDest) {
3563 Src = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3564 Align: NewAI.getAlign(), Name: "load");
3565 Src = extractVector(IRB, V: Src, BeginIndex, EndIndex, Name: "vec");
3566 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3567 Src = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3568 Align: NewAI.getAlign(), Name: "load");
3569 Src = convertValue(DL, IRB, V: Src, NewTy: IntTy);
3570 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3571 Src = extractInteger(DL, IRB, V: Src, Ty: SubIntTy, Offset, Name: "extract");
3572 } else {
3573 LoadInst *Load = IRB.CreateAlignedLoad(Ty: OtherTy, Ptr: SrcPtr, Align: SrcAlign,
3574 isVolatile: II.isVolatile(), Name: "copyload");
3575 Load->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3576 LLVMContext::MD_access_group});
3577 if (AATags)
3578 Load->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3579 AccessTy: Load->getType(), DL));
3580 Src = Load;
3581 }
3582
3583 if (VecTy && !IsWholeAlloca && IsDest) {
3584 Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3585 Align: NewAI.getAlign(), Name: "oldload");
3586 Src = insertVector(IRB, Old, V: Src, BeginIndex, Name: "vec");
3587 } else if (IntTy && !IsWholeAlloca && IsDest) {
3588 Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3589 Align: NewAI.getAlign(), Name: "oldload");
3590 Old = convertValue(DL, IRB, V: Old, NewTy: IntTy);
3591 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3592 Src = insertInteger(DL, IRB, Old, V: Src, Offset, Name: "insert");
3593 Src = convertValue(DL, IRB, V: Src, NewTy: NewAllocaTy);
3594 }
3595
3596 StoreInst *Store = cast<StoreInst>(
3597 Val: IRB.CreateAlignedStore(Val: Src, Ptr: DstPtr, Align: DstAlign, isVolatile: II.isVolatile()));
3598 Store->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3599 LLVMContext::MD_access_group});
3600 if (AATags)
3601 Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3602 AccessTy: Src->getType(), DL));
3603
3604 APInt Offset(DL.getIndexTypeSizeInBits(Ty: DstPtr->getType()), 0);
3605 if (IsDest) {
3606
3607 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
3608 Inst: Store, Dest: DstPtr, Value: Src, DL);
3609 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3610 Val: DstPtr->stripAndAccumulateConstantOffsets(
3611 DL, Offset, /*AllowNonInbounds*/ true))) {
3612 migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * 8, SliceSizeInBits: SliceSize * 8,
3613 OldInst: &II, Inst: Store, Dest: DstPtr, Value: Src, DL);
3614 }
3615
3616 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3617 return !II.isVolatile();
3618 }
3619
3620 bool visitIntrinsicInst(IntrinsicInst &II) {
3621 assert((II.isLifetimeStartOrEnd() || II.isLaunderOrStripInvariantGroup() ||
3622 II.isDroppable()) &&
3623 "Unexpected intrinsic!");
3624 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3625
3626 // Record this instruction for deletion.
3627 Pass.DeadInsts.push_back(Elt: &II);
3628
3629 if (II.isDroppable()) {
3630 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3631 // TODO For now we forget assumed information, this can be improved.
3632 OldPtr->dropDroppableUsesIn(Usr&: II);
3633 return true;
3634 }
3635
3636 if (II.isLaunderOrStripInvariantGroup())
3637 return true;
3638
3639 assert(II.getArgOperand(1) == OldPtr);
3640 // Lifetime intrinsics are only promotable if they cover the whole alloca.
3641 // Therefore, we drop lifetime intrinsics which don't cover the whole
3642 // alloca.
3643 // (In theory, intrinsics which partially cover an alloca could be
3644 // promoted, but PromoteMemToReg doesn't handle that case.)
3645 // FIXME: Check whether the alloca is promotable before dropping the
3646 // lifetime intrinsics?
3647 if (NewBeginOffset != NewAllocaBeginOffset ||
3648 NewEndOffset != NewAllocaEndOffset)
3649 return true;
3650
3651 ConstantInt *Size =
3652 ConstantInt::get(Ty: cast<IntegerType>(Val: II.getArgOperand(i: 0)->getType()),
3653 V: NewEndOffset - NewBeginOffset);
3654 // Lifetime intrinsics always expect an i8* so directly get such a pointer
3655 // for the new alloca slice.
3656 Type *PointerTy = IRB.getPtrTy(AddrSpace: OldPtr->getType()->getPointerAddressSpace());
3657 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3658 Value *New;
3659 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3660 New = IRB.CreateLifetimeStart(Ptr, Size);
3661 else
3662 New = IRB.CreateLifetimeEnd(Ptr, Size);
3663
3664 (void)New;
3665 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3666
3667 return true;
3668 }
3669
3670 void fixLoadStoreAlign(Instruction &Root) {
3671 // This algorithm implements the same visitor loop as
3672 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3673 // or store found.
3674 SmallPtrSet<Instruction *, 4> Visited;
3675 SmallVector<Instruction *, 4> Uses;
3676 Visited.insert(Ptr: &Root);
3677 Uses.push_back(Elt: &Root);
3678 do {
3679 Instruction *I = Uses.pop_back_val();
3680
3681 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
3682 LI->setAlignment(std::min(a: LI->getAlign(), b: getSliceAlign()));
3683 continue;
3684 }
3685 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
3686 SI->setAlignment(std::min(a: SI->getAlign(), b: getSliceAlign()));
3687 continue;
3688 }
3689
3690 assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3691 isa<PHINode>(I) || isa<SelectInst>(I) ||
3692 isa<GetElementPtrInst>(I));
3693 for (User *U : I->users())
3694 if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
3695 Uses.push_back(Elt: cast<Instruction>(Val: U));
3696 } while (!Uses.empty());
3697 }
3698
3699 bool visitPHINode(PHINode &PN) {
3700 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3701 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3702 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3703
3704 // We would like to compute a new pointer in only one place, but have it be
3705 // as local as possible to the PHI. To do that, we re-use the location of
3706 // the old pointer, which necessarily must be in the right position to
3707 // dominate the PHI.
3708 IRBuilderBase::InsertPointGuard Guard(IRB);
3709 if (isa<PHINode>(Val: OldPtr))
3710 IRB.SetInsertPoint(TheBB: OldPtr->getParent(),
3711 IP: OldPtr->getParent()->getFirstInsertionPt());
3712 else
3713 IRB.SetInsertPoint(OldPtr);
3714 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3715
3716 Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3717 // Replace the operands which were using the old pointer.
3718 std::replace(first: PN.op_begin(), last: PN.op_end(), old_value: cast<Value>(Val: OldPtr), new_value: NewPtr);
3719
3720 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3721 deleteIfTriviallyDead(V: OldPtr);
3722
3723 // Fix the alignment of any loads or stores using this PHI node.
3724 fixLoadStoreAlign(Root&: PN);
3725
3726 // PHIs can't be promoted on their own, but often can be speculated. We
3727 // check the speculation outside of the rewriter so that we see the
3728 // fully-rewritten alloca.
3729 PHIUsers.insert(X: &PN);
3730 return true;
3731 }
3732
3733 bool visitSelectInst(SelectInst &SI) {
3734 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3735 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3736 "Pointer isn't an operand!");
3737 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3738 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3739
3740 Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3741 // Replace the operands which were using the old pointer.
3742 if (SI.getOperand(i_nocapture: 1) == OldPtr)
3743 SI.setOperand(i_nocapture: 1, Val_nocapture: NewPtr);
3744 if (SI.getOperand(i_nocapture: 2) == OldPtr)
3745 SI.setOperand(i_nocapture: 2, Val_nocapture: NewPtr);
3746
3747 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3748 deleteIfTriviallyDead(V: OldPtr);
3749
3750 // Fix the alignment of any loads or stores using this select.
3751 fixLoadStoreAlign(Root&: SI);
3752
3753 // Selects can't be promoted on their own, but often can be speculated. We
3754 // check the speculation outside of the rewriter so that we see the
3755 // fully-rewritten alloca.
3756 SelectUsers.insert(X: &SI);
3757 return true;
3758 }
3759};
3760
3761/// Visitor to rewrite aggregate loads and stores as scalar.
3762///
3763/// This pass aggressively rewrites all aggregate loads and stores on
3764/// a particular pointer (or any pointer derived from it which we can identify)
3765/// with scalar loads and stores.
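///
/// For example (illustrative), a load of a {i32, float} aggregate value is
/// rewritten into an i32 load and a float load through GEPs to the two
/// fields, recombined with insertvalue instructions.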
3766class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3767 // Befriend the base class so it can delegate to private visit methods.
3768 friend class InstVisitor<AggLoadStoreRewriter, bool>;
3769
3770 /// Queue of pointer uses to analyze and potentially rewrite.
3771 SmallVector<Use *, 8> Queue;
3772
3773 /// Set to prevent us from cycling with phi nodes and loops.
3774 SmallPtrSet<User *, 8> Visited;
3775
3776 /// The current pointer use being rewritten. This is used to dig up the used
3777 /// value (as opposed to the user).
3778 Use *U = nullptr;
3779
3780 /// Used to calculate offsets, and hence alignment, of subobjects.
3781 const DataLayout &DL;
3782
3783 IRBuilderTy &IRB;
3784
3785public:
3786 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
3787 : DL(DL), IRB(IRB) {}
3788
3789 /// Rewrite loads and stores through a pointer and all pointers derived from
3790 /// it.
3791 bool rewrite(Instruction &I) {
3792 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3793 enqueueUsers(I);
3794 bool Changed = false;
3795 while (!Queue.empty()) {
3796 U = Queue.pop_back_val();
3797 Changed |= visit(I: cast<Instruction>(Val: U->getUser()));
3798 }
3799 return Changed;
3800 }
3801
3802private:
3803 /// Enqueue all the users of the given instruction for further processing.
3804 /// This uses a set to de-duplicate users.
3805 void enqueueUsers(Instruction &I) {
3806 for (Use &U : I.uses())
3807 if (Visited.insert(Ptr: U.getUser()).second)
3808 Queue.push_back(Elt: &U);
3809 }
3810
3811 // Conservative default is to not rewrite anything.
3812 bool visitInstruction(Instruction &I) { return false; }
3813
3814 /// Generic recursive split emission class.
3815 template <typename Derived> class OpSplitter {
3816 protected:
3817 /// The builder used to form new instructions.
3818 IRBuilderTy &IRB;
3819
3820 /// The indices to be used with insert- or extractvalue to select the
3821 /// appropriate value within the aggregate.
3822 SmallVector<unsigned, 4> Indices;
3823
3824 /// The indices to a GEP instruction which will move Ptr to the correct slot
3825 /// within the aggregate.
3826 SmallVector<Value *, 4> GEPIndices;
3827
3828 /// The base pointer of the original op, used as a base for GEPing the
3829 /// split operations.
3830 Value *Ptr;
3831
3832 /// The base pointee type being GEPed into.
3833 Type *BaseTy;
3834
3835 /// Known alignment of the base pointer.
3836 Align BaseAlign;
3837
3838 /// Used to calculate the offset of each component so we can correctly
3839 /// deduce its alignment.
3840 const DataLayout &DL;
3841
3842 /// Initialize the splitter with an insertion point, Ptr and start with a
3843 /// single zero GEP index.
3844 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3845 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
3846 : IRB(IRB), GEPIndices(1, IRB.getInt32(C: 0)), Ptr(Ptr), BaseTy(BaseTy),
3847 BaseAlign(BaseAlign), DL(DL) {
3848 IRB.SetInsertPoint(InsertionPoint);
3849 }
3850
3851 public:
3852 /// Generic recursive split emission routine.
3853 ///
3854 /// This method recursively splits an aggregate op (load or store) into
3855 /// scalar or vector ops. It splits recursively until it hits a single value
3856 /// and emits that single value operation via the template argument.
3857 ///
3858 /// The logic of this routine relies on GEPs and insertvalue and
3859 /// extractvalue all operating with the same fundamental index list, merely
3860 /// formatted differently (GEPs need actual values).
3861 ///
3862 /// \param Ty The type being split recursively into smaller ops.
3863 /// \param Agg The aggregate value being built up or stored, depending on
3864 /// whether this is splitting a load or a store respectively.
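///
/// For example (illustrative), splitting an op over { i32, [2 x float] }
/// emits leaf operations for the index lists {0}, {1, 0} and {1, 1}, each
/// addressed through a GEP of the form (0, <indices...>) off the base
/// pointer.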
3865 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
3866 if (Ty->isSingleValueType()) {
3867 unsigned Offset = DL.getIndexedOffsetInType(ElemTy: BaseTy, Indices: GEPIndices);
3868 return static_cast<Derived *>(this)->emitFunc(
3869 Ty, Agg, commonAlignment(A: BaseAlign, Offset), Name);
3870 }
3871
3872 if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) {
3873 unsigned OldSize = Indices.size();
3874 (void)OldSize;
3875 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
3876 ++Idx) {
3877 assert(Indices.size() == OldSize && "Did not return to the old size");
3878 Indices.push_back(Elt: Idx);
3879 GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
3880 emitSplitOps(Ty: ATy->getElementType(), Agg, Name: Name + "." + Twine(Idx));
3881 GEPIndices.pop_back();
3882 Indices.pop_back();
3883 }
3884 return;
3885 }
3886
3887 if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
3888 unsigned OldSize = Indices.size();
3889 (void)OldSize;
3890 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
3891 ++Idx) {
3892 assert(Indices.size() == OldSize && "Did not return to the old size");
3893 Indices.push_back(Elt: Idx);
3894 GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
3895 emitSplitOps(Ty: STy->getElementType(N: Idx), Agg, Name: Name + "." + Twine(Idx));
3896 GEPIndices.pop_back();
3897 Indices.pop_back();
3898 }
3899 return;
3900 }
3901
3902 llvm_unreachable("Only arrays and structs are aggregate loadable types");
3903 }
3904 };
3905
3906 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
3907 AAMDNodes AATags;
3908 // A vector to hold the split components that we want to emit
3909 // separate fake uses for.
3910 SmallVector<Value *, 4> Components;
3911 // A vector to hold all the fake uses of the struct that we are splitting.
3912 // Usually there should only be one, but we are handling the general case.
3913 SmallVector<Instruction *, 1> FakeUses;
3914
3915 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3916 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
3917 IRBuilderTy &IRB)
3918 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
3919 IRB),
3920 AATags(AATags) {}
3921
3922 /// Emit a leaf load of a single value. This is called at the leaves of the
3923 /// recursive emission to actually load values.
3924 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3925 assert(Ty->isSingleValueType());
3926 // Load the single value and insert it using the indices.
3927 Value *GEP =
3928 IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
3929 LoadInst *Load =
3930 IRB.CreateAlignedLoad(Ty, Ptr: GEP, Align: Alignment, Name: Name + ".load");
3931
3932 APInt Offset(
3933 DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), 0);
3934 if (AATags &&
3935 GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset))
3936 Load->setAAMetadata(
3937 AATags.adjustForAccess(Offset: Offset.getZExtValue(), AccessTy: Load->getType(), DL));
3938 // Record the load so we can generate a fake use for this aggregate
3939 // component.
3940 Components.push_back(Elt: Load);
3941
3942 Agg = IRB.CreateInsertValue(Agg, Val: Load, Idxs: Indices, Name: Name + ".insert");
3943 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
3944 }
3945
3946 // Stash the fake uses that use the value generated by this instruction.
3947 void recordFakeUses(LoadInst &LI) {
3948 for (Use &U : LI.uses())
3949 if (auto *II = dyn_cast<IntrinsicInst>(Val: U.getUser()))
3950 if (II->getIntrinsicID() == Intrinsic::fake_use)
3951 FakeUses.push_back(Elt: II);
3952 }
3953
3954 // Replace all fake uses of the aggregate with a series of fake uses, one
3955 // for each split component.
3956 void emitFakeUses() {
3957 for (Instruction *I : FakeUses) {
3958 IRB.SetInsertPoint(I);
3959 for (auto *V : Components)
3960 IRB.CreateIntrinsic(ID: Intrinsic::fake_use, Args: {V});
3961 I->eraseFromParent();
3962 }
3963 }
3964 };
3965
3966 bool visitLoadInst(LoadInst &LI) {
3967 assert(LI.getPointerOperand() == *U);
3968 if (!LI.isSimple() || LI.getType()->isSingleValueType())
3969 return false;
3970
3971 // We have an aggregate being loaded, split it apart.
3972 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3973 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
3974 getAdjustedAlignment(I: &LI, Offset: 0), DL, IRB);
3975 Splitter.recordFakeUses(LI);
3976 Value *V = PoisonValue::get(T: LI.getType());
3977 Splitter.emitSplitOps(Ty: LI.getType(), Agg&: V, Name: LI.getName() + ".fca");
3978 Splitter.emitFakeUses();
3979 Visited.erase(Ptr: &LI);
3980 LI.replaceAllUsesWith(V);
3981 LI.eraseFromParent();
3982 return true;
3983 }
3984
3985 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
3986 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3987 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
3988 const DataLayout &DL, IRBuilderTy &IRB)
3989 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3990 DL, IRB),
3991 AATags(AATags), AggStore(AggStore) {}
3992 AAMDNodes AATags;
3993 StoreInst *AggStore;
3994 /// Emit a leaf store of a single value. This is called at the leaves of the
3995 /// recursive emission to actually produce stores.
3996 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3997 assert(Ty->isSingleValueType());
3998 // Extract the single value and store it using the indices.
3999 //
4000 // The gep and extractvalue values are factored out of the CreateStore
4001 // call to make the output independent of the argument evaluation order.
4002 Value *ExtractValue =
4003 IRB.CreateExtractValue(Agg, Idxs: Indices, Name: Name + ".extract");
4004 Value *InBoundsGEP =
4005 IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
4006 StoreInst *Store =
4007 IRB.CreateAlignedStore(Val: ExtractValue, Ptr: InBoundsGEP, Align: Alignment);
4008
4009 APInt Offset(
4010 DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), 0);
4011 GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset);
4012 if (AATags) {
4013 Store->setAAMetadata(AATags.adjustForAccess(
4014 Offset: Offset.getZExtValue(), AccessTy: ExtractValue->getType(), DL));
4015 }
4016
4017 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4018 // If we cannot (because there's an intervening non-const or unbounded
4019 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4020 // this instruction.
4021 Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4022 if (auto *OldAI = dyn_cast<AllocaInst>(Val: Base)) {
4023 uint64_t SizeInBits =
4024 DL.getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
4025 migrateDebugInfo(OldAlloca: OldAI, /*IsSplit*/ true, OldAllocaOffsetInBits: Offset.getZExtValue() * 8,
4026 SliceSizeInBits: SizeInBits, OldInst: AggStore, Inst: Store,
4027 Dest: Store->getPointerOperand(), Value: Store->getValueOperand(),
4028 DL);
4029 } else {
4030 assert(at::getAssignmentMarkers(Store).empty() &&
4031 at::getDVRAssignmentMarkers(Store).empty() &&
4032 "AT: unexpected debug.assign linked to store through "
4033 "unbounded GEP");
4034 }
4035 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4036 }
4037 };
4038
4039 bool visitStoreInst(StoreInst &SI) {
4040 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4041 return false;
4042 Value *V = SI.getValueOperand();
4043 if (V->getType()->isSingleValueType())
4044 return false;
4045
4046 // We have an aggregate being stored, split it apart.
4047 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4048 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4049 getAdjustedAlignment(I: &SI, Offset: 0), DL, IRB);
4050 Splitter.emitSplitOps(Ty: V->getType(), Agg&: V, Name: V->getName() + ".fca");
4051 Visited.erase(Ptr: &SI);
4052 // The stores replacing SI each have markers describing fragments of the
4053 // assignment so delete the assignment markers linked to SI.
4054 at::deleteAssignmentMarkers(Inst: &SI);
4055 SI.eraseFromParent();
4056 return true;
4057 }
4058
4059 bool visitBitCastInst(BitCastInst &BC) {
4060 enqueueUsers(I&: BC);
4061 return false;
4062 }
4063
4064 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4065 enqueueUsers(I&: ASC);
4066 return false;
4067 }
4068
4069 // Unfold gep (select cond, ptr1, ptr2), idx
4070 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4071 // and gep ptr, (select cond, idx1, idx2)
4072 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
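//
// For example (illustrative IR), given
//   %sel = select i1 %c, ptr %a, ptr %b
//   %gep = getelementptr i32, ptr %sel, i64 1
// the GEP is rewritten to roughly
//   %a.sroa.gep = getelementptr i32, ptr %a, i64 1
//   %b.sroa.gep = getelementptr i32, ptr %b, i64 1
//   %sel.sroa.sel = select i1 %c, ptr %a.sroa.gep, ptr %b.sroa.gep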
4073 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4074 // Check whether the GEP has exactly one select operand and all indices
4075 // will become constant after the transform.
4076 SelectInst *Sel = dyn_cast<SelectInst>(Val: GEPI.getPointerOperand());
4077 for (Value *Op : GEPI.indices()) {
4078 if (auto *SI = dyn_cast<SelectInst>(Val: Op)) {
4079 if (Sel)
4080 return false;
4081
4082 Sel = SI;
4083 if (!isa<ConstantInt>(Val: Sel->getTrueValue()) ||
4084 !isa<ConstantInt>(Val: Sel->getFalseValue()))
4085 return false;
4086 continue;
4087 }
4088
4089 if (!isa<ConstantInt>(Val: Op))
4090 return false;
4091 }
4092
4093 if (!Sel)
4094 return false;
4095
4096 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4097 dbgs() << " original: " << *Sel << "\n";
4098 dbgs() << " " << GEPI << "\n";);
4099
4100 auto GetNewOps = [&](Value *SelOp) {
4101 SmallVector<Value *> NewOps;
4102 for (Value *Op : GEPI.operands())
4103 if (Op == Sel)
4104 NewOps.push_back(Elt: SelOp);
4105 else
4106 NewOps.push_back(Elt: Op);
4107 return NewOps;
4108 };
4109
4110 Value *True = Sel->getTrueValue();
4111 Value *False = Sel->getFalseValue();
4112 SmallVector<Value *> TrueOps = GetNewOps(True);
4113 SmallVector<Value *> FalseOps = GetNewOps(False);
4114
4115 IRB.SetInsertPoint(&GEPI);
4116 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4117
4118 Type *Ty = GEPI.getSourceElementType();
4119 Value *NTrue = IRB.CreateGEP(Ty, Ptr: TrueOps[0], IdxList: ArrayRef(TrueOps).drop_front(),
4120 Name: True->getName() + ".sroa.gep", NW);
4121
4122 Value *NFalse =
4123 IRB.CreateGEP(Ty, Ptr: FalseOps[0], IdxList: ArrayRef(FalseOps).drop_front(),
4124 Name: False->getName() + ".sroa.gep", NW);
4125
4126 Value *NSel = IRB.CreateSelect(C: Sel->getCondition(), True: NTrue, False: NFalse,
4127 Name: Sel->getName() + ".sroa.sel");
4128 Visited.erase(Ptr: &GEPI);
4129 GEPI.replaceAllUsesWith(V: NSel);
4130 GEPI.eraseFromParent();
4131 Instruction *NSelI = cast<Instruction>(Val: NSel);
4132 Visited.insert(Ptr: NSelI);
4133 enqueueUsers(I&: *NSelI);
4134
4135 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4136 dbgs() << " " << *NFalse << "\n";
4137 dbgs() << " " << *NSel << "\n";);
4138
4139 return true;
4140 }
4141
4142 // Unfold gep (phi ptr1, ptr2), idx
4143 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4144 // and gep ptr, (phi idx1, idx2)
4145 // => phi ((gep ptr, idx1), (gep ptr, idx2))
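//
// As with the select case above, this conceptually turns a GEP of a pointer
// phi into a new phi whose incoming values are GEPs emitted at the end of
// the entry block (which is why only arguments, constants, and static
// allocas are accepted as incoming pointers).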
4146 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4147 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4148 // operand (looking through the phi if it is the phi we want to unfold) is
4149 // an instruction besides a static alloca.
4150 PHINode *Phi = dyn_cast<PHINode>(Val: GEPI.getPointerOperand());
4151 auto IsInvalidPointerOperand = [](Value *V) {
4152 if (!isa<Instruction>(Val: V))
4153 return false;
4154 if (auto *AI = dyn_cast<AllocaInst>(Val: V))
4155 return !AI->isStaticAlloca();
4156 return true;
4157 };
4158 if (Phi) {
4159 if (any_of(Range: Phi->operands(), P: IsInvalidPointerOperand))
4160 return false;
4161 } else {
4162 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4163 return false;
4164 }
4165 // Check whether the GEP has exactly one phi operand (including the pointer
4166 // operand) and all indices will become constant after the transform.
4167 for (Value *Op : GEPI.indices()) {
4168 if (auto *SI = dyn_cast<PHINode>(Val: Op)) {
4169 if (Phi)
4170 return false;
4171
4172 Phi = SI;
4173 if (!all_of(Range: Phi->incoming_values(),
4174 P: [](Value *V) { return isa<ConstantInt>(Val: V); }))
4175 return false;
4176 continue;
4177 }
4178
4179 if (!isa<ConstantInt>(Val: Op))
4180 return false;
4181 }
4182
4183 if (!Phi)
4184 return false;
4185
4186 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4187 dbgs() << " original: " << *Phi << "\n";
4188 dbgs() << " " << GEPI << "\n";);
4189
4190 auto GetNewOps = [&](Value *PhiOp) {
4191 SmallVector<Value *> NewOps;
4192 for (Value *Op : GEPI.operands())
4193 if (Op == Phi)
4194 NewOps.push_back(Elt: PhiOp);
4195 else
4196 NewOps.push_back(Elt: Op);
4197 return NewOps;
4198 };
4199
4200 IRB.SetInsertPoint(Phi);
4201 PHINode *NewPhi = IRB.CreatePHI(Ty: GEPI.getType(), NumReservedValues: Phi->getNumIncomingValues(),
4202 Name: Phi->getName() + ".sroa.phi");
4203
4204 Type *SourceTy = GEPI.getSourceElementType();
4205 // We only handle arguments, constants, and static allocas here, so we can
4206 // insert GEPs at the end of the entry block.
4207 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4208 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4209 Value *Op = Phi->getIncomingValue(i: I);
4210 BasicBlock *BB = Phi->getIncomingBlock(i: I);
4211 Value *NewGEP;
4212 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4213 NewGEP = NewPhi->getIncomingValue(i: NI);
4214 } else {
4215 SmallVector<Value *> NewOps = GetNewOps(Op);
4216 NewGEP =
4217 IRB.CreateGEP(Ty: SourceTy, Ptr: NewOps[0], IdxList: ArrayRef(NewOps).drop_front(),
4218 Name: Phi->getName() + ".sroa.gep", NW: GEPI.getNoWrapFlags());
4219 }
4220 NewPhi->addIncoming(V: NewGEP, BB);
4221 }
4222
4223 Visited.erase(Ptr: &GEPI);
4224 GEPI.replaceAllUsesWith(V: NewPhi);
4225 GEPI.eraseFromParent();
4226 Visited.insert(Ptr: NewPhi);
4227 enqueueUsers(I&: *NewPhi);
4228
4229 LLVM_DEBUG(dbgs() << " to: ";
4230 for (Value *In
4231 : NewPhi->incoming_values()) dbgs()
4232 << "\n " << *In;
4233 dbgs() << "\n " << *NewPhi << '\n');
4234
4235 return true;
4236 }
4237
4238 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4239 if (unfoldGEPSelect(GEPI))
4240 return true;
4241
4242 if (unfoldGEPPhi(GEPI))
4243 return true;
4244
4245 enqueueUsers(I&: GEPI);
4246 return false;
4247 }
4248
4249 bool visitPHINode(PHINode &PN) {
4250 enqueueUsers(I&: PN);
4251 return false;
4252 }
4253
4254 bool visitSelectInst(SelectInst &SI) {
4255 enqueueUsers(I&: SI);
4256 return false;
4257 }
4258};
4259
4260} // end anonymous namespace
4261
4262/// Strip aggregate type wrapping.
4263///
4264/// This removes no-op aggregate types wrapping an underlying type. It will
4265/// strip as many layers of types as it can without changing either the type
4266/// size or the allocated size.
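///
/// For example (illustrative): a wrapper such as { { float } } or [1 x i64]
/// strips down to the inner float or i64, while { i32, i32 } is returned
/// unchanged because no single element covers the whole size.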
4267static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4268 if (Ty->isSingleValueType())
4269 return Ty;
4270
4271 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4272 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4273
4274 Type *InnerTy;
4275 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Val: Ty)) {
4276 InnerTy = ArrTy->getElementType();
4277 } else if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
4278 const StructLayout *SL = DL.getStructLayout(Ty: STy);
4279 unsigned Index = SL->getElementContainingOffset(FixedOffset: 0);
4280 InnerTy = STy->getElementType(N: Index);
4281 } else {
4282 return Ty;
4283 }
4284
4285 if (AllocSize > DL.getTypeAllocSize(Ty: InnerTy).getFixedValue() ||
4286 TypeSize > DL.getTypeSizeInBits(Ty: InnerTy).getFixedValue())
4287 return Ty;
4288
4289 return stripAggregateTypeWrapping(DL, Ty: InnerTy);
4290}
4291
4292/// Try to find a partition of the aggregate type passed in for a given
4293/// offset and size.
4294///
4295/// This recurses through the aggregate type and tries to compute a subtype
4296/// based on the offset and size. When the offset and size span a sub-section
4297/// of an array, it will even compute a new array type for that sub-section,
4298/// and the same for structs.
4299///
4300/// Note that this routine is very strict and tries to find a partition of the
4301/// type which produces the *exact* right offset and size. It is not forgiving
4302 /// when the size or offset causes either end of the type-based partition to be off.
4303/// Also, this is a best-effort routine. It is reasonable to give up and not
4304/// return a type if necessary.
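///
/// As an illustration (assuming a typical data layout): for [4 x i32] with
/// Offset 8 and Size 8 this returns [2 x i32]; for { i32, i32, i64 } with
/// Offset 0 and Size 8 it returns the sub-struct { i32, i32 }; an offset or
/// size that lands in the middle of an element yields nullptr instead.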
4305static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4306 uint64_t Size) {
4307 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4308 return stripAggregateTypeWrapping(DL, Ty);
4309 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4310 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4311 return nullptr;
4312
4313 if (isa<ArrayType>(Val: Ty) || isa<VectorType>(Val: Ty)) {
4314 Type *ElementTy;
4315 uint64_t TyNumElements;
4316 if (auto *AT = dyn_cast<ArrayType>(Val: Ty)) {
4317 ElementTy = AT->getElementType();
4318 TyNumElements = AT->getNumElements();
4319 } else {
4320 // FIXME: This isn't right for vectors with non-byte-sized or
4321 // non-power-of-two sized elements.
4322 auto *VT = cast<FixedVectorType>(Val: Ty);
4323 ElementTy = VT->getElementType();
4324 TyNumElements = VT->getNumElements();
4325 }
4326 uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
4327 uint64_t NumSkippedElements = Offset / ElementSize;
4328 if (NumSkippedElements >= TyNumElements)
4329 return nullptr;
4330 Offset -= NumSkippedElements * ElementSize;
4331
4332 // First check if we need to recurse.
4333 if (Offset > 0 || Size < ElementSize) {
4334 // Bail if the partition ends in a different array element.
4335 if ((Offset + Size) > ElementSize)
4336 return nullptr;
4337 // Recurse through the element type trying to peel off offset bytes.
4338 return getTypePartition(DL, Ty: ElementTy, Offset, Size);
4339 }
4340 assert(Offset == 0);
4341
4342 if (Size == ElementSize)
4343 return stripAggregateTypeWrapping(DL, Ty: ElementTy);
4344 assert(Size > ElementSize);
4345 uint64_t NumElements = Size / ElementSize;
4346 if (NumElements * ElementSize != Size)
4347 return nullptr;
4348 return ArrayType::get(ElementType: ElementTy, NumElements);
4349 }
4350
4351 StructType *STy = dyn_cast<StructType>(Val: Ty);
4352 if (!STy)
4353 return nullptr;
4354
4355 const StructLayout *SL = DL.getStructLayout(Ty: STy);
4356
4357 if (SL->getSizeInBits().isScalable())
4358 return nullptr;
4359
4360 if (Offset >= SL->getSizeInBytes())
4361 return nullptr;
4362 uint64_t EndOffset = Offset + Size;
4363 if (EndOffset > SL->getSizeInBytes())
4364 return nullptr;
4365
4366 unsigned Index = SL->getElementContainingOffset(FixedOffset: Offset);
4367 Offset -= SL->getElementOffset(Idx: Index);
4368
4369 Type *ElementTy = STy->getElementType(N: Index);
4370 uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
4371 if (Offset >= ElementSize)
4372 return nullptr; // The offset points into alignment padding.
4373
4374 // See if any partition must be contained by the element.
4375 if (Offset > 0 || Size < ElementSize) {
4376 if ((Offset + Size) > ElementSize)
4377 return nullptr;
4378 return getTypePartition(DL, Ty: ElementTy, Offset, Size);
4379 }
4380 assert(Offset == 0);
4381
4382 if (Size == ElementSize)
4383 return stripAggregateTypeWrapping(DL, Ty: ElementTy);
4384
4385 StructType::element_iterator EI = STy->element_begin() + Index,
4386 EE = STy->element_end();
4387 if (EndOffset < SL->getSizeInBytes()) {
4388 unsigned EndIndex = SL->getElementContainingOffset(FixedOffset: EndOffset);
4389 if (Index == EndIndex)
4390 return nullptr; // Within a single element and its padding.
4391
4392 // Don't try to form "natural" types if the elements don't line up with the
4393 // expected size.
4394 // FIXME: We could potentially recurse down through the last element in the
4395 // sub-struct to find a natural end point.
4396 if (SL->getElementOffset(Idx: EndIndex) != EndOffset)
4397 return nullptr;
4398
4399 assert(Index < EndIndex);
4400 EE = STy->element_begin() + EndIndex;
4401 }
4402
4403 // Try to build up a sub-structure.
4404 StructType *SubTy =
4405 StructType::get(Context&: STy->getContext(), Elements: ArrayRef(EI, EE), isPacked: STy->isPacked());
4406 const StructLayout *SubSL = DL.getStructLayout(Ty: SubTy);
4407 if (Size != SubSL->getSizeInBytes())
4408 return nullptr; // The sub-struct doesn't have quite the size needed.
4409
4410 return SubTy;
4411}
4412
4413/// Pre-split loads and stores to simplify rewriting.
4414///
4415/// We want to break up the splittable load+store pairs as much as
4416/// possible. This is important to do as a preprocessing step, as once we
4417/// start rewriting the accesses to partitions of the alloca we lose the
4418/// necessary information to correctly split apart paired loads and stores
4419/// which both point into this alloca. The case to consider is something like
4420/// the following:
4421///
4422/// %a = alloca [12 x i8]
4423/// %gep1 = getelementptr i8, ptr %a, i32 0
4424/// %gep2 = getelementptr i8, ptr %a, i32 4
4425/// %gep3 = getelementptr i8, ptr %a, i32 8
4426/// store float 0.0, ptr %gep1
4427/// store float 1.0, ptr %gep2
4428/// %v = load i64, ptr %gep1
4429/// store i64 %v, ptr %gep2
4430/// %f1 = load float, ptr %gep2
4431/// %f2 = load float, ptr %gep3
4432///
4433/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4434/// promote everything so we recover the 2 SSA values that should have been
4435/// there all along.
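///
/// As a sketch of the intended outcome for the example above: the i64 load of
/// %gep1 covers bytes [0, 8) and crosses the partition boundary at offset 4,
/// so it is pre-split into two 4-byte (i32) pieces; the paired i64 store to
/// %gep2 is split at the same relative offset so the pieces line up.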
4436///
4437/// \returns true if any changes are made.
4438bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4439 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4440
4441 // Track the loads and stores which are candidates for pre-splitting here, in
4442 // the order they first appear during the partition scan. These give stable
4443 // iteration order and a basis for tracking which loads and stores we
4444 // actually split.
4445 SmallVector<LoadInst *, 4> Loads;
4446 SmallVector<StoreInst *, 4> Stores;
4447
4448 // We need to accumulate the splits required of each load or store where we
4449 // can find them via a direct lookup. This is important to cross-check loads
4450 // and stores against each other. We also track the slice so that we can kill
4451 // all the slices that end up split.
4452 struct SplitOffsets {
4453 Slice *S;
4454 std::vector<uint64_t> Splits;
4455 };
4456 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4457
4458 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4459 // This is important as we also cannot pre-split stores of those loads!
4460 // FIXME: This is all pretty gross. It means that we can be more aggressive
4461 // in pre-splitting when the load feeding the store happens to come from
4462 // a separate alloca. Put another way, the effectiveness of SROA would be
4463 // decreased by a frontend which just concatenated all of its local allocas
4464 // into one big flat alloca. But defeating such patterns is exactly the job
4465   // SROA is tasked with! Sadly, to not have this discrepancy we would have to
4466 // change store pre-splitting to actually force pre-splitting of the load
4467 // that feeds it *and all stores*. That makes pre-splitting much harder, but
4468 // maybe it would make it more principled?
4469 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4470
4471 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4472 for (auto &P : AS.partitions()) {
4473 for (Slice &S : P) {
4474 Instruction *I = cast<Instruction>(Val: S.getUse()->getUser());
4475 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4476 // If this is a load we have to track that it can't participate in any
4477 // pre-splitting. If this is a store of a load we have to track that
4478 // that load also can't participate in any pre-splitting.
4479 if (auto *LI = dyn_cast<LoadInst>(Val: I))
4480 UnsplittableLoads.insert(Ptr: LI);
4481 else if (auto *SI = dyn_cast<StoreInst>(Val: I))
4482 if (auto *LI = dyn_cast<LoadInst>(Val: SI->getValueOperand()))
4483 UnsplittableLoads.insert(Ptr: LI);
4484 continue;
4485 }
4486 assert(P.endOffset() > S.beginOffset() &&
4487 "Empty or backwards partition!");
4488
4489 // Determine if this is a pre-splittable slice.
4490 if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
4491 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4492
4493 // The load must be used exclusively to store into other pointers for
4494 // us to be able to arbitrarily pre-split it. The stores must also be
4495 // simple to avoid changing semantics.
4496 auto IsLoadSimplyStored = [](LoadInst *LI) {
4497 for (User *LU : LI->users()) {
4498 auto *SI = dyn_cast<StoreInst>(Val: LU);
4499 if (!SI || !SI->isSimple())
4500 return false;
4501 }
4502 return true;
4503 };
4504 if (!IsLoadSimplyStored(LI)) {
4505 UnsplittableLoads.insert(Ptr: LI);
4506 continue;
4507 }
4508
4509 Loads.push_back(Elt: LI);
4510 } else if (auto *SI = dyn_cast<StoreInst>(Val: I)) {
4511 if (S.getUse() != &SI->getOperandUse(i: SI->getPointerOperandIndex()))
4512 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4513 continue;
4514 auto *StoredLoad = dyn_cast<LoadInst>(Val: SI->getValueOperand());
4515 if (!StoredLoad || !StoredLoad->isSimple())
4516 continue;
4517 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4518
4519 Stores.push_back(Elt: SI);
4520 } else {
4521 // Other uses cannot be pre-split.
4522 continue;
4523 }
4524
4525 // Record the initial split.
4526 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4527 auto &Offsets = SplitOffsetsMap[I];
4528 assert(Offsets.Splits.empty() &&
4529 "Should not have splits the first time we see an instruction!");
4530 Offsets.S = &S;
4531 Offsets.Splits.push_back(x: P.endOffset() - S.beginOffset());
4532 }
4533
4534 // Now scan the already split slices, and add a split for any of them which
4535 // we're going to pre-split.
4536 for (Slice *S : P.splitSliceTails()) {
4537 auto SplitOffsetsMapI =
4538 SplitOffsetsMap.find(Val: cast<Instruction>(Val: S->getUse()->getUser()));
4539 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4540 continue;
4541 auto &Offsets = SplitOffsetsMapI->second;
4542
4543 assert(Offsets.S == S && "Found a mismatched slice!");
4544 assert(!Offsets.Splits.empty() &&
4545 "Cannot have an empty set of splits on the second partition!");
4546 assert(Offsets.Splits.back() ==
4547 P.beginOffset() - Offsets.S->beginOffset() &&
4548 "Previous split does not end where this one begins!");
4549
4550 // Record each split. The last partition's end isn't needed as the size
4551 // of the slice dictates that.
4552 if (S->endOffset() > P.endOffset())
4553 Offsets.Splits.push_back(x: P.endOffset() - Offsets.S->beginOffset());
4554 }
4555 }
4556
4557 // We may have split loads where some of their stores are split stores. For
4558 // such loads and stores, we can only pre-split them if their splits exactly
4559 // match relative to their starting offset. We have to verify this prior to
4560 // any rewriting.
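  //
  // For instance (hypothetical offsets): a load split at relative offsets {4}
  // cannot feed a store that would have to be split at {2, 6}; both are
  // dropped from the pre-splitting candidates and the load is marked
  // unsplittable.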
4561 llvm::erase_if(C&: Stores, P: [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4562 // Lookup the load we are storing in our map of split
4563 // offsets.
4564 auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4565 // If it was completely unsplittable, then we're done,
4566 // and this store can't be pre-split.
4567 if (UnsplittableLoads.count(Ptr: LI))
4568 return true;
4569
4570 auto LoadOffsetsI = SplitOffsetsMap.find(Val: LI);
4571 if (LoadOffsetsI == SplitOffsetsMap.end())
4572 return false; // Unrelated loads are definitely safe.
4573 auto &LoadOffsets = LoadOffsetsI->second;
4574
4575 // Now lookup the store's offsets.
4576 auto &StoreOffsets = SplitOffsetsMap[SI];
4577
4578 // If the relative offsets of each split in the load and
4579 // store match exactly, then we can split them and we
4580 // don't need to remove them here.
4581 if (LoadOffsets.Splits == StoreOffsets.Splits)
4582 return false;
4583
4584 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4585 << " " << *LI << "\n"
4586 << " " << *SI << "\n");
4587
4588 // We've found a store and load that we need to split
4589 // with mismatched relative splits. Just give up on them
4590 // and remove both instructions from our list of
4591 // candidates.
4592 UnsplittableLoads.insert(Ptr: LI);
4593 return true;
4594 });
4595 // Now we have to go *back* through all the stores, because a later store may
4596 // have caused an earlier store's load to become unsplittable and if it is
4597 // unsplittable for the later store, then we can't rely on it being split in
4598 // the earlier store either.
4599 llvm::erase_if(C&: Stores, P: [&UnsplittableLoads](StoreInst *SI) {
4600 auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4601 return UnsplittableLoads.count(Ptr: LI);
4602 });
4603 // Once we've established all the loads that can't be split for some reason,
4604 // filter any that made it into our list out.
4605 llvm::erase_if(C&: Loads, P: [&UnsplittableLoads](LoadInst *LI) {
4606 return UnsplittableLoads.count(Ptr: LI);
4607 });
4608
4609 // If no loads or stores are left, there is no pre-splitting to be done for
4610 // this alloca.
4611 if (Loads.empty() && Stores.empty())
4612 return false;
4613
4614 // From here on, we can't fail and will be building new accesses, so rig up
4615 // an IR builder.
4616 IRBuilderTy IRB(&AI);
4617
4618 // Collect the new slices which we will merge into the alloca slices.
4619 SmallVector<Slice, 4> NewSlices;
4620
4621 // Track any allocas we end up splitting loads and stores for so we iterate
4622 // on them.
4623 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4624
4625 // At this point, we have collected all of the loads and stores we can
4626 // pre-split, and the specific splits needed for them. We actually do the
4627   // splitting in a specific order so we can handle the case when one of the
4628   // loads is the value operand to one of the stores.
4629 //
4630 // First, we rewrite all of the split loads, and just accumulate each split
4631 // load in a parallel structure. We also build the slices for them and append
4632 // them to the alloca slices.
4633 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4634 std::vector<LoadInst *> SplitLoads;
4635 const DataLayout &DL = AI.getDataLayout();
4636 for (LoadInst *LI : Loads) {
4637 SplitLoads.clear();
4638
4639 auto &Offsets = SplitOffsetsMap[LI];
4640 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4641 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4642 "Load must have type size equal to store size");
4643 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4644 "Load must be >= slice size");
4645
4646 uint64_t BaseOffset = Offsets.S->beginOffset();
4647 assert(BaseOffset + SliceSize > BaseOffset &&
4648 "Cannot represent alloca access size using 64-bit integers!");
4649
4650 Instruction *BasePtr = cast<Instruction>(Val: LI->getPointerOperand());
4651 IRB.SetInsertPoint(LI);
4652
4653 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4654
4655 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4656 int Idx = 0, Size = Offsets.Splits.size();
4657 for (;;) {
4658 auto *PartTy = Type::getIntNTy(C&: LI->getContext(), N: PartSize * 8);
4659 auto AS = LI->getPointerAddressSpace();
4660 auto *PartPtrTy = LI->getPointerOperandType();
4661 LoadInst *PLoad = IRB.CreateAlignedLoad(
4662 Ty: PartTy,
4663 Ptr: getAdjustedPtr(IRB, DL, Ptr: BasePtr,
4664 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
4665 PointerTy: PartPtrTy, NamePrefix: BasePtr->getName() + "."),
4666 Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
4667 /*IsVolatile*/ isVolatile: false, Name: LI->getName());
4668 PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4669 LLVMContext::MD_access_group});
4670
4671 // Append this load onto the list of split loads so we can find it later
4672 // to rewrite the stores.
4673 SplitLoads.push_back(x: PLoad);
4674
4675 // Now build a new slice for the alloca.
4676 NewSlices.push_back(
4677 Elt: Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4678 &PLoad->getOperandUse(i: PLoad->getPointerOperandIndex()),
4679 /*IsSplittable*/ false));
4680 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4681 << ", " << NewSlices.back().endOffset()
4682 << "): " << *PLoad << "\n");
4683
4684 // See if we've handled all the splits.
4685 if (Idx >= Size)
4686 break;
4687
4688 // Setup the next partition.
4689 PartOffset = Offsets.Splits[Idx];
4690 ++Idx;
4691 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4692 }
4693
4694 // Now that we have the split loads, do the slow walk over all uses of the
4695 // load and rewrite them as split stores, or save the split loads to use
4696 // below if the store is going to be split there anyways.
4697 bool DeferredStores = false;
4698 for (User *LU : LI->users()) {
4699 StoreInst *SI = cast<StoreInst>(Val: LU);
4700 if (!Stores.empty() && SplitOffsetsMap.count(Val: SI)) {
4701 DeferredStores = true;
4702 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4703 << "\n");
4704 continue;
4705 }
4706
4707 Value *StoreBasePtr = SI->getPointerOperand();
4708 IRB.SetInsertPoint(SI);
4709 AAMDNodes AATags = SI->getAAMetadata();
4710
4711 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4712
4713 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4714 LoadInst *PLoad = SplitLoads[Idx];
4715 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4716 auto *PartPtrTy = SI->getPointerOperandType();
4717
4718 auto AS = SI->getPointerAddressSpace();
4719 StoreInst *PStore = IRB.CreateAlignedStore(
4720 Val: PLoad,
4721 Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
4722 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
4723 PointerTy: PartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
4724 Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
4725 /*IsVolatile*/ isVolatile: false);
4726 PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4727 LLVMContext::MD_access_group,
4728 LLVMContext::MD_DIAssignID});
4729
4730 if (AATags)
4731 PStore->setAAMetadata(
4732 AATags.adjustForAccess(Offset: PartOffset, AccessTy: PLoad->getType(), DL));
4733 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4734 }
4735
4736 // We want to immediately iterate on any allocas impacted by splitting
4737 // this store, and we have to track any promotable alloca (indicated by
4738 // a direct store) as needing to be resplit because it is no longer
4739 // promotable.
4740 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: StoreBasePtr)) {
4741 ResplitPromotableAllocas.insert(Ptr: OtherAI);
4742 Worklist.insert(X: OtherAI);
4743 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4744 Val: StoreBasePtr->stripInBoundsOffsets())) {
4745 Worklist.insert(X: OtherAI);
4746 }
4747
4748 // Mark the original store as dead.
4749 DeadInsts.push_back(Elt: SI);
4750 }
4751
4752 // Save the split loads if there are deferred stores among the users.
4753 if (DeferredStores)
4754 SplitLoadsMap.insert(KV: std::make_pair(x&: LI, y: std::move(SplitLoads)));
4755
4756 // Mark the original load as dead and kill the original slice.
4757 DeadInsts.push_back(Elt: LI);
4758 Offsets.S->kill();
4759 }
4760
4761 // Second, we rewrite all of the split stores. At this point, we know that
4762 // all loads from this alloca have been split already. For stores of such
4763 // loads, we can simply look up the pre-existing split loads. For stores of
4764 // other loads, we split those loads first and then write split stores of
4765 // them.
4766 for (StoreInst *SI : Stores) {
4767 auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4768 IntegerType *Ty = cast<IntegerType>(Val: LI->getType());
4769 assert(Ty->getBitWidth() % 8 == 0);
4770 uint64_t StoreSize = Ty->getBitWidth() / 8;
4771 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4772
4773 auto &Offsets = SplitOffsetsMap[SI];
4774 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4775 "Slice size should always match load size exactly!");
4776 uint64_t BaseOffset = Offsets.S->beginOffset();
4777 assert(BaseOffset + StoreSize > BaseOffset &&
4778 "Cannot represent alloca access size using 64-bit integers!");
4779
4780 Value *LoadBasePtr = LI->getPointerOperand();
4781 Instruction *StoreBasePtr = cast<Instruction>(Val: SI->getPointerOperand());
4782
4783 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4784
4785 // Check whether we have an already split load.
4786 auto SplitLoadsMapI = SplitLoadsMap.find(Val: LI);
4787 std::vector<LoadInst *> *SplitLoads = nullptr;
4788 if (SplitLoadsMapI != SplitLoadsMap.end()) {
4789 SplitLoads = &SplitLoadsMapI->second;
4790 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
4791 "Too few split loads for the number of splits in the store!");
4792 } else {
4793 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
4794 }
4795
4796 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4797 int Idx = 0, Size = Offsets.Splits.size();
4798 for (;;) {
4799 auto *PartTy = Type::getIntNTy(C&: Ty->getContext(), N: PartSize * 8);
4800 auto *LoadPartPtrTy = LI->getPointerOperandType();
4801 auto *StorePartPtrTy = SI->getPointerOperandType();
4802
4803 // Either lookup a split load or create one.
4804 LoadInst *PLoad;
4805 if (SplitLoads) {
4806 PLoad = (*SplitLoads)[Idx];
4807 } else {
4808 IRB.SetInsertPoint(LI);
4809 auto AS = LI->getPointerAddressSpace();
4810 PLoad = IRB.CreateAlignedLoad(
4811 Ty: PartTy,
4812 Ptr: getAdjustedPtr(IRB, DL, Ptr: LoadBasePtr,
4813 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
4814 PointerTy: LoadPartPtrTy, NamePrefix: LoadBasePtr->getName() + "."),
4815 Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
4816 /*IsVolatile*/ isVolatile: false, Name: LI->getName());
4817 PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4818 LLVMContext::MD_access_group});
4819 }
4820
4821 // And store this partition.
4822 IRB.SetInsertPoint(SI);
4823 auto AS = SI->getPointerAddressSpace();
4824 StoreInst *PStore = IRB.CreateAlignedStore(
4825 Val: PLoad,
4826 Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
4827 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
4828 PointerTy: StorePartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
4829 Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
4830 /*IsVolatile*/ isVolatile: false);
4831 PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4832 LLVMContext::MD_access_group});
4833
4834 // Now build a new slice for the alloca.
4835 NewSlices.push_back(
4836 Elt: Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4837 &PStore->getOperandUse(i: PStore->getPointerOperandIndex()),
4838 /*IsSplittable*/ false));
4839 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4840 << ", " << NewSlices.back().endOffset()
4841 << "): " << *PStore << "\n");
4842 if (!SplitLoads) {
4843 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
4844 }
4845
4846 // See if we've finished all the splits.
4847 if (Idx >= Size)
4848 break;
4849
4850 // Setup the next partition.
4851 PartOffset = Offsets.Splits[Idx];
4852 ++Idx;
4853 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
4854 }
4855
4856 // We want to immediately iterate on any allocas impacted by splitting
4857 // this load, which is only relevant if it isn't a load of this alloca and
4858 // thus we didn't already split the loads above. We also have to keep track
4859 // of any promotable allocas we split loads on as they can no longer be
4860 // promoted.
4861 if (!SplitLoads) {
4862 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: LoadBasePtr)) {
4863 assert(OtherAI != &AI && "We can't re-split our own alloca!");
4864 ResplitPromotableAllocas.insert(Ptr: OtherAI);
4865 Worklist.insert(X: OtherAI);
4866 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4867 Val: LoadBasePtr->stripInBoundsOffsets())) {
4868 assert(OtherAI != &AI && "We can't re-split our own alloca!");
4869 Worklist.insert(X: OtherAI);
4870 }
4871 }
4872
4873 // Mark the original store as dead now that we've split it up and kill its
4874 // slice. Note that we leave the original load in place unless this store
4875 // was its only use. It may in turn be split up if it is an alloca load
4876 // for some other alloca, but it may be a normal load. This may introduce
4877 // redundant loads, but where those can be merged the rest of the optimizer
4878 // should handle the merging, and this uncovers SSA splits which is more
4879 // important. In practice, the original loads will almost always be fully
4880 // split and removed eventually, and the splits will be merged by any
4881 // trivial CSE, including instcombine.
4882 if (LI->hasOneUse()) {
4883 assert(*LI->user_begin() == SI && "Single use isn't this store!");
4884 DeadInsts.push_back(Elt: LI);
4885 }
4886 DeadInsts.push_back(Elt: SI);
4887 Offsets.S->kill();
4888 }
4889
4890   // Remove the killed slices that have been pre-split.
4891 llvm::erase_if(C&: AS, P: [](const Slice &S) { return S.isDead(); });
4892
4893 // Insert our new slices. This will sort and merge them into the sorted
4894 // sequence.
4895 AS.insert(NewSlices);
4896
4897 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
4898#ifndef NDEBUG
4899 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
4900 LLVM_DEBUG(AS.print(dbgs(), I, " "));
4901#endif
4902
4903   // Finally, don't try to promote any allocas that now require re-splitting.
4904 // They have already been added to the worklist above.
4905 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
4906
4907 return true;
4908}
4909
4910/// Rewrite an alloca partition's users.
4911///
4912/// This routine drives both of the rewriting goals of the SROA pass. It tries
4913/// to rewrite uses of an alloca partition to be conducive for SSA value
4914/// promotion. If the partition needs a new, more refined alloca, this will
4915/// build that new alloca, preserving as much type information as possible, and
4916/// rewrite the uses of the old alloca to point at the new one and have the
4917/// appropriate new offsets. It also evaluates how successful the rewrite was
4918/// at enabling promotion and if it was successful queues the alloca to be
4919/// promoted.
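///
/// For instance (illustrative only), a 4-byte partition of
/// "%a = alloca [12 x i8]" whose uses all operate on float may be rebuilt as
/// something like "%a.sroa.0 = alloca float", with the partition's loads and
/// stores rewritten to address the new alloca directly.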
4920AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
4921 Partition &P) {
4922 // Try to compute a friendly type for this partition of the alloca. This
4923 // won't always succeed, in which case we fall back to a legal integer type
4924 // or an i8 array of an appropriate size.
4925 Type *SliceTy = nullptr;
4926 VectorType *SliceVecTy = nullptr;
4927 const DataLayout &DL = AI.getDataLayout();
4928 unsigned VScale = AI.getFunction()->getVScaleValue();
4929
4930 std::pair<Type *, IntegerType *> CommonUseTy =
4931 findCommonType(B: P.begin(), E: P.end(), EndOffset: P.endOffset());
4932 // Do all uses operate on the same type?
4933 if (CommonUseTy.first) {
4934 TypeSize CommonUseSize = DL.getTypeAllocSize(Ty: CommonUseTy.first);
4935 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
4936 SliceTy = CommonUseTy.first;
4937 SliceVecTy = dyn_cast<VectorType>(Val: SliceTy);
4938 }
4939 }
4940 // If not, can we find an appropriate subtype in the original allocated type?
4941 if (!SliceTy)
4942 if (Type *TypePartitionTy = getTypePartition(DL, Ty: AI.getAllocatedType(),
4943 Offset: P.beginOffset(), Size: P.size()))
4944 SliceTy = TypePartitionTy;
4945
4946 // If still not, can we use the largest bitwidth integer type used?
4947 if (!SliceTy && CommonUseTy.second)
4948 if (DL.getTypeAllocSize(Ty: CommonUseTy.second).getFixedValue() >= P.size()) {
4949 SliceTy = CommonUseTy.second;
4950 SliceVecTy = dyn_cast<VectorType>(Val: SliceTy);
4951 }
4952 if ((!SliceTy || (SliceTy->isArrayTy() &&
4953 SliceTy->getArrayElementType()->isIntegerTy())) &&
4954 DL.isLegalInteger(Width: P.size() * 8)) {
4955 SliceTy = Type::getIntNTy(C&: *C, N: P.size() * 8);
4956 }
4957
4958 // If the common use types are not viable for promotion then attempt to find
4959 // another type that is viable.
4960 if (SliceVecTy && !checkVectorTypeForPromotion(P, VTy: SliceVecTy, DL, VScale))
4961 if (Type *TypePartitionTy = getTypePartition(DL, Ty: AI.getAllocatedType(),
4962 Offset: P.beginOffset(), Size: P.size())) {
4963 VectorType *TypePartitionVecTy = dyn_cast<VectorType>(Val: TypePartitionTy);
4964 if (TypePartitionVecTy &&
4965 checkVectorTypeForPromotion(P, VTy: TypePartitionVecTy, DL, VScale))
4966 SliceTy = TypePartitionTy;
4967 }
4968
4969 if (!SliceTy)
4970 SliceTy = ArrayType::get(ElementType: Type::getInt8Ty(C&: *C), NumElements: P.size());
4971 assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
4972
4973 bool IsIntegerPromotable = isIntegerWideningViable(P, AllocaTy: SliceTy, DL);
4974
4975 VectorType *VecTy =
4976 IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
4977 if (VecTy)
4978 SliceTy = VecTy;
4979
4980 // Check for the case where we're going to rewrite to a new alloca of the
4981 // exact same type as the original, and with the same access offsets. In that
4982 // case, re-use the existing alloca, but still run through the rewriter to
4983 // perform phi and select speculation.
4984 // P.beginOffset() can be non-zero even with the same type in a case with
4985 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
4986 AllocaInst *NewAI;
4987 if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
4988 NewAI = &AI;
4989 // FIXME: We should be able to bail at this point with "nothing changed".
4990 // FIXME: We might want to defer PHI speculation until after here.
4991 // FIXME: return nullptr;
4992 } else {
4993 // Make sure the alignment is compatible with P.beginOffset().
4994 const Align Alignment = commonAlignment(A: AI.getAlign(), Offset: P.beginOffset());
4995 // If we will get at least this much alignment from the type alone, leave
4996 // the alloca's alignment unconstrained.
4997 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(Ty: SliceTy);
4998 NewAI = new AllocaInst(
4999 SliceTy, AI.getAddressSpace(), nullptr,
5000 IsUnconstrained ? DL.getPrefTypeAlign(Ty: SliceTy) : Alignment,
5001 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5002 AI.getIterator());
5003 // Copy the old AI debug location over to the new one.
5004 NewAI->setDebugLoc(AI.getDebugLoc());
5005 ++NumNewAllocas;
5006 }
5007
5008 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5009 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5010
5011 // Track the high watermark on the worklist as it is only relevant for
5012 // promoted allocas. We will reset it to this point if the alloca is not in
5013 // fact scheduled for promotion.
5014 unsigned PPWOldSize = PostPromotionWorklist.size();
5015 unsigned NumUses = 0;
5016 SmallSetVector<PHINode *, 8> PHIUsers;
5017 SmallSetVector<SelectInst *, 8> SelectUsers;
5018
5019 AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
5020 P.endOffset(), IsIntegerPromotable, VecTy,
5021 PHIUsers, SelectUsers);
5022 bool Promotable = true;
5023 for (Slice *S : P.splitSliceTails()) {
5024 Promotable &= Rewriter.visit(I: S);
5025 ++NumUses;
5026 }
5027 for (Slice &S : P) {
5028 Promotable &= Rewriter.visit(I: &S);
5029 ++NumUses;
5030 }
5031
5032 NumAllocaPartitionUses += NumUses;
5033 MaxUsesPerAllocaPartition.updateMax(V: NumUses);
5034
5035 // Now that we've processed all the slices in the new partition, check if any
5036 // PHIs or Selects would block promotion.
5037 for (PHINode *PHI : PHIUsers)
5038 if (!isSafePHIToSpeculate(PN&: *PHI)) {
5039 Promotable = false;
5040 PHIUsers.clear();
5041 SelectUsers.clear();
5042 break;
5043 }
5044
5045 SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
5046 NewSelectsToRewrite;
5047 NewSelectsToRewrite.reserve(N: SelectUsers.size());
5048 for (SelectInst *Sel : SelectUsers) {
5049 std::optional<RewriteableMemOps> Ops =
5050 isSafeSelectToSpeculate(SI&: *Sel, PreserveCFG);
5051 if (!Ops) {
5052 Promotable = false;
5053 PHIUsers.clear();
5054 SelectUsers.clear();
5055 NewSelectsToRewrite.clear();
5056 break;
5057 }
5058 NewSelectsToRewrite.emplace_back(Args: std::make_pair(x&: Sel, y&: *Ops));
5059 }
5060
5061 if (Promotable) {
5062 for (Use *U : AS.getDeadUsesIfPromotable()) {
5063 auto *OldInst = dyn_cast<Instruction>(Val: U->get());
5064 Value::dropDroppableUse(U&: *U);
5065 if (OldInst)
5066 if (isInstructionTriviallyDead(I: OldInst))
5067 DeadInsts.push_back(Elt: OldInst);
5068 }
5069 if (PHIUsers.empty() && SelectUsers.empty()) {
5070 // Promote the alloca.
5071 PromotableAllocas.insert(X: NewAI);
5072 } else {
5073 // If we have either PHIs or Selects to speculate, add them to those
5074       // worklists and re-queue the new alloca so that we promote it on the
5075 // next iteration.
5076 SpeculatablePHIs.insert_range(R&: PHIUsers);
5077 SelectsToRewrite.reserve(NumEntries: SelectsToRewrite.size() +
5078 NewSelectsToRewrite.size());
5079 for (auto &&KV : llvm::make_range(
5080 x: std::make_move_iterator(i: NewSelectsToRewrite.begin()),
5081 y: std::make_move_iterator(i: NewSelectsToRewrite.end())))
5082 SelectsToRewrite.insert(KV: std::move(KV));
5083 Worklist.insert(X: NewAI);
5084 }
5085 } else {
5086 // Drop any post-promotion work items if promotion didn't happen.
5087 while (PostPromotionWorklist.size() > PPWOldSize)
5088 PostPromotionWorklist.pop_back();
5089
5090     // We couldn't promote and we didn't create a new partition, so nothing
5091 // happened.
5092 if (NewAI == &AI)
5093 return nullptr;
5094
5095 // If we can't promote the alloca, iterate on it to check for new
5096 // refinements exposed by splitting the current alloca. Don't iterate on an
5097 // alloca which didn't actually change and didn't get promoted.
5098 Worklist.insert(X: NewAI);
5099 }
5100
5101 return NewAI;
5102}
5103
5104// There isn't a shared interface to get the "address" parts out of a
5105// dbg.declare and dbg.assign, so provide some wrappers now for
5106// both debug intrinsics and records.
5107const Value *getAddress(const DbgVariableIntrinsic *DVI) {
5108 if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(Val: DVI))
5109 return DAI->getAddress();
5110 return cast<DbgDeclareInst>(Val: DVI)->getAddress();
5111}
5112
5113const Value *getAddress(const DbgVariableRecord *DVR) {
5114 return DVR->getAddress();
5115}
5116
5117bool isKillAddress(const DbgVariableIntrinsic *DVI) {
5118 if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(Val: DVI))
5119 return DAI->isKillAddress();
5120 return cast<DbgDeclareInst>(Val: DVI)->isKillLocation();
5121}
5122
5123bool isKillAddress(const DbgVariableRecord *DVR) {
5124 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5125 return DVR->isKillAddress();
5126 return DVR->isKillLocation();
5127}
5128
5129const DIExpression *getAddressExpression(const DbgVariableIntrinsic *DVI) {
5130 if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(Val: DVI))
5131 return DAI->getAddressExpression();
5132 return cast<DbgDeclareInst>(Val: DVI)->getExpression();
5133}
5134
5135const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5136 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5137 return DVR->getAddressExpression();
5138 return DVR->getExpression();
5139}
5140
5141/// Create or replace an existing fragment in a DIExpression with \p Frag.
5142/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5143/// operation, add \p BitExtractOffset to the offset part.
5144///
5145/// Returns the new expression, or nullptr if this fails (see details below).
5146///
5147/// This function is similar to DIExpression::createFragmentExpression except
5148/// for 3 important distinctions:
5149/// 1. The new fragment isn't relative to an existing fragment.
5150/// 2. It assumes the computed location is a memory location. This means we
5151/// don't need to perform checks that creating the fragment preserves the
5152/// expression semantics.
5153/// 3. Existing extract_bits are modified independently of fragment changes
5154/// using \p BitExtractOffset. A change to the fragment offset or size
5155/// may affect a bit extract. But a bit extract offset can change
5156/// independently of the fragment dimensions.
5157///
5158/// Returns the new expression, or nullptr if one couldn't be created.
5159/// Ideally this is only used to signal that a bit-extract has become
5160/// zero-sized (and thus the new debug record has no size and can be
5161/// dropped), however, it fails for other reasons too - see the FIXME below.
5162///
5163/// FIXME: To keep the change that introduces this function NFC it bails
5164 /// in some situations unnecessarily, e.g. when fragment and bit extract
5165/// sizes differ.
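///
/// A minimal sketch of the common case (no bit-extract present): given an
/// existing expression !DIExpression(DW_OP_LLVM_fragment, 0, 64) and a new
/// \p Frag with OffsetInBits 32 and SizeInBits 32, the old fragment op is
/// dropped and the result is !DIExpression(DW_OP_LLVM_fragment, 32, 32).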
5166static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5167 DIExpression::FragmentInfo Frag,
5168 int64_t BitExtractOffset) {
5169 SmallVector<uint64_t, 8> Ops;
5170 bool HasFragment = false;
5171 bool HasBitExtract = false;
5172
5173 for (auto &Op : Expr->expr_ops()) {
5174 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5175 HasFragment = true;
5176 continue;
5177 }
5178 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5179 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5180 HasBitExtract = true;
5181 int64_t ExtractOffsetInBits = Op.getArg(I: 0);
5182 int64_t ExtractSizeInBits = Op.getArg(I: 1);
5183
5184 // DIExpression::createFragmentExpression doesn't know how to handle
5185 // a fragment that is smaller than the extract. Copy the behaviour
5186 // (bail) to avoid non-NFC changes.
5187 // FIXME: Don't do this.
5188 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5189 return nullptr;
5190
5191 assert(BitExtractOffset <= 0);
5192 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5193
5194 // DIExpression::createFragmentExpression doesn't know what to do
5195 // if the new extract starts "outside" the existing one. Copy the
5196 // behaviour (bail) to avoid non-NFC changes.
5197 // FIXME: Don't do this.
5198 if (AdjustedOffset < 0)
5199 return nullptr;
5200
5201 Ops.push_back(Elt: Op.getOp());
5202 Ops.push_back(Elt: std::max<int64_t>(a: 0, b: AdjustedOffset));
5203 Ops.push_back(Elt: ExtractSizeInBits);
5204 continue;
5205 }
5206 Op.appendToVector(V&: Ops);
5207 }
5208
5209 // Unsupported by createFragmentExpression, so don't support it here yet to
5210 // preserve NFC-ness.
5211 if (HasFragment && HasBitExtract)
5212 return nullptr;
5213
5214 if (!HasBitExtract) {
5215 Ops.push_back(Elt: dwarf::DW_OP_LLVM_fragment);
5216 Ops.push_back(Elt: Frag.OffsetInBits);
5217 Ops.push_back(Elt: Frag.SizeInBits);
5218 }
5219 return DIExpression::get(Context&: Expr->getContext(), Elements: Ops);
5220}
5221
5222/// Insert a new dbg.declare.
5223/// \p Orig Original to copy debug loc and variable from.
5224/// \p NewAddr Location's new base address.
5225/// \p NewAddrExpr New expression to apply to address.
5226/// \p BeforeInst Insert position.
5227/// \p NewFragment New fragment (absolute, non-relative).
5228/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5229static void
5230insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig, AllocaInst *NewAddr,
5231 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5232 std::optional<DIExpression::FragmentInfo> NewFragment,
5233 int64_t BitExtractAdjustment) {
5234 if (NewFragment)
5235 NewAddrExpr = createOrReplaceFragment(Expr: NewAddrExpr, Frag: *NewFragment,
5236 BitExtractOffset: BitExtractAdjustment);
5237 if (!NewAddrExpr)
5238 return;
5239
5240 DIB.insertDeclare(Storage: NewAddr, VarInfo: Orig->getVariable(), Expr: NewAddrExpr,
5241 DL: Orig->getDebugLoc(), InsertPt: BeforeInst->getIterator());
5242}
5243
5244/// Insert a new dbg.assign.
5245/// \p Orig Original to copy debug loc, variable, value and value expression
5246/// from.
5247/// \p NewAddr Location's new base address.
5248/// \p NewAddrExpr New expression to apply to address.
5249/// \p BeforeInst Insert position.
5250/// \p NewFragment New fragment (absolute, non-relative).
5251/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5252static void
5253insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig, AllocaInst *NewAddr,
5254 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5255 std::optional<DIExpression::FragmentInfo> NewFragment,
5256 int64_t BitExtractAdjustment) {
5257 // DIBuilder::insertDbgAssign will insert the #dbg_assign after NewAddr.
5258 (void)BeforeInst;
5259
5260 // A dbg.assign puts fragment info in the value expression only. The address
5261 // expression has already been built: NewAddrExpr.
5262 DIExpression *NewFragmentExpr = Orig->getExpression();
5263 if (NewFragment)
5264 NewFragmentExpr = createOrReplaceFragment(Expr: NewFragmentExpr, Frag: *NewFragment,
5265 BitExtractOffset: BitExtractAdjustment);
5266 if (!NewFragmentExpr)
5267 return;
5268
5269 // Apply a DIAssignID to the store if it doesn't already have it.
5270 if (!NewAddr->hasMetadata(KindID: LLVMContext::MD_DIAssignID)) {
5271 NewAddr->setMetadata(KindID: LLVMContext::MD_DIAssignID,
5272 Node: DIAssignID::getDistinct(Context&: NewAddr->getContext()));
5273 }
5274
5275 Instruction *NewAssign = cast<Instruction *>(Val: DIB.insertDbgAssign(
5276 LinkedInstr: NewAddr, Val: Orig->getValue(), SrcVar: Orig->getVariable(), ValExpr: NewFragmentExpr, Addr: NewAddr,
5277 AddrExpr: NewAddrExpr, DL: Orig->getDebugLoc()));
5278 LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n");
5279 (void)NewAssign;
5280}
5281
5282/// Insert a new DbgRecord.
5283/// \p Orig Original to copy record type, debug loc and variable from, and
5284/// additionally value and value expression for dbg_assign records.
5285/// \p NewAddr Location's new base address.
5286/// \p NewAddrExpr New expression to apply to address.
5287/// \p BeforeInst Insert position.
5288/// \p NewFragment New fragment (absolute, non-relative).
5289/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5290static void
5291insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5292 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5293 std::optional<DIExpression::FragmentInfo> NewFragment,
5294 int64_t BitExtractAdjustment) {
5295 (void)DIB;
5296
5297 // A dbg_assign puts fragment info in the value expression only. The address
5298 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5299 // new fragment info into NewAddrExpr (as it only has one expression).
5300 DIExpression *NewFragmentExpr =
5301 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5302 if (NewFragment)
5303 NewFragmentExpr = createOrReplaceFragment(Expr: NewFragmentExpr, Frag: *NewFragment,
5304 BitExtractOffset: BitExtractAdjustment);
5305 if (!NewFragmentExpr)
5306 return;
5307
5308 if (Orig->isDbgDeclare()) {
5309 DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5310 Address: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
5311 BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
5312 Here: BeforeInst->getIterator());
5313 return;
5314 }
5315
5316 if (Orig->isDbgValue()) {
5317 DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5318 Location: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
5319 // Drop debug information if the expression doesn't start with a
5320 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5321     // describes the address of the alloca rather than the value inside it.
5322 if (!NewFragmentExpr->startsWithDeref())
5323 DVR->setKillAddress();
5324 BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
5325 Here: BeforeInst->getIterator());
5326 return;
5327 }
5328
5329 // Apply a DIAssignID to the store if it doesn't already have it.
5330 if (!NewAddr->hasMetadata(KindID: LLVMContext::MD_DIAssignID)) {
5331 NewAddr->setMetadata(KindID: LLVMContext::MD_DIAssignID,
5332 Node: DIAssignID::getDistinct(Context&: NewAddr->getContext()));
5333 }
5334
5335 DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5336 LinkedInstr: NewAddr, Val: Orig->getValue(), Variable: Orig->getVariable(), Expression: NewFragmentExpr, Address: NewAddr,
5337 AddressExpression: NewAddrExpr, DI: Orig->getDebugLoc());
5338 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5339 (void)NewAssign;
5340}
5341
5342 /// Walks the slices of an alloca and forms partitions based on them,
5343/// rewriting each of their uses.
5344bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5345 if (AS.begin() == AS.end())
5346 return false;
5347
5348 unsigned NumPartitions = 0;
5349 bool Changed = false;
5350 const DataLayout &DL = AI.getModule()->getDataLayout();
5351
5352 // First try to pre-split loads and stores.
5353 Changed |= presplitLoadsAndStores(AI, AS);
5354
5355 // Now that we have identified any pre-splitting opportunities,
5356 // mark loads and stores unsplittable except for the following case.
5357 // We leave a slice splittable if all other slices are disjoint or fully
5358 // included in the slice, such as whole-alloca loads and stores.
5359 // If we fail to split these during pre-splitting, we want to force them
5360 // to be rewritten into a partition.
5361 bool IsSorted = true;
5362
5363 uint64_t AllocaSize =
5364 DL.getTypeAllocSize(Ty: AI.getAllocatedType()).getFixedValue();
5365 const uint64_t MaxBitVectorSize = 1024;
5366 if (AllocaSize <= MaxBitVectorSize) {
5367 // If a byte boundary is included in any load or store, a slice starting or
5368 // ending at the boundary is not splittable.
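    //
    // For example (illustrative), an i32 access at bytes [0, 4) clears
    // SplittableOffset for offsets 1-3, so any splittable load or store that
    // begins or ends at one of those interior offsets is made unsplittable
    // below.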
5369 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5370 for (Slice &S : AS)
5371 for (unsigned O = S.beginOffset() + 1;
5372 O < S.endOffset() && O < AllocaSize; O++)
5373 SplittableOffset.reset(Idx: O);
5374
5375 for (Slice &S : AS) {
5376 if (!S.isSplittable())
5377 continue;
5378
5379 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5380 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5381 continue;
5382
5383 if (isa<LoadInst>(Val: S.getUse()->getUser()) ||
5384 isa<StoreInst>(Val: S.getUse()->getUser())) {
5385 S.makeUnsplittable();
5386 IsSorted = false;
5387 }
5388 }
5389 } else {
5390 // We only allow whole-alloca splittable loads and stores
5391     // for a large alloca to avoid creating too large a BitVector.
5392 for (Slice &S : AS) {
5393 if (!S.isSplittable())
5394 continue;
5395
5396 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5397 continue;
5398
5399 if (isa<LoadInst>(Val: S.getUse()->getUser()) ||
5400 isa<StoreInst>(Val: S.getUse()->getUser())) {
5401 S.makeUnsplittable();
5402 IsSorted = false;
5403 }
5404 }
5405 }
5406
5407 if (!IsSorted)
5408 llvm::stable_sort(Range&: AS);
5409
5410 /// Describes the allocas introduced by rewritePartition in order to migrate
5411 /// the debug info.
5412 struct Fragment {
5413 AllocaInst *Alloca;
5414 uint64_t Offset;
5415 uint64_t Size;
5416 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5417 : Alloca(AI), Offset(O), Size(S) {}
5418 };
5419 SmallVector<Fragment, 4> Fragments;
5420
5421 // Rewrite each partition.
5422 for (auto &P : AS.partitions()) {
5423 if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
5424 Changed = true;
5425 if (NewAI != &AI) {
5426 uint64_t SizeOfByte = 8;
5427 uint64_t AllocaSize =
5428 DL.getTypeSizeInBits(Ty: NewAI->getAllocatedType()).getFixedValue();
5429 // Don't include any padding.
5430 uint64_t Size = std::min(a: AllocaSize, b: P.size() * SizeOfByte);
5431 Fragments.push_back(
5432 Elt: Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5433 }
5434 }
5435 ++NumPartitions;
5436 }
5437
5438 NumAllocaPartitions += NumPartitions;
5439 MaxPartitionsPerAlloca.updateMax(V: NumPartitions);
5440
5441 // Migrate debug information from the old alloca to the new alloca(s)
5442 // and the individual partitions.
5443 auto MigrateOne = [&](auto *DbgVariable) {
5444 // Can't overlap with undef memory.
5445 if (isKillAddress(DbgVariable))
5446 return;
5447
5448 const Value *DbgPtr = getAddress(DbgVariable);
5449 DIExpression::FragmentInfo VarFrag =
5450 DbgVariable->getFragmentOrEntireVariable();
5451 // Get the address expression constant offset if one exists and the ops
5452 // that come after it.
5453 int64_t CurrentExprOffsetInBytes = 0;
5454 SmallVector<uint64_t> PostOffsetOps;
5455 if (!getAddressExpression(DbgVariable)
5456 ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5457 return; // Couldn't interpret this DIExpression - drop the var.
5458
5459 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5460 int64_t ExtractOffsetInBits = 0;
5461 for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5462 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5463 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5464 ExtractOffsetInBits = Op.getArg(0);
5465 break;
5466 }
5467 }
5468
5469 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5470 for (auto Fragment : Fragments) {
5471 int64_t OffsetFromLocationInBits;
5472 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5473 // Find the variable fragment that the new alloca slice covers.
5474 // Drop debug info for this variable fragment if we can't compute an
5475 // intersect between it and the alloca slice.
5476 if (!DIExpression::calculateFragmentIntersect(
5477 DL, SliceStart: &AI, SliceOffsetInBits: Fragment.Offset, SliceSizeInBits: Fragment.Size, DbgPtr,
5478 DbgPtrOffsetInBits: CurrentExprOffsetInBytes * 8, DbgExtractOffsetInBits: ExtractOffsetInBits, VarFrag,
5479 Result&: NewDbgFragment, OffsetFromLocationInBits))
5480 continue; // Do not migrate this fragment to this slice.
5481
5482       // A zero-sized fragment indicates there's no intersection between the
5483       // variable fragment and the alloca slice. Skip this slice for this
5484       // variable fragment.
5485 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5486 continue; // Do not migrate this fragment to this slice.
5487
5488 // No fragment indicates DbgVariable's variable or fragment exactly
5489 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5490 if (!NewDbgFragment)
5491 NewDbgFragment = DbgVariable->getFragment();
5492
5493 // Reduce the new expression offset by the bit-extract offset since
5494 // we'll be keeping that.
5495       int64_t OffsetFromNewAllocaInBits =
5496 OffsetFromLocationInBits - ExtractOffsetInBits;
5497 // We need to adjust an existing bit extract if the offset expression
5498 // can't eat the slack (i.e., if the new offset would be negative).
5499 int64_t BitExtractOffset =
5500           std::min<int64_t>(a: 0, b: OffsetFromNewAllocaInBits);
5501 // The magnitude of a negative value indicates the number of bits into
5502 // the existing variable fragment that the memory region begins. The new
5503 // variable fragment already excludes those bits - the new DbgPtr offset
5504 // only needs to be applied if it's positive.
5505       OffsetFromNewAllocaInBits =
5506           std::max(a: int64_t(0), b: OffsetFromNewAllocaInBits);
5507
5508 // Rebuild the expression:
5509       //     {Offset(OffsetFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5510 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5511 // address expression but the value expression instead.
5512 DIExpression *NewExpr = DIExpression::get(Context&: AI.getContext(), Elements: PostOffsetOps);
5513       if (OffsetFromNewAllocaInBits > 0) {
5514         int64_t OffsetInBytes = (OffsetFromNewAllocaInBits + 7) / 8;
5515 NewExpr = DIExpression::prepend(Expr: NewExpr, /*flags=*/Flags: 0, Offset: OffsetInBytes);
5516 }

      // Remove any existing intrinsics on the new alloca describing
      // the variable fragment.
      auto RemoveOne = [DbgVariable](auto *OldDII) {
        auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
          return LHS->getVariable() == RHS->getVariable() &&
                 LHS->getDebugLoc()->getInlinedAt() ==
                     RHS->getDebugLoc()->getInlinedAt();
        };
        if (SameVariableFragment(OldDII, DbgVariable))
          OldDII->eraseFromParent();
      };
      for_each(findDbgDeclares(Fragment.Alloca), RemoveOne);
      for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
      for_each(findDVRValues(Fragment.Alloca), RemoveOne);
      insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
                       NewDbgFragment, BitExtractOffset);
    }
  };

  // Migrate debug information from the old alloca to the new alloca(s)
  // and the individual partitions.
  for_each(findDbgDeclares(&AI), MigrateOne);
  for_each(findDVRDeclares(&AI), MigrateOne);
  for_each(findDVRValues(&AI), MigrateOne);
  for_each(at::getAssignmentMarkers(&AI), MigrateOne);
  for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);

  return Changed;
}

/// Clobber a use with poison, deleting the used value if it becomes dead.
void SROA::clobberUse(Use &U) {
  Value *OldV = U;
  // Replace the use with a poison value.
  U = PoisonValue::get(OldV->getType());

  // Check for this making an instruction dead. We have to garbage collect
  // all the dead instructions to ensure the uses of any alloca end up being
  // minimal.
  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
    if (isInstructionTriviallyDead(OldI)) {
      DeadInsts.push_back(OldI);
    }
}

/// A basic LoadAndStorePromoter that does not remove store nodes.
class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
public:
  BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
                            Type *ZeroType)
      : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
  bool shouldDelete(Instruction *I) const override {
    return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
  }

  Value *getValueToUseForAlloca(Instruction *I) const override {
    return UndefValue::get(ZeroType);
  }

private:
  Type *ZeroType;
};

bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
  // Look through each "partition", looking for slices with the same start/end
  // that do not overlap with any before them. The slices are sorted by
  // increasing beginOffset. We don't use AS.partitions(), as it will use a more
  // sophisticated algorithm that takes splittable slices into account.
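  // For example (offsets are illustrative): two slices covering [0, 8)
  // followed by one covering [8, 16) form two runs that can each be promoted
  // independently, whereas a slice covering [4, 12) overlaps the first run
  // without matching it exactly, so that run is marked invalid and only
  // EndOffset is extended.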
  LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
  bool AllSameAndValid = true;
  Type *PartitionType = nullptr;
  SmallVector<Instruction *> Insts;
  uint64_t BeginOffset = 0;
  uint64_t EndOffset = 0;

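  // Flush ends the current run of slices: if every slice in the run was a
  // simple load or store of one common type, hand the run to the promoter;
  // otherwise drop the run and leave the memory untouched.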
  auto Flush = [&]() {
    if (AllSameAndValid && !Insts.empty()) {
      LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
                        << EndOffset << ")\n");
      SmallVector<PHINode *, 4> NewPHIs;
      SSAUpdater SSA(&NewPHIs);
      Insts.push_back(&AI);
      BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
      Promoter.run(Insts);
    }
    AllSameAndValid = true;
    PartitionType = nullptr;
    Insts.clear();
  };

  for (Slice &S : AS) {
    auto *User = cast<Instruction>(S.getUse()->getUser());
    if (isAssumeLikeIntrinsic(User)) {
      LLVM_DEBUG({
        dbgs() << "Ignoring slice: ";
        AS.print(dbgs(), &S);
      });
      continue;
    }
    if (S.beginOffset() >= EndOffset) {
      Flush();
      BeginOffset = S.beginOffset();
      EndOffset = S.endOffset();
    } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
      if (AllSameAndValid) {
        LLVM_DEBUG({
          dbgs() << "Slice does not match range [" << BeginOffset << ", "
                 << EndOffset << ")";
          AS.print(dbgs(), &S);
        });
        AllSameAndValid = false;
      }
      EndOffset = std::max(EndOffset, S.endOffset());
      continue;
    }

    if (auto *LI = dyn_cast<LoadInst>(User)) {
      Type *UserTy = LI->getType();
      // LoadAndStorePromoter requires all the types to be the same.
      if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
        AllSameAndValid = false;
      PartitionType = UserTy;
      Insts.push_back(User);
    } else if (auto *SI = dyn_cast<StoreInst>(User)) {
      Type *UserTy = SI->getValueOperand()->getType();
      if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
        AllSameAndValid = false;
      PartitionType = UserTy;
      Insts.push_back(User);
    } else {
      AllSameAndValid = false;
    }
  }

  Flush();
  return true;
}

/// Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
std::pair<bool /*Changed*/, bool /*CFGChanged*/>
SROA::runOnAlloca(AllocaInst &AI) {
  bool Changed = false;
  bool CFGChanged = false;

  LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
  ++NumAllocasAnalyzed;

  // Special case dead allocas, as they're trivial.
  if (AI.use_empty()) {
    AI.eraseFromParent();
    Changed = true;
    return {Changed, CFGChanged};
  }
  const DataLayout &DL = AI.getDataLayout();

  // Skip alloca forms that this analysis can't handle.
  auto *AT = AI.getAllocatedType();
  TypeSize Size = DL.getTypeAllocSize(AT);
  if (AI.isArrayAllocation() || !AT->isSized() || Size.isScalable() ||
      Size.getFixedValue() == 0)
    return {Changed, CFGChanged};

  // First, split any FCA loads and stores touching this alloca to promote
  // better splitting and promotion opportunities.
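  // (For example, a store of a {i32, i64} first-class aggregate would be
  // rewritten as two scalar stores, one per element, which the slice builder
  // can then reason about individually.)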
  IRBuilderTy IRB(&AI);
  AggLoadStoreRewriter AggRewriter(DL, IRB);
  Changed |= AggRewriter.rewrite(AI);

  // Build the slices using a recursive instruction-visiting builder.
  AllocaSlices AS(DL, AI);
  LLVM_DEBUG(AS.print(dbgs()));
  if (AS.isEscaped())
    return {Changed, CFGChanged};

  if (AS.isEscapedReadOnly()) {
    Changed |= propagateStoredValuesToLoads(AI, AS);
    return {Changed, CFGChanged};
  }

  // Delete all the dead users of this alloca before splitting and rewriting it.
  for (Instruction *DeadUser : AS.getDeadUsers()) {
    // Free up everything used by this instruction.
    for (Use &DeadOp : DeadUser->operands())
      clobberUse(DeadOp);

    // Now replace the uses of this instruction.
    DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));

    // And mark it for deletion.
    DeadInsts.push_back(DeadUser);
    Changed = true;
  }
  for (Use *DeadOp : AS.getDeadOperands()) {
    clobberUse(*DeadOp);
    Changed = true;
  }

  // No slices to split. Leave the dead alloca for a later pass to clean up.
  if (AS.begin() == AS.end())
    return {Changed, CFGChanged};

  Changed |= splitAlloca(AI, AS);

  LLVM_DEBUG(dbgs() << "  Speculating PHIs\n");
  while (!SpeculatablePHIs.empty())
    speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());

  LLVM_DEBUG(dbgs() << "  Rewriting Selects\n");
  auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
  while (!RemainingSelectsToRewrite.empty()) {
    const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
    CFGChanged |=
        rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
  }

  return {Changed, CFGChanged};
}

/// Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
/// minimize the problems of invalidated instruction pointers as such pointers
/// are used heavily in the intermediate stages of the algorithm.
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
bool SROA::deleteDeadInstructions(
    SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
  bool Changed = false;
  while (!DeadInsts.empty()) {
    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
    if (!I)
      continue;
    LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");

    // If the instruction is an alloca, find the possible dbg.declare connected
    // to it, and remove it too. We must do this before calling RAUW or we will
    // not be able to find it.
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
      DeletedAllocas.insert(AI);
      for (DbgDeclareInst *OldDII : findDbgDeclares(AI))
        OldDII->eraseFromParent();
      for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
        OldDII->eraseFromParent();
    }

    at::deleteAssignmentMarkers(I);
    I->replaceAllUsesWith(UndefValue::get(I->getType()));

    for (Use &Operand : I->operands())
      if (Instruction *U = dyn_cast<Instruction>(Operand)) {
        // Zero out the operand and see if it becomes trivially dead.
        Operand = nullptr;
        if (isInstructionTriviallyDead(U))
          DeadInsts.push_back(U);
      }

    ++NumDeleted;
    I->eraseFromParent();
    Changed = true;
  }
  return Changed;
}

/// Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
/// This function returns whether any promotion occurred.
bool SROA::promoteAllocas() {
  if (PromotableAllocas.empty())
    return false;

  if (SROASkipMem2Reg) {
    LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
  } else {
    LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
    NumPromoted += PromotableAllocas.size();
    PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
  }

  PromotableAllocas.clear();
  return true;
}

std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
  LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");

  const DataLayout &DL = F.getDataLayout();
  BasicBlock &EntryBB = F.getEntryBlock();
  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
       I != E; ++I) {
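    // Allocas with a scalable allocated size can't be analyzed and split
    // slice-by-slice (runOnAlloca gives up on them), but if mem2reg can
    // already promote them wholesale, queue them directly for promotion;
    // everything else goes onto the main worklist.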
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
      if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() &&
          isAllocaPromotable(AI))
        PromotableAllocas.insert(AI);
      else
        Worklist.insert(AI);
    }
  }

  bool Changed = false;
  bool CFGChanged = false;
  // A set of deleted alloca instruction pointers which should be removed from
  // the list of promotable allocas.
  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;

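  // Iterate to a fixed point: splitting one alloca introduces new allocas for
  // its partitions, and rewriting can queue follow-up work through
  // PostPromotionWorklist, so keep going until the worklist stays empty after
  // promotion.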
  do {
    while (!Worklist.empty()) {
      auto [IterationChanged, IterationCFGChanged] =
          runOnAlloca(*Worklist.pop_back_val());
      Changed |= IterationChanged;
      CFGChanged |= IterationCFGChanged;

      Changed |= deleteDeadInstructions(DeletedAllocas);

      // Remove the deleted allocas from various lists so that we don't try to
      // continue processing them.
      if (!DeletedAllocas.empty()) {
        Worklist.set_subtract(DeletedAllocas);
        PostPromotionWorklist.set_subtract(DeletedAllocas);
        PromotableAllocas.set_subtract(DeletedAllocas);
        DeletedAllocas.clear();
      }
    }

    Changed |= promoteAllocas();

    Worklist = PostPromotionWorklist;
    PostPromotionWorklist.clear();
  } while (!Worklist.empty());

  assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
  assert((!CFGChanged || !PreserveCFG) &&
         "Should not have modified the CFG when told to preserve it.");

  if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F) {
      RemoveRedundantDbgInstrs(&BB);
    }
  }

  return {Changed, CFGChanged};
}

PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  auto [Changed, CFGChanged] =
      SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  if (!CFGChanged)
    PA.preserveSet<CFGAnalyses>();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

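// Sketch of the resulting pipeline text, assuming the standard mapping of the
// class name to the pass name "sroa": either "sroa<preserve-cfg>" or
// "sroa<modify-cfg>".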
void SROAPass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
      OS, MapClassName2PassName);
  OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
                                                 : "<modify-cfg>");
}

SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}

namespace {

/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
class SROALegacyPass : public FunctionPass {
  SROAOptions PreserveCFG;

public:
  static char ID;

  SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
      : FunctionPass(ID), PreserveCFG(PreserveCFG) {
    initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    AssumptionCache &AC =
        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
    auto [Changed, _] =
        SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

  StringRef getPassName() const override { return "SROA"; }
};

} // end anonymous namespace

char SROALegacyPass::ID = 0;

FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
  return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
                                        : SROAOptions::ModifyCFG);
}

INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
                      "Scalar Replacement Of Aggregates", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
                    false, false)