1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
19/// Because this also performs alloca promotion, it can be thought of as also
20/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
22///
23//===----------------------------------------------------------------------===//
24
#include "llvm/Transforms/Scalar/SROA.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantFolder.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <queue>
#include <string>
#include <tuple>
#include <utility>
#include <variant>
#include <vector>
101
102using namespace llvm;
103
104#define DEBUG_TYPE "sroa"
105
106STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
107STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
108STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
109STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
110STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
111STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
112STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
113STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
114STATISTIC(NumLoadsPredicated,
115 "Number of loads rewritten into predicated loads to allow promotion");
116STATISTIC(
117 NumStoresPredicated,
118 "Number of stores rewritten into predicated loads to allow promotion");
119STATISTIC(NumDeleted, "Number of instructions deleted");
120STATISTIC(NumVectorized, "Number of vectorized aggregates");
121
namespace llvm {
/// Disable running mem2reg during SROA in order to test or debug SROA.
/// Hidden from -help output; intended for LLVM developers only.
static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
                                     cl::Hidden);
// Declared here, defined in another translation unit.
// NOTE(review): presumably disables profile-metadata fixups when profile
// checking is active — confirm semantics at the definition site.
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
} // namespace llvm
128
namespace {

// Forward declarations: these types reference each other and are defined
// later in this file.
class AllocaSliceRewriter;
class AllocaSlices;
class Partition;

/// Records, for each hand (true/false arm) of a select feeding a load,
/// whether that hand may be speculated. Packed into the low two bits of a
/// byte so the whole object fits in the integer half of a PointerIntPair
/// (see PossiblySpeculatableLoad below).
class SelectHandSpeculativity {
  unsigned char Storage = 0; // None are speculatable by default.
  using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
  using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
public:
  SelectHandSpeculativity() = default;
  SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
  bool isSpeculatable(bool isTrueVal) const;
  bool areAllSpeculatable() const;
  bool areAnySpeculatable() const;
  bool areNoneSpeculatable() const;
  // For interop as int half of PointerIntPair.
  explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
  explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
};
// The PointerIntPair interop above assumes this type is exactly one byte.
static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));

/// A load through a select, tagged with the per-hand speculatability.
using PossiblySpeculatableLoad =
    PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
/// A store through a select; never speculatable, so no extra tag needed.
using UnspeculatableStore = StoreInst *;
/// A memory operation on a select that we may be able to rewrite.
using RewriteableMemOp =
    std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
158
/// An optimization pass providing Scalar Replacement of Aggregates.
///
/// This pass takes allocations which can be completely analyzed (that is, they
/// don't escape) and tries to turn them into scalar SSA values. There are
/// a few steps to this process.
///
/// 1) It takes allocations of aggregates and analyzes the ways in which they
/// are used to try to split them into smaller allocations, ideally of
/// a single scalar data type. It will split up memcpy and memset accesses
/// as necessary and try to isolate individual scalar accesses.
/// 2) It will transform accesses into forms which are suitable for SSA value
/// promotion. This can be replacing a memset with a scalar store of an
/// integer value, or it can involve speculating operations on a PHI or
/// select to be a PHI or select of the results.
/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
/// onto insert and extract operations on a vector value, and convert them to
/// this form. By doing so, it will enable promotion of vector aggregates to
/// SSA vector values.
class SROA {
  LLVMContext *const C;
  DomTreeUpdater *const DTU;
  AssumptionCache *const AC;
  // True when the pass must not modify the CFG (SROAOptions::PreserveCFG).
  const bool PreserveCFG;

  /// Worklist of alloca instructions to simplify.
  ///
  /// Each alloca in the function is added to this. Each new alloca formed gets
  /// added to it as well to recursively simplify unless that alloca can be
  /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if
  /// not already present to ensure it is re-visited.
  SmallSetVector<AllocaInst *, 16> Worklist;

  /// A collection of instructions to delete.
  /// We try to batch deletions to simplify code and make things a bit more
  /// efficient. We also make sure there are no dangling pointers: WeakVH
  /// entries null themselves out if the instruction is deleted elsewhere.
  SmallVector<WeakVH, 8> DeadInsts;

  /// Post-promotion worklist.
  ///
  /// Sometimes we discover an alloca which has a high probability of becoming
  /// viable for SROA after a round of promotion takes place. In those cases,
  /// the alloca is enqueued here for re-processing.
  ///
  /// Note that we have to be very careful to clear allocas out of this list in
  /// the event they are deleted.
  SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;

  /// A collection of alloca instructions we can directly promote.
  SetVector<AllocaInst *, SmallVector<AllocaInst *>,
            SmallPtrSet<AllocaInst *, 16>, 16>
      PromotableAllocas;

  /// A worklist of PHIs to speculate prior to promoting allocas.
  ///
  /// All of these PHIs have been checked for the safety of speculation and by
  /// being speculated will allow promoting allocas currently in the promotable
  /// queue.
  SmallSetVector<PHINode *, 8> SpeculatablePHIs;

  /// A worklist of select instructions to rewrite prior to promoting
  /// allocas.
  SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;

  /// Select instructions that use an alloca and are subsequently loaded can be
  /// rewritten to load both input pointers and then select between the result,
  /// allowing the load of the alloca to be promoted.
  /// From this:
  ///   %P2 = select i1 %cond, ptr %Alloca, ptr %Other
  ///   %V = load <type>, ptr %P2
  /// to:
  ///   %V1 = load <type>, ptr %Alloca      -> will be mem2reg'd
  ///   %V2 = load <type>, ptr %Other
  ///   %V = select i1 %cond, <type> %V1, <type> %V2
  ///
  /// We can do this to a select if its only uses are loads
  /// and if either the operand to the select can be loaded unconditionally,
  /// or if we are allowed to perform CFG modifications.
  /// If found an intervening bitcast with a single use of the load,
  /// allow the promotion.
  static std::optional<RewriteableMemOps>
  isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);

public:
  SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
       SROAOptions PreserveCFG_)
      : C(C), DTU(DTU), AC(AC),
        PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}

  /// Main run method used by both the SROAPass and by the legacy pass.
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);

private:
  friend class AllocaSliceRewriter;

  bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
  std::pair<AllocaInst *, uint64_t>
  rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
  bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
  bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
  void clobberUse(Use &U);
  bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
  bool promoteAllocas();
};
264
265} // end anonymous namespace
266
/// Calculate the fragment of a variable to use when slicing a store
/// based on the slice dimensions, existing fragment, and base storage
/// fragment.
/// Results:
/// UseFrag - Use Target as the new fragment.
/// UseNoFrag - The new slice already covers the whole variable.
/// Skip - The new alloca slice doesn't include this variable.
/// FIXME: Can we use calculateFragmentIntersect instead?
namespace {
enum FragCalcResult { UseFrag, UseNoFrag, Skip };
} // end anonymous namespace
278static FragCalcResult
279calculateFragment(DILocalVariable *Variable,
280 uint64_t NewStorageSliceOffsetInBits,
281 uint64_t NewStorageSliceSizeInBits,
282 std::optional<DIExpression::FragmentInfo> StorageFragment,
283 std::optional<DIExpression::FragmentInfo> CurrentFragment,
284 DIExpression::FragmentInfo &Target) {
285 // If the base storage describes part of the variable apply the offset and
286 // the size constraint.
287 if (StorageFragment) {
288 Target.SizeInBits =
289 std::min(a: NewStorageSliceSizeInBits, b: StorageFragment->SizeInBits);
290 Target.OffsetInBits =
291 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
292 } else {
293 Target.SizeInBits = NewStorageSliceSizeInBits;
294 Target.OffsetInBits = NewStorageSliceOffsetInBits;
295 }
296
297 // If this slice extracts the entirety of an independent variable from a
298 // larger alloca, do not produce a fragment expression, as the variable is
299 // not fragmented.
300 if (!CurrentFragment) {
301 if (auto Size = Variable->getSizeInBits()) {
302 // Treat the current fragment as covering the whole variable.
303 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
304 if (Target == CurrentFragment)
305 return UseNoFrag;
306 }
307 }
308
309 // No additional work to do if there isn't a fragment already, or there is
310 // but it already exactly describes the new assignment.
311 if (!CurrentFragment || *CurrentFragment == Target)
312 return UseFrag;
313
314 // Reject the target fragment if it doesn't fit wholly within the current
315 // fragment. TODO: We could instead chop up the target to fit in the case of
316 // a partial overlap.
317 if (Target.startInBits() < CurrentFragment->startInBits() ||
318 Target.endInBits() > CurrentFragment->endInBits())
319 return Skip;
320
321 // Target fits within the current fragment, return it.
322 return UseFrag;
323}
324
325static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
326 return DebugVariable(DVR->getVariable(), std::nullopt,
327 DVR->getDebugLoc().getInlinedAt());
328}
329
/// Find linked dbg.assign and generate a new one with the correct
/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
/// value component is copied from the old dbg.assign to the new.
/// \param OldAlloca Alloca for the variable before splitting.
/// \param IsSplit True if the store (not necessarily alloca)
/// is being split.
/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
/// \param SliceSizeInBits New number of bits being written to.
/// \param OldInst Instruction that is being split.
/// \param Inst New instruction performing this part of the
/// split store.
/// \param Dest Store destination.
/// \param Value Stored value.
/// \param DL Datalayout.
static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
                             uint64_t OldAllocaOffsetInBits,
                             uint64_t SliceSizeInBits, Instruction *OldInst,
                             Instruction *Inst, Value *Dest, Value *Value,
                             const DataLayout &DL) {
  // If we want allocas to be migrated using this helper then we need to ensure
  // that the BaseFragments map code still works. A simple solution would be
  // to choose to always clone alloca dbg_assigns (rather than sometimes
  // "stealing" them).
  assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");

  auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
  // Nothing to do if OldInst has no linked dbg.assign intrinsics.
  if (DVRAssignMarkerRange.empty())
    return;

  LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
  LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
  LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
  LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
                    << "\n");
  LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
  LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
  LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
  LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
  if (Value)
    LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");

  /// Map of aggregate variables to their fragment associated with OldAlloca.
  DenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
      BaseFragments;
  for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
    BaseFragments[getAggregateVariable(DVR)] =
        DVR->getExpression()->getFragmentInfo();

  // The new inst needs a DIAssignID unique metadata tag (if OldInst has
  // one). It shouldn't already have one: assert this assumption.
  assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
  DIAssignID *NewID = nullptr;
  auto &Ctx = Inst->getContext();
  DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
  // The offset arithmetic below only makes sense for statically-sized,
  // entry-block allocas.
  assert(OldAlloca->isStaticAlloca());

  auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
    LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
                      << "\n");
    auto *Expr = DbgAssign->getExpression();
    bool SetKillLocation = false;

    if (IsSplit) {
      std::optional<DIExpression::FragmentInfo> BaseFragment;
      {
        // Variables that were never associated with OldAlloca don't get a new
        // marker for this slice.
        auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
        if (R == BaseFragments.end())
          return;
        BaseFragment = R->second;
      }
      std::optional<DIExpression::FragmentInfo> CurrentFragment =
          Expr->getFragmentInfo();
      DIExpression::FragmentInfo NewFragment;
      FragCalcResult Result = calculateFragment(
          DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
          BaseFragment, CurrentFragment, NewFragment);

      if (Result == Skip)
        return;
      if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
        if (CurrentFragment) {
          // Rewrite NewFragment to be relative to the existing one (this is
          // what createFragmentExpression wants). CalculateFragment has
          // already resolved the size for us. FIXME: Should it return the
          // relative fragment too?
          NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
        }
        // Add the new fragment info to the existing expression if possible.
        if (auto E = DIExpression::createFragmentExpression(
                Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
          Expr = *E;
        } else {
          // Otherwise, add the new fragment info to an empty expression and
          // discard the value component of this dbg.assign as the value cannot
          // be computed with the new fragment.
          Expr = *DIExpression::createFragmentExpression(
              DIExpression::get(Expr->getContext(), {}),
              NewFragment.OffsetInBits, NewFragment.SizeInBits);
          SetKillLocation = true;
        }
      }
    }

    // If we haven't created a DIAssignID ID do that now and attach it to Inst.
    // Lazily created so all markers migrated for this Inst share one ID.
    if (!NewID) {
      NewID = DIAssignID::getDistinct(Ctx);
      Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
    }

    DbgVariableRecord *NewAssign;
    if (IsSplit) {
      ::Value *NewValue = Value ? Value : DbgAssign->getValue();
      NewAssign = cast<DbgVariableRecord>(cast<DbgRecord *>(
          DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
                              Dest, DIExpression::get(Expr->getContext(), {}),
                              DbgAssign->getDebugLoc())));
    } else {
      // The store is not split, simply steal the existing dbg_assign.
      NewAssign = DbgAssign;
      NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
      NewAssign->setAddress(Dest);
      if (Value)
        NewAssign->replaceVariableLocationOp(0u, Value);
      assert(Expr == NewAssign->getExpression());
    }

    // If we've updated the value but the original dbg.assign has an arglist
    // then kill it now - we can't use the requested new value.
    // We can't replace the DIArgList with the new value as it'd leave
    // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
    // an arglist). And we can't keep the DIArgList in case the linked store
    // is being split - in which case the DIArgList + expression may no longer
    // be computing the correct value.
    // This should be a very rare situation as it requires the value being
    // stored to differ from the dbg.assign (i.e., the value has been
    // represented differently in the debug intrinsic for some reason).
    SetKillLocation |=
        Value && (DbgAssign->hasArgList() ||
                  !DbgAssign->getExpression()->isSingleLocationExpression());
    if (SetKillLocation)
      NewAssign->setKillLocation();

    // We could use more precision here at the cost of some additional (code)
    // complexity - if the original dbg.assign was adjacent to its store, we
    // could position this new dbg.assign adjacent to its store rather than the
    // old dbg.assgn. That would result in interleaved dbg.assigns rather than
    // what we get now:
    //    split store !1
    //    split store !2
    //    dbg.assign !1
    //    dbg.assign !2
    // This (current behaviour) results in debug assignments being
    // noted as slightly offset (in code) from the store. In practice this
    // should have little effect on the debugging experience due to the fact
    // that all the split stores should get the same line number.
    if (NewAssign != DbgAssign) {
      NewAssign->moveBefore(DbgAssign->getIterator());
      NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
    }
    LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
  };

  for_each(DVRAssignMarkerRange, MigrateDbgAssign);
}
495
496namespace {
497
498/// A custom IRBuilder inserter which prefixes all names, but only in
499/// Assert builds.
500class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
501 std::string Prefix;
502
503 Twine getNameWithPrefix(const Twine &Name) const {
504 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
505 }
506
507public:
508 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
509
510 void InsertHelper(Instruction *I, const Twine &Name,
511 BasicBlock::iterator InsertPt) const override {
512 IRBuilderDefaultInserter::InsertHelper(I, Name: getNameWithPrefix(Name),
513 InsertPt);
514 }
515};
516
517/// Provide a type for IRBuilder that drops names in release builds.
518using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
519
520/// A used slice of an alloca.
521///
522/// This structure represents a slice of an alloca used by some instruction. It
523/// stores both the begin and end offsets of this use, a pointer to the use
524/// itself, and a flag indicating whether we can classify the use as splittable
525/// or not when forming partitions of the alloca.
526class Slice {
527 /// The beginning offset of the range.
528 uint64_t BeginOffset = 0;
529
530 /// The ending offset, not included in the range.
531 uint64_t EndOffset = 0;
532
533 /// Storage for both the use of this slice and whether it can be
534 /// split.
535 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
536
537public:
538 Slice() = default;
539
540 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable,
541 Value *ProtectedFieldDisc)
542 : BeginOffset(BeginOffset), EndOffset(EndOffset),
543 UseAndIsSplittable(U, IsSplittable),
544 ProtectedFieldDisc(ProtectedFieldDisc) {}
545
546 uint64_t beginOffset() const { return BeginOffset; }
547 uint64_t endOffset() const { return EndOffset; }
548
549 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
550 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
551
552 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
553
554 bool isDead() const { return getUse() == nullptr; }
555 void kill() { UseAndIsSplittable.setPointer(nullptr); }
556
557 // When this access is via an llvm.protected.field.ptr intrinsic, contains
558 // the second argument to the intrinsic, the discriminator.
559 Value *ProtectedFieldDisc;
560
561 /// Support for ordering ranges.
562 ///
563 /// This provides an ordering over ranges such that start offsets are
564 /// always increasing, and within equal start offsets, the end offsets are
565 /// decreasing. Thus the spanning range comes first in a cluster with the
566 /// same start position.
567 bool operator<(const Slice &RHS) const {
568 if (beginOffset() < RHS.beginOffset())
569 return true;
570 if (beginOffset() > RHS.beginOffset())
571 return false;
572 if (isSplittable() != RHS.isSplittable())
573 return !isSplittable();
574 if (endOffset() > RHS.endOffset())
575 return true;
576 return false;
577 }
578
579 /// Support comparison with a single offset to allow binary searches.
580 [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
581 return LHS.beginOffset() < RHSOffset;
582 }
583 [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
584 return LHSOffset < RHS.beginOffset();
585 }
586
587 bool operator==(const Slice &RHS) const {
588 return isSplittable() == RHS.isSplittable() &&
589 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
590 }
591 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
592};
593
/// Representation of the alloca slices.
///
/// This class represents the slices of an alloca which are formed by its
/// various uses. If a pointer escapes, we can't fully build a representation
/// for the slices used and we reflect that in this structure. The uses are
/// stored, sorted by increasing beginning offset and with unsplittable slices
/// starting at a particular offset before splittable slices.
class AllocaSlices {
public:
  /// Construct the slices of a particular alloca.
  AllocaSlices(const DataLayout &DL, AllocaInst &AI);

  /// Test whether a pointer to the allocation escapes our analysis.
  ///
  /// If this is true, the slices are never fully built and should be
  /// ignored.
  bool isEscaped() const { return PointerEscapingInstr; }
  /// Test whether the pointer escapes only through read-only uses.
  bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }

  /// Support for iterating over the slices.
  /// @{
  using iterator = SmallVectorImpl<Slice>::iterator;
  using range = iterator_range<iterator>;

  iterator begin() { return Slices.begin(); }
  iterator end() { return Slices.end(); }

  using const_iterator = SmallVectorImpl<Slice>::const_iterator;
  using const_range = iterator_range<const_iterator>;

  const_iterator begin() const { return Slices.begin(); }
  const_iterator end() const { return Slices.end(); }
  /// @}

  /// Erase a range of slices.
  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }

  /// Insert new slices for this alloca.
  ///
  /// This moves the slices into the alloca's slices collection, and re-sorts
  /// everything so that the usual ordering properties of the alloca's slices
  /// hold.
  void insert(ArrayRef<Slice> NewSlices) {
    int OldSize = Slices.size();
    Slices.append(NewSlices.begin(), NewSlices.end());
    auto SliceI = Slices.begin() + OldSize;
    // Sort just the appended tail, then merge the two already-sorted runs;
    // cheaper than re-sorting the whole vector.
    std::stable_sort(SliceI, Slices.end());
    std::inplace_merge(Slices.begin(), SliceI, Slices.end());
  }

  // Forward declare the iterator and range accessor for walking the
  // partitions.
  class partition_iterator;
  iterator_range<partition_iterator> partitions();

  /// Access the dead users for this alloca.
  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }

  /// Access the users for this alloca that are llvm.protected.field.ptr
  /// intrinsics.
  ArrayRef<IntrinsicInst *> getPFPUsers() const { return PFPUsers; }

  /// Access Uses that should be dropped if the alloca is promotable.
  ArrayRef<Use *> getDeadUsesIfPromotable() const {
    return DeadUseIfPromotable;
  }

  /// Access the dead operands referring to this alloca.
  ///
  /// These are operands which cannot actually be used to refer to the
  /// alloca as they are outside its range and the user doesn't correct for
  /// that. These mostly consist of PHI node inputs and the like which we just
  /// need to replace with undef.
  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
  void printSlice(raw_ostream &OS, const_iterator I,
                  StringRef Indent = " ") const;
  void printUse(raw_ostream &OS, const_iterator I,
                StringRef Indent = " ") const;
  void print(raw_ostream &OS) const;
  void dump(const_iterator I) const;
  void dump() const;
#endif

private:
  template <typename DerivedT, typename RetT = void> class BuilderBase;
  class SliceBuilder;

  friend class AllocaSlices::SliceBuilder;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Handle to alloca instruction to simplify method interfaces.
  AllocaInst &AI;
#endif

  /// The instruction responsible for this alloca not having a known set
  /// of slices.
  ///
  /// When an instruction (potentially) escapes the pointer to the alloca, we
  /// store a pointer to that here and abort trying to form slices of the
  /// alloca. This will be null if the alloca slices are analyzed successfully.
  Instruction *PointerEscapingInstr;
  /// As above, but for an escape through a read-only use.
  Instruction *PointerEscapingInstrReadOnly;

  /// The slices of the alloca.
  ///
  /// We store a vector of the slices formed by uses of the alloca here. This
  /// vector is sorted by increasing begin offset, and then the unsplittable
  /// slices before the splittable ones. See the Slice inner class for more
  /// details.
  SmallVector<Slice, 8> Slices;

  /// Instructions which will become dead if we rewrite the alloca.
  ///
  /// Note that these are not separated by slice. This is because we expect an
  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
  /// all these instructions can simply be removed and replaced with poison as
  /// they come from outside of the allocated space.
  SmallVector<Instruction *, 8> DeadUsers;

  /// Users that are llvm.protected.field.ptr intrinsics. These will be RAUW'd
  /// to their first argument if we rewrite the alloca.
  SmallVector<IntrinsicInst *, 0> PFPUsers;

  /// Uses which will become dead if can promote the alloca.
  SmallVector<Use *, 8> DeadUseIfPromotable;

  /// Operands which will become dead if we rewrite the alloca.
  ///
  /// These are operands that in their particular use can be replaced with
  /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
  /// to PHI nodes and the like. They aren't entirely dead (there might be
  /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
  /// want to swap this particular input for poison to simplify the use lists of
  /// the alloca.
  SmallVector<Use *, 8> DeadOperands;
};
733
/// A partition of the slices.
///
/// An ephemeral representation for a range of slices which can be viewed as
/// a partition of the alloca. This range represents a span of the alloca's
/// memory which cannot be split, and provides access to all of the slices
/// overlapping some part of the partition.
///
/// Objects of this type are produced by traversing the alloca's slices, but
/// are only ephemeral and not persistent.
class Partition {
private:
  // The partition iterator mutates these private members directly while
  // walking the slices, hence the friend declarations.
  friend class AllocaSlices;
  friend class AllocaSlices::partition_iterator;

  using iterator = AllocaSlices::iterator;

  /// The beginning and ending offsets of the alloca for this
  /// partition.
  uint64_t BeginOffset = 0, EndOffset = 0;

  /// The start and end iterators of this partition.
  iterator SI, SJ;

  /// A collection of split slice tails overlapping the partition.
  SmallVector<Slice *, 4> SplitTails;

  /// Raw constructor builds an empty partition starting and ending at
  /// the given iterator.
  Partition(iterator SI) : SI(SI), SJ(SI) {}

public:
  /// The start offset of this partition.
  ///
  /// All of the contained slices start at or after this offset.
  uint64_t beginOffset() const { return BeginOffset; }

  /// The end offset of this partition.
  ///
  /// All of the contained slices end at or before this offset.
  uint64_t endOffset() const { return EndOffset; }

  /// The size of the partition.
  ///
  /// Note that this can never be zero.
  uint64_t size() const {
    assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
    return EndOffset - BeginOffset;
  }

  /// Test whether this partition contains no slices, and merely spans
  /// a region occupied by split slices.
  bool empty() const { return SI == SJ; }

  /// \name Iterate slices that start within the partition.
  /// These may be splittable or unsplittable. They have a begin offset >= the
  /// partition begin offset.
  /// @{
  // FIXME: We should probably define a "concat_iterator" helper and use that
  // to stitch together pointee_iterators over the split tails and the
  // contiguous iterators of the partition. That would give a much nicer
  // interface here. We could then additionally expose filtered iterators for
  // split, unsplit, and unsplittable slices based on the usage patterns.
  iterator begin() const { return SI; }
  iterator end() const { return SJ; }
  /// @}

  /// Get the sequence of split slice tails.
  ///
  /// These tails are of slices which start before this partition but are
  /// split and overlap into the partition. We accumulate these while forming
  /// partitions.
  ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
};
807
808} // end anonymous namespace
809
810/// An iterator over partitions of the alloca's slices.
811///
812/// This iterator implements the core algorithm for partitioning the alloca's
813/// slices. It is a forward iterator as we don't support backtracking for
814/// efficiency reasons, and re-use a single storage area to maintain the
815/// current set of split slices.
816///
817/// It is templated on the slice iterator type to use so that it can operate
818/// with either const or non-const slice iterators.
819class AllocaSlices::partition_iterator
820 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
821 Partition> {
822 friend class AllocaSlices;
823
824 /// Most of the state for walking the partitions is held in a class
825 /// with a nice interface for examining them.
826 Partition P;
827
828 /// We need to keep the end of the slices to know when to stop.
829 AllocaSlices::iterator SE;
830
831 /// We also need to keep track of the maximum split end offset seen.
832 /// FIXME: Do we really?
833 uint64_t MaxSplitSliceEndOffset = 0;
834
835 /// Sets the partition to be empty at given iterator, and sets the
836 /// end iterator.
837 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
838 : P(SI), SE(SE) {
839 // If not already at the end, advance our state to form the initial
840 // partition.
841 if (SI != SE)
842 advance();
843 }
844
845 /// Advance the iterator to the next partition.
846 ///
847 /// Requires that the iterator not be at the end of the slices.
848 void advance() {
849 assert((P.SI != SE || !P.SplitTails.empty()) &&
850 "Cannot advance past the end of the slices!");
851
852 // Clear out any split uses which have ended.
853 if (!P.SplitTails.empty()) {
854 if (P.EndOffset >= MaxSplitSliceEndOffset) {
855 // If we've finished all splits, this is easy.
856 P.SplitTails.clear();
857 MaxSplitSliceEndOffset = 0;
858 } else {
859 // Remove the uses which have ended in the prior partition. This
860 // cannot change the max split slice end because we just checked that
861 // the prior partition ended prior to that max.
862 llvm::erase_if(C&: P.SplitTails,
863 P: [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
864 assert(llvm::any_of(P.SplitTails,
865 [&](Slice *S) {
866 return S->endOffset() == MaxSplitSliceEndOffset;
867 }) &&
868 "Could not find the current max split slice offset!");
869 assert(llvm::all_of(P.SplitTails,
870 [&](Slice *S) {
871 return S->endOffset() <= MaxSplitSliceEndOffset;
872 }) &&
873 "Max split slice end offset is not actually the max!");
874 }
875 }
876
877 // If P.SI is already at the end, then we've cleared the split tail and
878 // now have an end iterator.
879 if (P.SI == SE) {
880 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
881 return;
882 }
883
884 // If we had a non-empty partition previously, set up the state for
885 // subsequent partitions.
886 if (P.SI != P.SJ) {
887 // Accumulate all the splittable slices which started in the old
888 // partition into the split list.
889 for (Slice &S : P)
890 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
891 P.SplitTails.push_back(Elt: &S);
892 MaxSplitSliceEndOffset =
893 std::max(a: S.endOffset(), b: MaxSplitSliceEndOffset);
894 }
895
896 // Start from the end of the previous partition.
897 P.SI = P.SJ;
898
899 // If P.SI is now at the end, we at most have a tail of split slices.
900 if (P.SI == SE) {
901 P.BeginOffset = P.EndOffset;
902 P.EndOffset = MaxSplitSliceEndOffset;
903 return;
904 }
905
906 // If the we have split slices and the next slice is after a gap and is
907 // not splittable immediately form an empty partition for the split
908 // slices up until the next slice begins.
909 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
910 !P.SI->isSplittable()) {
911 P.BeginOffset = P.EndOffset;
912 P.EndOffset = P.SI->beginOffset();
913 return;
914 }
915 }
916
917 // OK, we need to consume new slices. Set the end offset based on the
918 // current slice, and step SJ past it. The beginning offset of the
919 // partition is the beginning offset of the next slice unless we have
920 // pre-existing split slices that are continuing, in which case we begin
921 // at the prior end offset.
922 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
923 P.EndOffset = P.SI->endOffset();
924 ++P.SJ;
925
926 // There are two strategies to form a partition based on whether the
927 // partition starts with an unsplittable slice or a splittable slice.
928 if (!P.SI->isSplittable()) {
929 // When we're forming an unsplittable region, it must always start at
930 // the first slice and will extend through its end.
931 assert(P.BeginOffset == P.SI->beginOffset());
932
933 // Form a partition including all of the overlapping slices with this
934 // unsplittable slice.
935 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
936 if (!P.SJ->isSplittable())
937 P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
938 ++P.SJ;
939 }
940
941 // We have a partition across a set of overlapping unsplittable
942 // partitions.
943 return;
944 }
945
946 // If we're starting with a splittable slice, then we need to form
947 // a synthetic partition spanning it and any other overlapping splittable
948 // splices.
949 assert(P.SI->isSplittable() && "Forming a splittable partition!");
950
951 // Collect all of the overlapping splittable slices.
952 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
953 P.SJ->isSplittable()) {
954 P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
955 ++P.SJ;
956 }
957
958 // Back upiP.EndOffset if we ended the span early when encountering an
959 // unsplittable slice. This synthesizes the early end offset of
960 // a partition spanning only splittable slices.
961 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
962 assert(!P.SJ->isSplittable());
963 P.EndOffset = P.SJ->beginOffset();
964 }
965 }
966
967public:
968 bool operator==(const partition_iterator &RHS) const {
969 assert(SE == RHS.SE &&
970 "End iterators don't match between compared partition iterators!");
971
972 // The observed positions of partitions is marked by the P.SI iterator and
973 // the emptiness of the split slices. The latter is only relevant when
974 // P.SI == SE, as the end iterator will additionally have an empty split
975 // slices list, but the prior may have the same P.SI and a tail of split
976 // slices.
977 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
978 assert(P.SJ == RHS.P.SJ &&
979 "Same set of slices formed two different sized partitions!");
980 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
981 "Same slice position with differently sized non-empty split "
982 "slice tails!");
983 return true;
984 }
985 return false;
986 }
987
988 partition_iterator &operator++() {
989 advance();
990 return *this;
991 }
992
993 Partition &operator*() { return P; }
994};
995
996/// A forward range over the partitions of the alloca's slices.
997///
998/// This accesses an iterator range over the partitions of the alloca's
999/// slices. It computes these partitions on the fly based on the overlapping
1000/// offsets of the slices and the ability to split them. It will visit "empty"
1001/// partitions to cover regions of the alloca only accessed via split
1002/// slices.
1003iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
1004 return make_range(x: partition_iterator(begin(), end()),
1005 y: partition_iterator(end(), end()));
1006}
1007
1008static Value *foldSelectInst(SelectInst &SI) {
1009 // If the condition being selected on is a constant or the same value is
1010 // being selected between, fold the select. Yes this does (rarely) happen
1011 // early on.
1012 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: SI.getCondition()))
1013 return SI.getOperand(i_nocapture: 1 + CI->isZero());
1014 if (SI.getOperand(i_nocapture: 1) == SI.getOperand(i_nocapture: 2))
1015 return SI.getOperand(i_nocapture: 1);
1016
1017 return nullptr;
1018}
1019
1020/// A helper that folds a PHI node or a select.
1021static Value *foldPHINodeOrSelectInst(Instruction &I) {
1022 if (PHINode *PN = dyn_cast<PHINode>(Val: &I)) {
1023 // If PN merges together the same value, return that value.
1024 return PN->hasConstantValue();
1025 }
1026 return foldSelectInst(SI&: cast<SelectInst>(Val&: I));
1027}
1028
1029/// Builder for the alloca slices.
1030///
1031/// This class builds a set of alloca slices by recursively visiting the uses
1032/// of an alloca and making a slice for each load and store at each offset.
1033class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1034 friend class PtrUseVisitor<SliceBuilder>;
1035 friend class InstVisitor<SliceBuilder>;
1036
1037 using Base = PtrUseVisitor<SliceBuilder>;
1038
1039 const uint64_t AllocSize;
1040 AllocaSlices &AS;
1041
1042 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1043 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1044
1045 /// Set to de-duplicate dead instructions found in the use walk.
1046 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1047
1048 // When this access is via an llvm.protected.field.ptr intrinsic, contains
1049 // the second argument to the intrinsic, the discriminator.
1050 Value *ProtectedFieldDisc = nullptr;
1051
1052public:
1053 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1054 : PtrUseVisitor<SliceBuilder>(DL),
1055 AllocSize(AI.getAllocationSize(DL)->getFixedValue()), AS(AS) {}
1056
1057private:
1058 void markAsDead(Instruction &I) {
1059 if (VisitedDeadInsts.insert(Ptr: &I).second)
1060 AS.DeadUsers.push_back(Elt: &I);
1061 }
1062
1063 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1064 bool IsSplittable = false) {
1065 // Completely skip uses which have a zero size or start either before or
1066 // past the end of the allocation.
1067 if (Size == 0 || Offset.uge(RHS: AllocSize)) {
1068 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1069 << Offset
1070 << " which has zero size or starts outside of the "
1071 << AllocSize << " byte alloca:\n"
1072 << " alloca: " << AS.AI << "\n"
1073 << " use: " << I << "\n");
1074 return markAsDead(I);
1075 }
1076
1077 uint64_t BeginOffset = Offset.getZExtValue();
1078 uint64_t EndOffset = BeginOffset + Size;
1079
1080 // Clamp the end offset to the end of the allocation. Note that this is
1081 // formulated to handle even the case where "BeginOffset + Size" overflows.
1082 // This may appear superficially to be something we could ignore entirely,
1083 // but that is not so! There may be widened loads or PHI-node uses where
1084 // some instructions are dead but not others. We can't completely ignore
1085 // them, and so have to record at least the information here.
1086 assert(AllocSize >= BeginOffset); // Established above.
1087 if (Size > AllocSize - BeginOffset) {
1088 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1089 << Offset << " to remain within the " << AllocSize
1090 << " byte alloca:\n"
1091 << " alloca: " << AS.AI << "\n"
1092 << " use: " << I << "\n");
1093 EndOffset = AllocSize;
1094 }
1095
1096 AS.Slices.push_back(
1097 Elt: Slice(BeginOffset, EndOffset, U, IsSplittable, ProtectedFieldDisc));
1098 }
1099
1100 void visitBitCastInst(BitCastInst &BC) {
1101 if (BC.use_empty())
1102 return markAsDead(I&: BC);
1103
1104 return Base::visitBitCastInst(BC);
1105 }
1106
1107 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1108 if (ASC.use_empty())
1109 return markAsDead(I&: ASC);
1110
1111 return Base::visitAddrSpaceCastInst(ASC);
1112 }
1113
1114 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1115 if (GEPI.use_empty())
1116 return markAsDead(I&: GEPI);
1117
1118 return Base::visitGetElementPtrInst(GEPI);
1119 }
1120
1121 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1122 uint64_t Size, bool IsVolatile) {
1123 // We allow splitting of non-volatile loads and stores where the type is an
1124 // integer type. These may be used to implement 'memcpy' or other "transfer
1125 // of bits" patterns.
1126 bool IsSplittable =
1127 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1128
1129 insertUse(I, Offset, Size, IsSplittable);
1130 }
1131
1132 void visitLoadInst(LoadInst &LI) {
1133 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1134 "All simple FCA loads should have been pre-split");
1135
1136 // If there is a load with an unknown offset, we can still perform store
1137 // to load forwarding for other known-offset loads.
1138 if (!IsOffsetKnown)
1139 return PI.setEscapedReadOnly(&LI);
1140
1141 TypeSize Size = DL.getTypeStoreSize(Ty: LI.getType());
1142 if (Size.isScalable()) {
1143 unsigned VScale = LI.getFunction()->getVScaleValue();
1144 if (!VScale)
1145 return PI.setAborted(&LI);
1146
1147 Size = TypeSize::getFixed(ExactSize: Size.getKnownMinValue() * VScale);
1148 }
1149
1150 return handleLoadOrStore(Ty: LI.getType(), I&: LI, Offset, Size: Size.getFixedValue(),
1151 IsVolatile: LI.isVolatile());
1152 }
1153
1154 void visitStoreInst(StoreInst &SI) {
1155 Value *ValOp = SI.getValueOperand();
1156 if (ValOp == *U)
1157 return PI.setEscapedAndAborted(&SI);
1158 if (!IsOffsetKnown)
1159 return PI.setAborted(&SI);
1160
1161 TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValOp->getType());
1162 if (StoreSize.isScalable()) {
1163 unsigned VScale = SI.getFunction()->getVScaleValue();
1164 if (!VScale)
1165 return PI.setAborted(&SI);
1166
1167 StoreSize = TypeSize::getFixed(ExactSize: StoreSize.getKnownMinValue() * VScale);
1168 }
1169
1170 uint64_t Size = StoreSize.getFixedValue();
1171
1172 // If this memory access can be shown to *statically* extend outside the
1173 // bounds of the allocation, it's behavior is undefined, so simply
1174 // ignore it. Note that this is more strict than the generic clamping
1175 // behavior of insertUse. We also try to handle cases which might run the
1176 // risk of overflow.
1177 // FIXME: We should instead consider the pointer to have escaped if this
1178 // function is being instrumented for addressing bugs or race conditions.
1179 if (Size > AllocSize || Offset.ugt(RHS: AllocSize - Size)) {
1180 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1181 << Offset << " which extends past the end of the "
1182 << AllocSize << " byte alloca:\n"
1183 << " alloca: " << AS.AI << "\n"
1184 << " use: " << SI << "\n");
1185 return markAsDead(I&: SI);
1186 }
1187
1188 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1189 "All simple FCA stores should have been pre-split");
1190 handleLoadOrStore(Ty: ValOp->getType(), I&: SI, Offset, Size, IsVolatile: SI.isVolatile());
1191 }
1192
1193 void visitMemSetInst(MemSetInst &II) {
1194 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1195 ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1196 if ((Length && Length->getValue() == 0) ||
1197 (IsOffsetKnown && Offset.uge(RHS: AllocSize)))
1198 // Zero-length mem transfer intrinsics can be ignored entirely.
1199 return markAsDead(I&: II);
1200
1201 if (!IsOffsetKnown)
1202 return PI.setAborted(&II);
1203
1204 insertUse(I&: II, Offset,
1205 Size: Length ? Length->getLimitedValue()
1206 : AllocSize - Offset.getLimitedValue(),
1207 IsSplittable: (bool)Length);
1208 }
1209
1210 void visitMemTransferInst(MemTransferInst &II) {
1211 ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1212 if (Length && Length->getValue() == 0)
1213 // Zero-length mem transfer intrinsics can be ignored entirely.
1214 return markAsDead(I&: II);
1215
1216 // Because we can visit these intrinsics twice, also check to see if the
1217 // first time marked this instruction as dead. If so, skip it.
1218 if (VisitedDeadInsts.count(Ptr: &II))
1219 return;
1220
1221 if (!IsOffsetKnown)
1222 return PI.setAborted(&II);
1223
1224 // This side of the transfer is completely out-of-bounds, and so we can
1225 // nuke the entire transfer. However, we also need to nuke the other side
1226 // if already added to our partitions.
1227 // FIXME: Yet another place we really should bypass this when
1228 // instrumenting for ASan.
1229 if (Offset.uge(RHS: AllocSize)) {
1230 SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
1231 MemTransferSliceMap.find(Val: &II);
1232 if (MTPI != MemTransferSliceMap.end())
1233 AS.Slices[MTPI->second].kill();
1234 return markAsDead(I&: II);
1235 }
1236
1237 uint64_t RawOffset = Offset.getLimitedValue();
1238 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1239
1240 // Check for the special case where the same exact value is used for both
1241 // source and dest.
1242 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1243 // For non-volatile transfers this is a no-op.
1244 if (!II.isVolatile())
1245 return markAsDead(I&: II);
1246
1247 return insertUse(I&: II, Offset, Size, /*IsSplittable=*/false);
1248 }
1249
1250 // If we have seen both source and destination for a mem transfer, then
1251 // they both point to the same alloca.
1252 bool Inserted;
1253 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1254 std::tie(args&: MTPI, args&: Inserted) =
1255 MemTransferSliceMap.insert(KV: std::make_pair(x: &II, y: AS.Slices.size()));
1256 unsigned PrevIdx = MTPI->second;
1257 if (!Inserted) {
1258 Slice &PrevP = AS.Slices[PrevIdx];
1259
1260 // Check if the begin offsets match and this is a non-volatile transfer.
1261 // In that case, we can completely elide the transfer.
1262 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1263 PrevP.kill();
1264 return markAsDead(I&: II);
1265 }
1266
1267 // Otherwise we have an offset transfer within the same alloca. We can't
1268 // split those.
1269 PrevP.makeUnsplittable();
1270 }
1271
1272 // Insert the use now that we've fixed up the splittable nature.
1273 insertUse(I&: II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1274
1275 // Check that we ended up with a valid index in the map.
1276 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1277 "Map index doesn't point back to a slice with this user.");
1278 }
1279
1280 // Disable SRoA for any intrinsics except for lifetime invariants.
1281 // FIXME: What about debug intrinsics? This matches old behavior, but
1282 // doesn't make sense.
1283 void visitIntrinsicInst(IntrinsicInst &II) {
1284 if (II.isDroppable()) {
1285 AS.DeadUseIfPromotable.push_back(Elt: U);
1286 return;
1287 }
1288
1289 if (!IsOffsetKnown)
1290 return PI.setAborted(&II);
1291
1292 if (II.isLifetimeStartOrEnd()) {
1293 insertUse(I&: II, Offset, Size: AllocSize, IsSplittable: true);
1294 return;
1295 }
1296
1297 if (II.getIntrinsicID() == Intrinsic::protected_field_ptr) {
1298 // We only handle loads and stores as users of llvm.protected.field.ptr.
1299 // Other uses may add items to the worklist, which will cause
1300 // ProtectedFieldDisc to be tracked incorrectly.
1301 AS.PFPUsers.push_back(Elt: &II);
1302 ProtectedFieldDisc = II.getArgOperand(i: 1);
1303 for (Use &U : II.uses()) {
1304 this->U = &U;
1305 if (auto *LI = dyn_cast<LoadInst>(Val: U.getUser()))
1306 visitLoadInst(LI&: *LI);
1307 else if (auto *SI = dyn_cast<StoreInst>(Val: U.getUser()))
1308 visitStoreInst(SI&: *SI);
1309 else
1310 PI.setAborted(&II);
1311 if (PI.isAborted())
1312 break;
1313 }
1314 ProtectedFieldDisc = nullptr;
1315 return;
1316 }
1317
1318 Base::visitIntrinsicInst(II);
1319 }
1320
1321 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1322 // We consider any PHI or select that results in a direct load or store of
1323 // the same offset to be a viable use for slicing purposes. These uses
1324 // are considered unsplittable and the size is the maximum loaded or stored
1325 // size.
1326 SmallPtrSet<Instruction *, 4> Visited;
1327 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1328 Visited.insert(Ptr: Root);
1329 Uses.push_back(Elt: std::make_pair(x: cast<Instruction>(Val&: *U), y&: Root));
1330 const DataLayout &DL = Root->getDataLayout();
1331 // If there are no loads or stores, the access is dead. We mark that as
1332 // a size zero access.
1333 Size = 0;
1334 do {
1335 Instruction *I, *UsedI;
1336 std::tie(args&: UsedI, args&: I) = Uses.pop_back_val();
1337
1338 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
1339 TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
1340 if (LoadSize.isScalable()) {
1341 PI.setAborted(LI);
1342 return nullptr;
1343 }
1344 Size = std::max(a: Size, b: LoadSize.getFixedValue());
1345 continue;
1346 }
1347 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
1348 Value *Op = SI->getOperand(i_nocapture: 0);
1349 if (Op == UsedI)
1350 return SI;
1351 TypeSize StoreSize = DL.getTypeStoreSize(Ty: Op->getType());
1352 if (StoreSize.isScalable()) {
1353 PI.setAborted(SI);
1354 return nullptr;
1355 }
1356 Size = std::max(a: Size, b: StoreSize.getFixedValue());
1357 continue;
1358 }
1359
1360 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: I)) {
1361 if (!GEP->hasAllZeroIndices())
1362 return GEP;
1363 } else if (!isa<BitCastInst>(Val: I) && !isa<PHINode>(Val: I) &&
1364 !isa<SelectInst>(Val: I) && !isa<AddrSpaceCastInst>(Val: I)) {
1365 return I;
1366 }
1367
1368 for (User *U : I->users())
1369 if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
1370 Uses.push_back(Elt: std::make_pair(x&: I, y: cast<Instruction>(Val: U)));
1371 } while (!Uses.empty());
1372
1373 return nullptr;
1374 }
1375
1376 void visitPHINodeOrSelectInst(Instruction &I) {
1377 assert(isa<PHINode>(I) || isa<SelectInst>(I));
1378 if (I.use_empty())
1379 return markAsDead(I);
1380
1381 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1382 // instructions in this BB, which may be required during rewriting. Bail out
1383 // on these cases.
1384 if (isa<PHINode>(Val: I) && !I.getParent()->hasInsertionPt())
1385 return PI.setAborted(&I);
1386
1387 // TODO: We could use simplifyInstruction here to fold PHINodes and
1388 // SelectInsts. However, doing so requires to change the current
1389 // dead-operand-tracking mechanism. For instance, suppose neither loading
1390 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1391 // trap either. However, if we simply replace %U with undef using the
1392 // current dead-operand-tracking mechanism, "load (select undef, undef,
1393 // %other)" may trap because the select may return the first operand
1394 // "undef".
1395 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1396 if (Result == *U)
1397 // If the result of the constant fold will be the pointer, recurse
1398 // through the PHI/select as if we had RAUW'ed it.
1399 enqueueUsers(I);
1400 else
1401 // Otherwise the operand to the PHI/select is dead, and we can replace
1402 // it with poison.
1403 AS.DeadOperands.push_back(Elt: U);
1404
1405 return;
1406 }
1407
1408 if (!IsOffsetKnown)
1409 return PI.setAborted(&I);
1410
1411 // See if we already have computed info on this node.
1412 uint64_t &Size = PHIOrSelectSizes[&I];
1413 if (!Size) {
1414 // This is a new PHI/Select, check for an unsafe use of it.
1415 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(Root: &I, Size))
1416 return PI.setAborted(UnsafeI);
1417 }
1418
1419 // For PHI and select operands outside the alloca, we can't nuke the entire
1420 // phi or select -- the other side might still be relevant, so we special
1421 // case them here and use a separate structure to track the operands
1422 // themselves which should be replaced with poison.
1423 // FIXME: This should instead be escaped in the event we're instrumenting
1424 // for address sanitization.
1425 if (Offset.uge(RHS: AllocSize)) {
1426 AS.DeadOperands.push_back(Elt: U);
1427 return;
1428 }
1429
1430 insertUse(I, Offset, Size);
1431 }
1432
1433 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(I&: PN); }
1434
1435 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(I&: SI); }
1436
1437 /// Disable SROA entirely if there are unhandled users of the alloca.
1438 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1439
1440 void visitCallBase(CallBase &CB) {
1441 // If the call operand is read-only and only does a read-only or address
1442 // capture, then we mark it as EscapedReadOnly.
1443 if (CB.isDataOperand(U) &&
1444 !capturesFullProvenance(CC: CB.getCaptureInfo(OpNo: U->getOperandNo())) &&
1445 CB.onlyReadsMemory(OpNo: U->getOperandNo())) {
1446 PI.setEscapedReadOnly(&CB);
1447 return;
1448 }
1449
1450 Base::visitCallBase(CB);
1451 }
1452};
1453
1454AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1455 :
1456#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1457 AI(AI),
1458#endif
1459 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1460 SliceBuilder PB(DL, AI, *this);
1461 SliceBuilder::PtrInfo PtrI = PB.visitPtr(I&: AI);
1462 if (PtrI.isEscaped() || PtrI.isAborted()) {
1463 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1464 // possibly by just storing the PtrInfo in the AllocaSlices.
1465 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1466 : PtrI.getAbortingInst();
1467 assert(PointerEscapingInstr && "Did not track a bad instruction");
1468 return;
1469 }
1470 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1471
1472 llvm::erase_if(C&: Slices, P: [](const Slice &S) { return S.isDead(); });
1473
1474 // Sort the uses. This arranges for the offsets to be in ascending order,
1475 // and the sizes to be in descending order.
1476 llvm::stable_sort(Range&: Slices);
1477}
1478
1479#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1480
/// Print one slice followed by the instruction that uses it.
void AllocaSlices::print(raw_ostream &OS, const_iterator I,
                         StringRef Indent) const {
  printSlice(OS, I, Indent);
  OS << "\n";
  printUse(OS, I, Indent);
}
1487
1488void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1489 StringRef Indent) const {
1490 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1491 << " slice #" << (I - begin())
1492 << (I->isSplittable() ? " (splittable)" : "");
1493}
1494
/// Print the user instruction of the slice's recorded use.
void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
                            StringRef Indent) const {
  OS << Indent << "  used by: " << *I->getUse()->getUser() << "\n";
}
1499
1500void AllocaSlices::print(raw_ostream &OS) const {
1501 if (PointerEscapingInstr) {
1502 OS << "Can't analyze slices for alloca: " << AI << "\n"
1503 << " A pointer to this alloca escaped by:\n"
1504 << " " << *PointerEscapingInstr << "\n";
1505 return;
1506 }
1507
1508 if (PointerEscapingInstrReadOnly)
1509 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1510
1511 OS << "Slices of alloca: " << AI << "\n";
1512 for (const_iterator I = begin(), E = end(); I != E; ++I)
1513 print(OS, I);
1514}
1515
/// Debugger aid: dump a single slice to the debug stream.
LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
  print(dbgs(), I);
}
/// Debugger aid: dump all slices to the debug stream.
LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1520
1521#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1522
1523/// Walk the range of a partitioning looking for a common type to cover this
1524/// sequence of slices.
1525static std::pair<Type *, IntegerType *>
1526findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1527 uint64_t EndOffset) {
1528 Type *Ty = nullptr;
1529 bool TyIsCommon = true;
1530 IntegerType *ITy = nullptr;
1531
1532 // Note that we need to look at *every* alloca slice's Use to ensure we
1533 // always get consistent results regardless of the order of slices.
1534 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1535 Use *U = I->getUse();
1536 if (isa<IntrinsicInst>(Val: *U->getUser()))
1537 continue;
1538 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1539 continue;
1540
1541 Type *UserTy = nullptr;
1542 if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
1543 UserTy = LI->getType();
1544 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
1545 UserTy = SI->getValueOperand()->getType();
1546 }
1547
1548 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(Val: UserTy)) {
1549 // If the type is larger than the partition, skip it. We only encounter
1550 // this for split integer operations where we want to use the type of the
1551 // entity causing the split. Also skip if the type is not a byte width
1552 // multiple.
1553 if (UserITy->getBitWidth() % 8 != 0 ||
1554 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1555 continue;
1556
1557 // Track the largest bitwidth integer type used in this way in case there
1558 // is no common type.
1559 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1560 ITy = UserITy;
1561 }
1562
1563 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1564 // depend on types skipped above.
1565 if (!UserTy || (Ty && Ty != UserTy))
1566 TyIsCommon = false; // Give up on anything but an iN type.
1567 else
1568 Ty = UserTy;
1569 }
1570
1571 return {TyIsCommon ? Ty : nullptr, ITy};
1572}
1573
1574/// PHI instructions that use an alloca and are subsequently loaded can be
1575/// rewritten to load both input pointers in the pred blocks and then PHI the
1576/// results, allowing the load of the alloca to be promoted.
1577/// From this:
1578/// %P2 = phi [i32* %Alloca, i32* %Other]
1579/// %V = load i32* %P2
1580/// to:
1581/// %V1 = load i32* %Alloca -> will be mem2reg'd
1582/// ...
1583/// %V2 = load i32* %Other
1584/// ...
1585/// %V = phi [i32 %V1, i32 %V2]
1586///
1587/// We can do this to a select if its only uses are loads and if the operands
1588/// to the select can be loaded unconditionally.
1589///
1590/// FIXME: This should be hoisted into a generic utility, likely in
1591/// Transforms/Util/Local.h
static bool isSafePHIToSpeculate(PHINode &PN) {
  const DataLayout &DL = PN.getDataLayout();

  // For now, we can only do this promotion if the load is in the same block
  // as the PHI, and if there are no stores between the phi and load.
  // TODO: Allow recursive phi users.
  // TODO: Allow stores.
  BasicBlock *BB = PN.getParent();
  Align MaxAlign;
  uint64_t APWidth = DL.getIndexTypeSizeInBits(Ty: PN.getType());
  // All users must be loads of one common type; it is the type we would
  // speculate in the predecessors.
  Type *LoadType = nullptr;
  for (User *U : PN.users()) {
    LoadInst *LI = dyn_cast<LoadInst>(Val: U);
    if (!LI || !LI->isSimple())
      return false;

    // For now we only allow loads in the same block as the PHI. This is
    // a common case that happens when instcombine merges two loads through
    // a PHI.
    if (LI->getParent() != BB)
      return false;

    if (LoadType) {
      if (LoadType != LI->getType())
        return false;
    } else {
      LoadType = LI->getType();
    }

    // Ensure that there are no instructions between the PHI and the load that
    // could store.
    for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
      if (BBI->mayWriteToMemory())
        return false;

    // Track the strictest (largest) alignment any of the loads requires; the
    // speculated loads in the predecessors must satisfy all of them.
    MaxAlign = std::max(a: MaxAlign, b: LI->getAlign());
  }

  // No load users at all: nothing to speculate.
  if (!LoadType)
    return false;

  APInt LoadSize =
      APInt(APWidth, DL.getTypeStoreSize(Ty: LoadType).getFixedValue());

  // We can only transform this if it is safe to push the loads into the
  // predecessor blocks. The only thing to watch out for is that we can't put
  // a possibly trapping load in the predecessor if it is a critical edge.
  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
    Instruction *TI = PN.getIncomingBlock(i: Idx)->getTerminator();
    Value *InVal = PN.getIncomingValue(i: Idx);

    // If the value is produced by the terminator of the predecessor (an
    // invoke) or it has side-effects, there is no valid place to put a load
    // in the predecessor.
    if (TI == InVal || TI->mayHaveSideEffects())
      return false;

    // If the predecessor has a single successor, then the edge isn't
    // critical.
    if (TI->getNumSuccessors() == 1)
      continue;

    // If this pointer is always safe to load, or if we can prove that there
    // is already a load in the block, then we can move the load to the pred
    // block.
    if (isSafeToLoadUnconditionally(V: InVal, Alignment: MaxAlign, Size: LoadSize, DL, ScanFrom: TI))
      continue;

    return false;
  }

  return true;
}
1665
/// Rewrite all loads of a speculatable PHI into per-predecessor loads.
///
/// Creates a new PHI of the loaded type, replaces each load of \p PN with
/// that PHI, injects one load per predecessor block (reusing a single load
/// for duplicated PHI entries), and finally erases \p PN. Legality must
/// have been established by isSafePHIToSpeculate.
static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
  LLVM_DEBUG(dbgs() << " original: " << PN << "\n");

  // All users are loads of the same type (checked by isSafePHIToSpeculate),
  // so any one of them can serve as the template load.
  LoadInst *SomeLoad = cast<LoadInst>(Val: PN.user_back());
  Type *LoadTy = SomeLoad->getType();
  IRB.SetInsertPoint(&PN);
  PHINode *NewPN = IRB.CreatePHI(Ty: LoadTy, NumReservedValues: PN.getNumIncomingValues(),
                                 Name: PN.getName() + ".sroa.speculated");

  // Get the AA tags and alignment to use from one of the loads. It does not
  // matter which one we get and if any differ.
  AAMDNodes AATags = SomeLoad->getAAMetadata();
  Align Alignment = SomeLoad->getAlign();

  // Rewrite all loads of the PN to use the new PHI.
  while (!PN.use_empty()) {
    LoadInst *LI = cast<LoadInst>(Val: PN.user_back());
    LI->replaceAllUsesWith(V: NewPN);
    LI->eraseFromParent();
  }

  // Inject loads into all of the pred blocks.
  DenseMap<BasicBlock *, Value *> InjectedLoads;
  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
    BasicBlock *Pred = PN.getIncomingBlock(i: Idx);
    Value *InVal = PN.getIncomingValue(i: Idx);

    // A PHI node is allowed to have multiple (duplicated) entries for the same
    // basic block, as long as the value is the same. So if we already injected
    // a load in the predecessor, then we should reuse the same load for all
    // duplicated entries.
    if (Value *V = InjectedLoads.lookup(Val: Pred)) {
      NewPN->addIncoming(V, BB: Pred);
      continue;
    }

    // Place the load right before the predecessor's terminator.
    Instruction *TI = Pred->getTerminator();
    IRB.SetInsertPoint(TI);

    LoadInst *Load = IRB.CreateAlignedLoad(
        Ty: LoadTy, Ptr: InVal, Align: Alignment,
        Name: (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
    ++NumLoadsSpeculated;
    if (AATags)
      Load->setAAMetadata(AATags);
    NewPN->addIncoming(V: Load, BB: Pred);
    InjectedLoads[Pred] = Load;
  }

  LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
  PN.eraseFromParent();
}
1718
/// Record that the requested hand (true or false value) of the select is
/// safe to speculate over. Returns *this to allow chaining.
SelectHandSpeculativity &
SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
  if (isTrueVal)
    Bitfield::set<SelectHandSpeculativity::TrueVal>(Packed&: Storage, Value: true);
  else
    Bitfield::set<SelectHandSpeculativity::FalseVal>(Packed&: Storage, Value: true);
  return *this;
}
1727
1728bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1729 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Packed: Storage)
1730 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Packed: Storage);
1731}
1732
1733bool SelectHandSpeculativity::areAllSpeculatable() const {
1734 return isSpeculatable(/*isTrueVal=*/true) &&
1735 isSpeculatable(/*isTrueVal=*/false);
1736}
1737
1738bool SelectHandSpeculativity::areAnySpeculatable() const {
1739 return isSpeculatable(/*isTrueVal=*/true) ||
1740 isSpeculatable(/*isTrueVal=*/false);
1741}
1742bool SelectHandSpeculativity::areNoneSpeculatable() const {
1743 return !areAnySpeculatable();
1744}
1745
/// Determine which hands of \p SI the load \p LI may be speculated over.
///
/// A hand is speculatable when its pointer is safe to load unconditionally
/// at \p LI. When \p PreserveCFG is set, we return early the moment one hand
/// fails the check: a partially-speculatable select would need CFG edits,
/// and the caller only accepts a fully-speculatable result in that mode.
static SelectHandSpeculativity
isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
  assert(LI.isSimple() && "Only for simple loads");
  SelectHandSpeculativity Spec;

  const DataLayout &DL = SI.getDataLayout();
  // Check both hands; the loop variable shadows the llvm::Value type name.
  for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
    if (isSafeToLoadUnconditionally(V: Value, Ty: LI.getType(), Alignment: LI.getAlign(), DL,
                                    ScanFrom: &LI))
      Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
    else if (PreserveCFG)
      return Spec;

  return Spec;
}
1761
/// Collect the memory operations on \p SI that can be rewritten away.
///
/// Walks the users of the select (looking through single-use bitcasts),
/// classifying each as an unspeculatable store or a possibly-speculatable
/// load. Returns std::nullopt when any user cannot be handled — or would
/// require CFG changes while \p PreserveCFG is set — in which case the
/// select must be left alone.
std::optional<RewriteableMemOps>
SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
  RewriteableMemOps Ops;

  for (User *U : SI.users()) {
    // Look through a single-use bitcast to the underlying memory operation.
    if (auto *BC = dyn_cast<BitCastInst>(Val: U); BC && BC->hasOneUse())
      U = *BC->user_begin();

    if (auto *Store = dyn_cast<StoreInst>(Val: U)) {
      // Note that atomic stores can be transformed; atomic semantics do not
      // have any meaning for a local alloca. Stores are not speculatable,
      // however, so if we can't turn it into a predicated store, we are done.
      if (Store->isVolatile() || PreserveCFG)
        return {}; // Give up on this `select`.
      Ops.emplace_back(Args&: Store);
      continue;
    }

    auto *LI = dyn_cast<LoadInst>(Val: U);

    // Note that atomic loads can be transformed;
    // atomic semantics do not have any meaning for a local alloca.
    if (!LI || LI->isVolatile())
      return {}; // Give up on this `select`.

    PossiblySpeculatableLoad Load(LI);
    if (!LI->isSimple()) {
      // If the `load` is not simple, we can't speculatively execute it,
      // but we could handle this via a CFG modification. But can we?
      if (PreserveCFG)
        return {}; // Give up on this `select`.
      Ops.emplace_back(Args&: Load);
      continue;
    }

    SelectHandSpeculativity Spec =
        isSafeLoadOfSelectToSpeculate(LI&: *LI, SI, PreserveCFG);
    if (PreserveCFG && !Spec.areAllSpeculatable())
      return {}; // Give up on this `select`.

    Load.setInt(Spec);
    Ops.emplace_back(Args&: Load);
  }

  return Ops;
}
1808
1809static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1810 IRBuilderTy &IRB) {
1811 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1812
1813 Value *TV = SI.getTrueValue();
1814 Value *FV = SI.getFalseValue();
1815 // Replace the given load of the select with a select of two loads.
1816
1817 assert(LI.isSimple() && "We only speculate simple loads");
1818
1819 IRB.SetInsertPoint(&LI);
1820
1821 LoadInst *TL =
1822 IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: TV, Align: LI.getAlign(),
1823 Name: LI.getName() + ".sroa.speculate.load.true");
1824 LoadInst *FL =
1825 IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: FV, Align: LI.getAlign(),
1826 Name: LI.getName() + ".sroa.speculate.load.false");
1827 NumLoadsSpeculated += 2;
1828
1829 // Transfer alignment and AA info if present.
1830 TL->setAlignment(LI.getAlign());
1831 FL->setAlignment(LI.getAlign());
1832
1833 AAMDNodes Tags = LI.getAAMetadata();
1834 if (Tags) {
1835 TL->setAAMetadata(Tags);
1836 FL->setAAMetadata(Tags);
1837 }
1838
1839 Value *V = IRB.CreateSelect(C: SI.getCondition(), True: TL, False: FL,
1840 Name: LI.getName() + ".sroa.speculated",
1841 MDFrom: ProfcheckDisableMetadataFixes ? nullptr : &SI);
1842
1843 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1844 LI.replaceAllUsesWith(V);
1845}
1846
/// Rewrite a load or store of select \p SI into explicit control flow.
///
/// Splits the block at \p I on the select's condition. For each hand, the
/// memory operation is either speculated in the head block (when that hand
/// is marked speculatable in \p Spec) or predicated into the corresponding
/// then/else block. Loads are merged back together with a PHI in the tail
/// block, which then replaces all uses of \p I.
template <typename T>
static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
                                 SelectHandSpeculativity Spec,
                                 DomTreeUpdater &DTU) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
  LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
  BasicBlock *Head = I.getParent();
  Instruction *ThenTerm = nullptr;
  Instruction *ElseTerm = nullptr;
  if (Spec.areNoneSpeculatable())
    // Neither hand is speculatable: both operations must be predicated, so
    // we need distinct then and else blocks.
    SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
                                  SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU);
  else {
    // Exactly one hand is speculatable (both-speculatable cases are handled
    // by speculateSelectInstLoads): one conditional branch suffices, with the
    // speculatable hand's operation staying in the head block.
    SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
                              SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU,
                              /*LI=*/nullptr, /*ThenBlock=*/nullptr);
    if (Spec.isSpeculatable(/*isTrueVal=*/true))
      cast<CondBrInst>(Val: Head->getTerminator())->swapSuccessors();
  }
  auto *HeadBI = cast<CondBrInst>(Val: Head->getTerminator());
  Spec = {}; // Do not use `Spec` beyond this point.
  BasicBlock *Tail = I.getParent();
  Tail->setName(Head->getName() + ".cont");
  PHINode *PN;
  if (isa<LoadInst>(I))
    PN = PHINode::Create(Ty: I.getType(), NumReservedValues: 2, NameStr: "", InsertBefore: I.getIterator());
  // Clone the memory op once per hand, redirecting each clone at that hand's
  // pointer operand.
  for (BasicBlock *SuccBB : successors(BB: Head)) {
    bool IsThen = SuccBB == HeadBI->getSuccessor(i: 0);
    int SuccIdx = IsThen ? 0 : 1;
    auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
    auto &CondMemOp = cast<T>(*I.clone());
    if (NewMemOpBB != Head) {
      NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
      if (isa<LoadInst>(I))
        ++NumLoadsPredicated;
      else
        ++NumStoresPredicated;
    } else {
      // Speculated in the head block: UB-implying attributes/metadata are no
      // longer valid because the operation now executes unconditionally.
      CondMemOp.dropUBImplyingAttrsAndMetadata();
      ++NumLoadsSpeculated;
    }
    CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
    // Select operands: 0 = condition, 1 = true value, 2 = false value.
    Value *Ptr = SI.getOperand(i_nocapture: 1 + SuccIdx);
    CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
    if (isa<LoadInst>(I)) {
      CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
      PN->addIncoming(V: &CondMemOp, BB: NewMemOpBB);
    } else
      LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
  }
  if (isa<LoadInst>(I)) {
    PN->takeName(V: &I);
    LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
    I.replaceAllUsesWith(PN);
  }
}
1903
1904static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1905 SelectHandSpeculativity Spec,
1906 DomTreeUpdater &DTU) {
1907 if (auto *LI = dyn_cast<LoadInst>(Val: &I))
1908 rewriteMemOpOfSelect(SI&: SelInst, I&: *LI, Spec, DTU);
1909 else if (auto *SI = dyn_cast<StoreInst>(Val: &I))
1910 rewriteMemOpOfSelect(SI&: SelInst, I&: *SI, Spec, DTU);
1911 else
1912 llvm_unreachable_internal(msg: "Only for load and store.");
1913}
1914
/// Rewrite all collected memory operations on select \p SI.
///
/// Fully-speculatable loads become a select of two loads; everything else is
/// rewritten via CFG modification (which requires a non-null \p DTU). Erases
/// each handled mem op, any remaining bitcast users, and \p SI itself.
/// Returns true if the CFG was changed.
static bool rewriteSelectInstMemOps(SelectInst &SI,
                                    const RewriteableMemOps &Ops,
                                    IRBuilderTy &IRB, DomTreeUpdater *DTU) {
  bool CFGChanged = false;
  LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");

  for (const RewriteableMemOp &Op : Ops) {
    SelectHandSpeculativity Spec;
    Instruction *I;
    // Unpack the variant: an unspeculatable store carries no speculativity;
    // a load carries its per-hand speculativity in the pointer-int pair.
    if (auto *const *US = std::get_if<UnspeculatableStore>(ptr: &Op)) {
      I = *US;
    } else {
      auto PSL = std::get<PossiblySpeculatableLoad>(v: Op);
      I = PSL.getPointer();
      Spec = PSL.getInt();
    }
    if (Spec.areAllSpeculatable()) {
      speculateSelectInstLoads(SI, LI&: cast<LoadInst>(Val&: *I), IRB);
    } else {
      assert(DTU && "Should not get here when not allowed to modify the CFG!");
      rewriteMemOpOfSelect(SelInst&: SI, I&: *I, Spec, DTU&: *DTU);
      CFGChanged = true;
    }
    I->eraseFromParent();
  }

  // Any users left at this point are bitcasts whose single use was one of the
  // now-erased mem ops; drop them before erasing the select.
  for (User *U : make_early_inc_range(Range: SI.users()))
    cast<BitCastInst>(Val: U)->eraseFromParent();
  SI.eraseFromParent();
  return CFGChanged;
}
1946
1947/// Compute an adjusted pointer from Ptr by Offset bytes where the
1948/// resulting pointer has PointerTy.
1949static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1950 APInt Offset, Type *PointerTy,
1951 const Twine &NamePrefix) {
1952 if (Offset != 0)
1953 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, Offset: IRB.getInt(AI: Offset),
1954 Name: NamePrefix + "sroa_idx");
1955 return IRB.CreatePointerBitCastOrAddrSpaceCast(V: Ptr, DestTy: PointerTy,
1956 Name: NamePrefix + "sroa_cast");
1957}
1958
1959/// Compute the adjusted alignment for a load or store from an offset.
1960static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1961 return commonAlignment(A: getLoadStoreAlignment(I), Offset);
1962}
1963
/// Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
/// ensure that we only try to convert viable values. The strategy is that we
/// will peel off single element struct and array wrappings to get to an
/// underlying value, and convert that value.
///
/// \p VScale is the known exact multiple of vscale (0 when unknown); it is
/// required for any conversion between fixed and scalable vector types.
static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
                            unsigned VScale = 0) {
  if (OldTy == NewTy)
    return true;

  // For integer types, we can't handle any bit-width differences. This would
  // break both vector conversions with extension and introduce endianness
  // issues when in conjunction with loads and stores.
  if (isa<IntegerType>(Val: OldTy) && isa<IntegerType>(Val: NewTy)) {
    assert(cast<IntegerType>(OldTy)->getBitWidth() !=
               cast<IntegerType>(NewTy)->getBitWidth() &&
           "We can't have the same bitwidth for different int types");
    return false;
  }

  TypeSize NewSize = DL.getTypeSizeInBits(Ty: NewTy);
  TypeSize OldSize = DL.getTypeSizeInBits(Ty: OldTy);

  if ((isa<ScalableVectorType>(Val: NewTy) && isa<FixedVectorType>(Val: OldTy)) ||
      (isa<ScalableVectorType>(Val: OldTy) && isa<FixedVectorType>(Val: NewTy))) {
    // Conversion is only possible when the size of scalable vectors is known.
    if (!VScale)
      return false;

    // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
    // a single domain (either fixed or scalable). Any additional conversion
    // between fixed and scalable types is handled through integer types.
    auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
    auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;

    if (isa<ScalableVectorType>(Val: NewTy)) {
      if (!VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: NewVTy), EltTy: OldVTy))
        return false;

      // Scale the scalable side by the known vscale so both sizes are fixed
      // and directly comparable below.
      NewSize = TypeSize::getFixed(ExactSize: NewSize.getKnownMinValue() * VScale);
    } else {
      if (!VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: OldVTy), EltTy: NewVTy))
        return false;

      OldSize = TypeSize::getFixed(ExactSize: OldSize.getKnownMinValue() * VScale);
    }
  }

  if (NewSize != OldSize)
    return false;
  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
    return false;

  // We can convert pointers to integers and vice-versa. Same for vectors
  // of pointers and integers.
  OldTy = OldTy->getScalarType();
  NewTy = NewTy->getScalarType();
  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
    if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
      unsigned OldAS = OldTy->getPointerAddressSpace();
      unsigned NewAS = NewTy->getPointerAddressSpace();
      // Convert pointers if they are pointers from the same address space or
      // different integral (not non-integral) address spaces with the same
      // pointer size.
      return OldAS == NewAS ||
             (!DL.isNonIntegralAddressSpace(AddrSpace: OldAS) &&
              !DL.isNonIntegralAddressSpace(AddrSpace: NewAS) &&
              DL.getPointerSize(AS: OldAS) == DL.getPointerSize(AS: NewAS));
    }

    // We can convert integers to integral pointers, but not to non-integral
    // pointers.
    if (OldTy->isIntegerTy())
      return !DL.isNonIntegralPointerType(Ty: NewTy);

    // We can convert integral pointers to integers, but non-integral pointers
    // need to remain pointers.
    if (!DL.isNonIntegralPointerType(Ty: OldTy))
      return NewTy->isIntegerTy();

    return false;
  }

  // Target extension types carry target-specific semantics that a plain
  // bitcast cannot preserve.
  if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
    return false;

  return true;
}
2053
/// Test whether the given slice use can be promoted to a vector.
///
/// This function is called to test each entry in a partition which is slated
/// for a single slice.
///
/// \p ElementSize is the size in bytes of one element of \p Ty; \p VScale is
/// the known vscale multiple (0 when unknown), forwarded to canConvertValue.
static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
                                            VectorType *Ty,
                                            uint64_t ElementSize,
                                            const DataLayout &DL,
                                            unsigned VScale) {
  // First validate the slice offsets: both ends must fall exactly on element
  // boundaries of the candidate vector type and stay within its bounds.
  uint64_t BeginOffset =
      std::max(a: S.beginOffset(), b: P.beginOffset()) - P.beginOffset();
  uint64_t BeginIndex = BeginOffset / ElementSize;
  if (BeginIndex * ElementSize != BeginOffset ||
      BeginIndex >= cast<FixedVectorType>(Val: Ty)->getNumElements())
    return false;
  uint64_t EndOffset = std::min(a: S.endOffset(), b: P.endOffset()) - P.beginOffset();
  uint64_t EndIndex = EndOffset / ElementSize;
  if (EndIndex * ElementSize != EndOffset ||
      EndIndex > cast<FixedVectorType>(Val: Ty)->getNumElements())
    return false;

  assert(EndIndex > BeginIndex && "Empty vector!");
  uint64_t NumElements = EndIndex - BeginIndex;
  Type *SliceTy = (NumElements == 1)
                      ? Ty->getElementType()
                      : FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElements);

  // Integer type used for loads/stores that were split and so only cover a
  // sub-range of the partition.
  Type *SplitIntTy =
      Type::getIntNTy(C&: Ty->getContext(), N: NumElements * ElementSize * 8);

  Use *U = S.getUse();

  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
    if (MI->isVolatile())
      return false;
    if (!S.isSplittable())
      return false; // Skip any unsplittable intrinsics.
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
    // Lifetime markers and droppable intrinsics never block promotion.
    if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
      return false;
  } else if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
    if (LI->isVolatile())
      return false;
    Type *LTy = LI->getType();
    // Disable vector promotion when there are loads or stores of an FCA.
    if (LTy->isStructTy())
      return false;
    if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
      assert(LTy->isIntegerTy());
      LTy = SplitIntTy;
    }
    if (!canConvertValue(DL, OldTy: SliceTy, NewTy: LTy, VScale))
      return false;
  } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
    if (SI->isVolatile())
      return false;
    Type *STy = SI->getValueOperand()->getType();
    // Disable vector promotion when there are loads or stores of an FCA.
    if (STy->isStructTy())
      return false;
    if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
      assert(STy->isIntegerTy());
      STy = SplitIntTy;
    }
    if (!canConvertValue(DL, OldTy: STy, NewTy: SliceTy, VScale))
      return false;
  } else {
    // Any other kind of user blocks vector promotion.
    return false;
  }

  return true;
}
2127
/// Test whether any vector type in \p CandidateTys is viable for promotion.
///
/// This implements the necessary checking for \c isVectorPromotionViable over
/// all slices of the alloca for the given VectorType. Returns the first
/// viable candidate (after ranking/deduplication) or nullptr.
static VectorType *
checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
                             SmallVectorImpl<VectorType *> &CandidateTys,
                             bool HaveCommonEltTy, Type *CommonEltTy,
                             bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
                             VectorType *CommonVecPtrTy, unsigned VScale) {
  // If we didn't find a vector type, nothing to do here.
  if (CandidateTys.empty())
    return nullptr;

  // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
  // then we should choose it, not some other alternative.
  // But, we can't perform a no-op pointer address space change via bitcast,
  // so if we didn't have a common pointer element type, bail.
  if (HaveVecPtrTy && !HaveCommonVecPtrTy)
    return nullptr;

  // Try to pick the "best" element type out of the choices.
  if (!HaveCommonEltTy && HaveVecPtrTy) {
    // If there was a pointer element type, there's really only one choice.
    CandidateTys.clear();
    CandidateTys.push_back(Elt: CommonVecPtrTy);
  } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
    // Integer-ify vector types.
    for (VectorType *&VTy : CandidateTys) {
      if (!VTy->getElementType()->isIntegerTy())
        VTy = cast<VectorType>(Val: VTy->getWithNewType(EltTy: IntegerType::getIntNTy(
            C&: VTy->getContext(), N: VTy->getScalarSizeInBits())));
    }

    // Rank the remaining candidate vector types. This is easy because we know
    // they're all integer vectors. We sort by ascending number of elements.
    auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
      (void)DL;
      assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
                 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
             "Cannot have vector types of different sizes!");
      assert(RHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      assert(LHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      return cast<FixedVectorType>(Val: RHSTy)->getNumElements() <
             cast<FixedVectorType>(Val: LHSTy)->getNumElements();
    };
    auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
      (void)DL;
      assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
                 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
             "Cannot have vector types of different sizes!");
      assert(RHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      assert(LHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      return cast<FixedVectorType>(Val: RHSTy)->getNumElements() ==
             cast<FixedVectorType>(Val: LHSTy)->getNumElements();
    };
    // Sort-then-unique removes duplicate element counts so each candidate is
    // tried at most once below.
    llvm::sort(C&: CandidateTys, Comp: RankVectorTypesComp);
    CandidateTys.erase(CS: llvm::unique(R&: CandidateTys, P: RankVectorTypesEq),
                       CE: CandidateTys.end());
  } else {
// The only way to have the same element type in every vector type is to
// have the same vector type. Check that and remove all but one.
#ifndef NDEBUG
    for (VectorType *VTy : CandidateTys) {
      assert(VTy->getElementType() == CommonEltTy &&
             "Unaccounted for element type!");
      assert(VTy == CandidateTys[0] &&
             "Different vector types with the same element type!");
    }
#endif
    CandidateTys.resize(N: 1);
  }

  // FIXME: hack. Do we have a named constant for this?
  // SDAG SDNode can't have more than 65535 operands.
  llvm::erase_if(C&: CandidateTys, P: [](VectorType *VTy) {
    return cast<FixedVectorType>(Val: VTy)->getNumElements() >
           std::numeric_limits<unsigned short>::max();
  });

  // Find a vector type viable for promotion by iterating over all slices.
  auto *VTy = llvm::find_if(Range&: CandidateTys, P: [&](VectorType *VTy) -> bool {
    uint64_t ElementSize =
        DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();

    // While the definition of LLVM vectors is bitpacked, we don't support sizes
    // that aren't byte sized.
    if (ElementSize % 8)
      return false;
    assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
           "vector size not a multiple of element size?");
    ElementSize /= 8;

    for (const Slice &S : P)
      if (!isVectorPromotionViableForSlice(P, S, Ty: VTy, ElementSize, DL, VScale))
        return false;

    for (const Slice *S : P.splitSliceTails())
      if (!isVectorPromotionViableForSlice(P, S: *S, Ty: VTy, ElementSize, DL, VScale))
        return false;

    return true;
  });
  return VTy != CandidateTys.end() ? *VTy : nullptr;
}
2237
/// Synthesize additional candidate vector types and check for a promotable
/// one.
///
/// For each type in \p OtherTys that is a valid vector element, create new
/// candidate vector types (via \p CheckCandidateType) whose total size
/// matches the existing candidates, then defer to
/// checkVectorTypesForPromotion. \p CandidateTysCopy is a snapshot of the
/// candidates that must stay invariant while \p CheckCandidateType may
/// append to the live CandidateTys list.
static VectorType *createAndCheckVectorTypesForPromotion(
    SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
    function_ref<void(Type *)> CheckCandidateType, Partition &P,
    const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
    bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
    bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
  [[maybe_unused]] VectorType *OriginalElt =
      CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
  // Consider additional vector types where the element type size is a
  // multiple of load/store element size.
  for (Type *Ty : OtherTys) {
    if (!VectorType::isValidElementType(ElemTy: Ty))
      continue;
    unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
    // Make a copy of CandidateTys and iterate through it, because we
    // might append to CandidateTys in the loop.
    for (VectorType *const VTy : CandidateTysCopy) {
      // The elements in the copy should remain invariant throughout the loop
      assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
      unsigned VectorSize = DL.getTypeSizeInBits(Ty: VTy).getFixedValue();
      unsigned ElementSize =
          DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
      if (TypeSize != VectorSize && TypeSize != ElementSize &&
          VectorSize % TypeSize == 0) {
        VectorType *NewVTy = VectorType::get(ElementType: Ty, NumElements: VectorSize / TypeSize, Scalable: false);
        CheckCandidateType(NewVTy);
      }
    }
  }

  return checkVectorTypesForPromotion(
      P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
      HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
}
2272
/// Test whether the given alloca partitioning and range of slices can be
/// promoted to a vector.
///
/// This is a quick test to check whether we can rewrite a particular alloca
/// partition (and its newly formed alloca) into a vector alloca with only
/// whole-vector loads and stores such that it could be promoted to a vector
/// SSA value. We only can ensure this for a limited set of operations, and we
/// don't want to do the rewrites unless we are confident that the result will
/// be promotable, so we have an early test here.
static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
                                           unsigned VScale) {
  // Collect the candidate types for vector-based promotion. Also track whether
  // we have different element types.
  SmallVector<VectorType *, 4> CandidateTys;
  SetVector<Type *> LoadStoreTys;
  SetVector<Type *> DeferredTys;
  Type *CommonEltTy = nullptr;
  VectorType *CommonVecPtrTy = nullptr;
  bool HaveVecPtrTy = false;
  bool HaveCommonEltTy = true;
  bool HaveCommonVecPtrTy = true;
  // Records a fixed vector type as a candidate, maintaining the common
  // element-type and common vector-of-pointer-type bookkeeping.
  auto CheckCandidateType = [&](Type *Ty) {
    if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
      // Return if bitcast to vectors is different for total size in bits.
      if (!CandidateTys.empty()) {
        VectorType *V = CandidateTys[0];
        if (DL.getTypeSizeInBits(Ty: VTy).getFixedValue() !=
            DL.getTypeSizeInBits(Ty: V).getFixedValue()) {
          CandidateTys.clear();
          return;
        }
      }
      CandidateTys.push_back(Elt: VTy);
      Type *EltTy = VTy->getElementType();

      if (!CommonEltTy)
        CommonEltTy = EltTy;
      else if (CommonEltTy != EltTy)
        HaveCommonEltTy = false;

      if (EltTy->isPointerTy()) {
        HaveVecPtrTy = true;
        if (!CommonVecPtrTy)
          CommonVecPtrTy = VTy;
        else if (CommonVecPtrTy != VTy)
          HaveCommonVecPtrTy = false;
      }
    }
  };

  // Put load and store types into a set for de-duplication.
  for (const Slice &S : P) {
    Type *Ty;
    if (auto *LI = dyn_cast<LoadInst>(Val: S.getUse()->getUser()))
      Ty = LI->getType();
    else if (auto *SI = dyn_cast<StoreInst>(Val: S.getUse()->getUser()))
      Ty = SI->getValueOperand()->getType();
    else
      continue;

    // Pointer-typed accesses that do not span the whole partition are
    // deferred to a second pass below.
    auto CandTy = Ty->getScalarType();
    if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
                                  S.endOffset() != P.endOffset())) {
      DeferredTys.insert(X: Ty);
      continue;
    }

    LoadStoreTys.insert(X: Ty);
    // Consider any loads or stores that are the exact size of the slice.
    if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
      CheckCandidateType(Ty);
  }

  // First try the types seen at loads/stores; fall back to the deferred
  // (partial pointer-access) types only if that fails.
  SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
  if (auto *VTy = createAndCheckVectorTypesForPromotion(
          OtherTys&: LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
          CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
          HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
    return VTy;

  CandidateTys.clear();
  return createAndCheckVectorTypesForPromotion(
      OtherTys&: DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
      HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
      CommonVecPtrTy, VScale);
}
2359
/// Test whether a slice of an alloca is valid for integer widening.
///
/// This implements the necessary checking for the \c isIntegerWideningViable
/// test below on a single slice of the alloca.
///
/// \param S the slice being checked.
/// \param AllocBeginOffset the begin offset of the partition; the slice's
///        offsets are rebased against it to compute partition-relative
///        offsets.
/// \param AllocaTy the type the widened alloca would have.
/// \param DL the data layout used for all size queries.
/// \param WholeAllocaOp out-parameter set to true when this slice is a
///        non-vector load or store covering the entire alloca; the caller
///        requires at least one such slice before it will widen.
static bool isIntegerWideningViableForSlice(const Slice &S,
                                            uint64_t AllocBeginOffset,
                                            Type *AllocaTy,
                                            const DataLayout &DL,
                                            bool &WholeAllocaOp) {
  uint64_t Size = DL.getTypeStoreSize(Ty: AllocaTy).getFixedValue();

  // Offsets of this slice relative to the start of the partition.
  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;

  Use *U = S.getUse();

  // Lifetime intrinsics operate over the whole alloca whose sizes are usually
  // larger than other load/store slices (RelEnd > Size). But lifetime are
  // always promotable and should not impact other slices' promotability of the
  // partition.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
    if (II->isLifetimeStartOrEnd() || II->isDroppable())
      return true;
  }

  // We can't reasonably handle cases where the load or store extends past
  // the end of the alloca's type and into its padding.
  if (RelEnd > Size)
    return false;

  if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
    if (LI->isVolatile())
      return false;
    // We can't handle loads that extend past the allocated memory.
    TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
    if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
      return false;
    // So far, AllocaSliceRewriter does not support widening split slice tails
    // in rewriteIntegerLoad.
    if (S.beginOffset() < AllocBeginOffset)
      return false;
    // Note that we don't count vector loads or stores as whole-alloca
    // operations which enable integer widening because we would prefer to use
    // vector widening instead.
    if (!isa<VectorType>(Val: LI->getType()) && RelBegin == 0 && RelEnd == Size)
      WholeAllocaOp = true;
    if (IntegerType *ITy = dyn_cast<IntegerType>(Val: LI->getType())) {
      // Reject integer loads whose declared bit width is smaller than their
      // store size, i.e. non-byte-multiple integers carrying padding bits.
      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
        return false;
    } else if (RelBegin != 0 || RelEnd != Size ||
               !canConvertValue(DL, OldTy: AllocaTy, NewTy: LI->getType())) {
      // Non-integer loads need to be convertible from the alloca type so that
      // they are promotable.
      return false;
    }
  } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
    Type *ValueTy = SI->getValueOperand()->getType();
    if (SI->isVolatile())
      return false;
    // We can't handle stores that extend past the allocated memory.
    TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValueTy);
    if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
      return false;
    // So far, AllocaSliceRewriter does not support widening split slice tails
    // in rewriteIntegerStore.
    if (S.beginOffset() < AllocBeginOffset)
      return false;
    // Note that we don't count vector loads or stores as whole-alloca
    // operations which enable integer widening because we would prefer to use
    // vector widening instead.
    if (!isa<VectorType>(Val: ValueTy) && RelBegin == 0 && RelEnd == Size)
      WholeAllocaOp = true;
    if (IntegerType *ITy = dyn_cast<IntegerType>(Val: ValueTy)) {
      // Same padding-bit restriction as for integer loads above.
      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
        return false;
    } else if (RelBegin != 0 || RelEnd != Size ||
               !canConvertValue(DL, OldTy: ValueTy, NewTy: AllocaTy)) {
      // Non-integer stores need to be convertible to the alloca type so that
      // they are promotable.
      return false;
    }
  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
    // Only constant-length, non-volatile memory intrinsics are supported.
    if (MI->isVolatile() || !isa<Constant>(Val: MI->getLength()))
      return false;
    if (!S.isSplittable())
      return false; // Skip any unsplittable intrinsics.
  } else {
    // Any other kind of user blocks integer widening for this partition.
    return false;
  }

  return true;
}
2452
2453/// Test whether the given alloca partition's integer operations can be
2454/// widened to promotable ones.
2455///
2456/// This is a quick test to check whether we can rewrite the integer loads and
2457/// stores to a particular alloca into wider loads and stores and be able to
2458/// promote the resulting alloca.
2459static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2460 const DataLayout &DL) {
2461 uint64_t SizeInBits = DL.getTypeSizeInBits(Ty: AllocaTy).getFixedValue();
2462 // Don't create integer types larger than the maximum bitwidth.
2463 if (SizeInBits > IntegerType::MAX_INT_BITS)
2464 return false;
2465
2466 // Don't try to handle allocas with bit-padding.
2467 if (SizeInBits != DL.getTypeStoreSizeInBits(Ty: AllocaTy).getFixedValue())
2468 return false;
2469
2470 // We need to ensure that an integer type with the appropriate bitwidth can
2471 // be converted to the alloca type, whatever that is. We don't want to force
2472 // the alloca itself to have an integer type if there is a more suitable one.
2473 Type *IntTy = Type::getIntNTy(C&: AllocaTy->getContext(), N: SizeInBits);
2474 if (!canConvertValue(DL, OldTy: AllocaTy, NewTy: IntTy) ||
2475 !canConvertValue(DL, OldTy: IntTy, NewTy: AllocaTy))
2476 return false;
2477
2478 // While examining uses, we ensure that the alloca has a covering load or
2479 // store. We don't want to widen the integer operations only to fail to
2480 // promote due to some other unsplittable entry (which we may make splittable
2481 // later). However, if there are only splittable uses, go ahead and assume
2482 // that we cover the alloca.
2483 // FIXME: We shouldn't consider split slices that happen to start in the
2484 // partition here...
2485 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(Width: SizeInBits);
2486
2487 for (const Slice &S : P)
2488 if (!isIntegerWideningViableForSlice(S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2489 WholeAllocaOp))
2490 return false;
2491
2492 for (const Slice *S : P.splitSliceTails())
2493 if (!isIntegerWideningViableForSlice(S: *S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2494 WholeAllocaOp))
2495 return false;
2496
2497 return WholeAllocaOp;
2498}
2499
2500static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2501 IntegerType *Ty, uint64_t Offset,
2502 const Twine &Name) {
2503 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2504 IntegerType *IntTy = cast<IntegerType>(Val: V->getType());
2505 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2506 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2507 "Element extends past full value");
2508 uint64_t ShAmt = 8 * Offset;
2509 if (DL.isBigEndian())
2510 ShAmt = 8 * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2511 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2512 if (ShAmt) {
2513 V = IRB.CreateLShr(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2514 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2515 }
2516 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2517 "Cannot extract to a larger integer!");
2518 if (Ty != IntTy) {
2519 V = IRB.CreateTrunc(V, DestTy: Ty, Name: Name + ".trunc");
2520 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2521 }
2522 return V;
2523}
2524
2525static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2526 Value *V, uint64_t Offset, const Twine &Name) {
2527 IntegerType *IntTy = cast<IntegerType>(Val: Old->getType());
2528 IntegerType *Ty = cast<IntegerType>(Val: V->getType());
2529 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2530 "Cannot insert a larger integer!");
2531 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2532 if (Ty != IntTy) {
2533 V = IRB.CreateZExt(V, DestTy: IntTy, Name: Name + ".ext");
2534 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2535 }
2536 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2537 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2538 "Element store outside of alloca store");
2539 uint64_t ShAmt = 8 * Offset;
2540 if (DL.isBigEndian())
2541 ShAmt = 8 * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2542 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2543 if (ShAmt) {
2544 V = IRB.CreateShl(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2545 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2546 }
2547
2548 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2549 APInt Mask = ~Ty->getMask().zext(width: IntTy->getBitWidth()).shl(shiftAmt: ShAmt);
2550 Old = IRB.CreateAnd(LHS: Old, RHS: Mask, Name: Name + ".mask");
2551 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2552 V = IRB.CreateOr(LHS: Old, RHS: V, Name: Name + ".insert");
2553 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2554 }
2555 return V;
2556}
2557
2558static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2559 unsigned EndIndex, const Twine &Name) {
2560 auto *VecTy = cast<FixedVectorType>(Val: V->getType());
2561 unsigned NumElements = EndIndex - BeginIndex;
2562 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2563
2564 if (NumElements == VecTy->getNumElements())
2565 return V;
2566
2567 if (NumElements == 1) {
2568 V = IRB.CreateExtractElement(Vec: V, Idx: IRB.getInt32(C: BeginIndex),
2569 Name: Name + ".extract");
2570 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2571 return V;
2572 }
2573
2574 auto Mask = llvm::to_vector<8>(Range: llvm::seq<int>(Begin: BeginIndex, End: EndIndex));
2575 V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".extract");
2576 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2577 return V;
2578}
2579
2580static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2581 unsigned BeginIndex, const Twine &Name) {
2582 VectorType *VecTy = cast<VectorType>(Val: Old->getType());
2583 assert(VecTy && "Can only insert a vector into a vector");
2584
2585 VectorType *Ty = dyn_cast<VectorType>(Val: V->getType());
2586 if (!Ty) {
2587 // Single element to insert.
2588 V = IRB.CreateInsertElement(Vec: Old, NewElt: V, Idx: IRB.getInt32(C: BeginIndex),
2589 Name: Name + ".insert");
2590 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2591 return V;
2592 }
2593
2594 unsigned NumSubElements = cast<FixedVectorType>(Val: Ty)->getNumElements();
2595 unsigned NumElements = cast<FixedVectorType>(Val: VecTy)->getNumElements();
2596
2597 assert(NumSubElements <= NumElements && "Too many elements!");
2598 if (NumSubElements == NumElements) {
2599 assert(V->getType() == VecTy && "Vector type mismatch");
2600 return V;
2601 }
2602 unsigned EndIndex = BeginIndex + NumSubElements;
2603
2604 // When inserting a smaller vector into the larger to store, we first
2605 // use a shuffle vector to widen it with undef elements, and then
2606 // a second shuffle vector to select between the loaded vector and the
2607 // incoming vector.
2608 SmallVector<int, 8> Mask;
2609 Mask.reserve(N: NumElements);
2610 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2611 if (Idx >= BeginIndex && Idx < EndIndex)
2612 Mask.push_back(Elt: Idx - BeginIndex);
2613 else
2614 Mask.push_back(Elt: -1);
2615 V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".expand");
2616 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2617
2618 Mask.clear();
2619 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2620 if (Idx >= BeginIndex && Idx < EndIndex)
2621 Mask.push_back(Elt: Idx);
2622 else
2623 Mask.push_back(Elt: Idx + NumElements);
2624 V = IRB.CreateShuffleVector(V1: V, V2: Old, Mask, Name: Name + "blend");
2625 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2626 return V;
2627}
2628
2629/// This function takes two vector values and combines them into a single vector
2630/// by concatenating their elements. The function handles:
2631///
2632/// 1. Element type mismatch: If either vector's element type differs from
2633/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while
2634/// preserving the total bit width (adjusting the number of elements
2635/// accordingly).
2636///
2637/// 2. Size mismatch: After transforming the vectors to have the desired element
2638/// type, if the two vectors have different numbers of elements, the smaller
2639/// vector is extended with poison values to match the size of the larger
2640/// vector before concatenation.
2641///
2642/// 3. Concatenation: The vectors are merged using a shuffle operation that
2643/// places all elements of V0 first, followed by all elements of V1.
2644///
2645/// \param V0 The first vector to merge (must be a vector type)
2646/// \param V1 The second vector to merge (must be a vector type)
2647/// \param DL The data layout for size calculations
2648/// \param NewAIEltTy The desired element type for the result vector
2649/// \param Builder IRBuilder for creating new instructions
2650/// \return A new vector containing all elements from V0 followed by all
2651/// elements from V1
2652static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL,
2653 Type *NewAIEltTy, IRBuilder<> &Builder) {
2654 // V0 and V1 are vectors
2655 // Create a new vector type with combined elements
2656 // Use ShuffleVector to concatenate the vectors
2657 auto *VecType0 = cast<FixedVectorType>(Val: V0->getType());
2658 auto *VecType1 = cast<FixedVectorType>(Val: V1->getType());
2659
2660 // If V0/V1 element types are different from NewAllocaElementType,
2661 // we need to introduce bitcasts before merging them
2662 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2663 const char *DebugName) {
2664 Type *EltType = VecType->getElementType();
2665 if (EltType != NewAIEltTy) {
2666 // Calculate new number of elements to maintain same bit width
2667 unsigned TotalBits =
2668 VecType->getNumElements() * DL.getTypeSizeInBits(Ty: EltType);
2669 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(Ty: NewAIEltTy);
2670
2671 auto *NewVecType = FixedVectorType::get(ElementType: NewAIEltTy, NumElts: NewNumElts);
2672 V = Builder.CreateBitCast(V, DestTy: NewVecType);
2673 VecType = NewVecType;
2674 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2675 }
2676 };
2677
2678 BitcastIfNeeded(V0, VecType0, "V0");
2679 BitcastIfNeeded(V1, VecType1, "V1");
2680
2681 unsigned NumElts0 = VecType0->getNumElements();
2682 unsigned NumElts1 = VecType1->getNumElements();
2683
2684 SmallVector<int, 16> ShuffleMask;
2685
2686 if (NumElts0 == NumElts1) {
2687 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2688 ShuffleMask.push_back(Elt: i);
2689 } else {
2690 // If two vectors have different sizes, we need to extend
2691 // the smaller vector to the size of the larger vector.
2692 unsigned SmallSize = std::min(a: NumElts0, b: NumElts1);
2693 unsigned LargeSize = std::max(a: NumElts0, b: NumElts1);
2694 bool IsV0Smaller = NumElts0 < NumElts1;
2695 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2696 SmallVector<int, 16> ExtendMask;
2697 for (unsigned i = 0; i < SmallSize; ++i)
2698 ExtendMask.push_back(Elt: i);
2699 for (unsigned i = SmallSize; i < LargeSize; ++i)
2700 ExtendMask.push_back(Elt: PoisonMaskElem);
2701 ExtendedVec = Builder.CreateShuffleVector(
2702 V1: ExtendedVec, V2: PoisonValue::get(T: ExtendedVec->getType()), Mask: ExtendMask);
2703 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2704 for (unsigned i = 0; i < NumElts0; ++i)
2705 ShuffleMask.push_back(Elt: i);
2706 for (unsigned i = 0; i < NumElts1; ++i)
2707 ShuffleMask.push_back(Elt: LargeSize + i);
2708 }
2709
2710 return Builder.CreateShuffleVector(V1: V0, V2: V1, Mask: ShuffleMask);
2711}
2712
2713namespace {
2714
2715/// Visitor to rewrite instructions using p particular slice of an alloca
2716/// to use a new alloca.
2717///
2718/// Also implements the rewriting to vector-based accesses when the partition
2719/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2720/// lives here.
2721class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2722 // Befriend the base class so it can delegate to private visit methods.
2723 friend class InstVisitor<AllocaSliceRewriter, bool>;
2724
2725 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2726
  const DataLayout &DL;
  AllocaSlices &AS;
  SROA &Pass;
  // The alloca being rewritten and the new alloca replacing this partition.
  AllocaInst &OldAI, &NewAI;
  // Byte range of the original alloca that NewAI covers.
  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
  // The type of the new alloca.
  Type *NewAllocaTy;

  // This is a convenience and flag variable that will be null unless the new
  // alloca's integer operations should be widened to this integer type due to
  // passing isIntegerWideningViable above. If it is non-null, the desired
  // integer type will be stored here for easy access during rewriting.
  IntegerType *IntTy;

  // If we are rewriting an alloca partition which can be written as pure
  // vector operations, we stash extra information here. When VecTy is
  // non-null, we have some strict guarantees about the rewritten alloca:
  // - The new alloca is exactly the size of the vector type here.
  // - The accesses all either map to the entire vector or to a single
  // element.
  // - The set of accessing instructions is only one of those handled above
  // in isVectorPromotionViable. Generally these are the same access kinds
  // which are promotable via mem2reg.
  VectorType *VecTy;
  // Element type of VecTy, and its size in bytes, when VecTy is non-null.
  Type *ElementTy;
  uint64_t ElementSize;

  // The original offset of the slice currently being rewritten relative to
  // the original alloca.
  uint64_t BeginOffset = 0;
  uint64_t EndOffset = 0;

  // The new offsets of the slice currently being rewritten relative to the
  // original alloca.
  uint64_t NewBeginOffset = 0, NewEndOffset = 0;

  // Size in bytes of the slice's intersection with the new alloca.
  uint64_t SliceSize = 0;
  // Whether the current slice may be split, and whether it actually extends
  // beyond the partition being rewritten.
  bool IsSplittable = false;
  bool IsSplit = false;
  // The use and the pointer-producing instruction of the slice currently
  // being rewritten.
  Use *OldUse = nullptr;
  Instruction *OldPtr = nullptr;

  // Track post-rewrite users which are PHI nodes and Selects.
  SmallSetVector<PHINode *, 8> &PHIUsers;
  SmallSetVector<SelectInst *, 8> &SelectUsers;

  // Utility IR builder, whose name prefix is setup for each visited use, and
  // the insertion point is set to point to the user.
  IRBuilderTy IRB;
2775
2776 // Return the new alloca, addrspacecasted if required to avoid changing the
2777 // addrspace of a volatile access.
2778 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2779 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2780 return &NewAI;
2781
2782 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2783 return IRB.CreateAddrSpaceCast(V: &NewAI, DestTy: AccessTy);
2784 }
2785
2786public:
  /// Construct a rewriter for the partition of \p OldAI that is being
  /// rewritten onto the pre-created \p NewAI, which covers the byte range
  /// [\p NewAllocaBeginOffset, \p NewAllocaEndOffset) of the original alloca.
  ///
  /// At most one of \p IsIntegerPromotable and \p PromotableVecTy may be set;
  /// they select integer widening or vector promotion respectively.
  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
                      AllocaInst &OldAI, AllocaInst &NewAI, Type *NewAllocaTy,
                      uint64_t NewAllocaBeginOffset,
                      uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
                      VectorType *PromotableVecTy,
                      SmallSetVector<PHINode *, 8> &PHIUsers,
                      SmallSetVector<SelectInst *, 8> &SelectUsers)
      : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
        NewAllocaBeginOffset(NewAllocaBeginOffset),
        NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAllocaTy),
        IntTy(IsIntegerPromotable
                  ? Type::getIntNTy(
                        C&: NewAI.getContext(),
                        N: DL.getTypeSizeInBits(Ty: NewAllocaTy).getFixedValue())
                  : nullptr),
        VecTy(PromotableVecTy),
        ElementTy(VecTy ? VecTy->getElementType() : nullptr),
        ElementSize(VecTy ? DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / 8
                          : 0),
        PHIUsers(PHIUsers), SelectUsers(SelectUsers),
        IRB(NewAI.getContext(), ConstantFolder()) {
    if (VecTy) {
      assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
             "Only multiple-of-8 sized vector elements are viable");
      ++NumVectorized;
    }
    // Integer widening and vector promotion are mutually exclusive.
    assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
  }
2815
  /// Rewrite the single slice of the alloca designated by \p I.
  ///
  /// Sets up the per-slice offset state and the IR builder, then dispatches
  /// to the visitor method for the using instruction. Returns true when the
  /// use was rewritten into a promotable form.
  bool visit(AllocaSlices::const_iterator I) {
    bool CanSROA = true;
    BeginOffset = I->beginOffset();
    EndOffset = I->endOffset();
    IsSplittable = I->isSplittable();
    // A slice is "split" when it extends beyond the partition being rewritten
    // on either side.
    IsSplit =
        BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
    LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
    LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
    LLVM_DEBUG(dbgs() << "\n");

    // Compute the intersecting offset range.
    assert(BeginOffset < NewAllocaEndOffset);
    assert(EndOffset > NewAllocaBeginOffset);
    NewBeginOffset = std::max(a: BeginOffset, b: NewAllocaBeginOffset);
    NewEndOffset = std::min(a: EndOffset, b: NewAllocaEndOffset);

    SliceSize = NewEndOffset - NewBeginOffset;
    LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
                      << ") NewBegin:(" << NewBeginOffset << ", "
                      << NewEndOffset << ") NewAllocaBegin:("
                      << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
                      << ")\n");
    assert(IsSplit || NewBeginOffset == BeginOffset);
    OldUse = I->getUse();
    OldPtr = cast<Instruction>(Val: OldUse->get());

    // Point the builder at the user being rewritten, inheriting its debug
    // location.
    Instruction *OldUserI = cast<Instruction>(Val: OldUse->getUser());
    IRB.SetInsertPoint(OldUserI);
    IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
    // Avoid materializing the name prefix when it is discarded anyway.
    if (!IRB.getContext().shouldDiscardValueNames())
      IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
                                      Twine(BeginOffset) + ".");

    CanSROA &= visit(I: cast<Instruction>(Val: OldUse->getUser()));
    // Vector and integer promotion rewrites must always succeed.
    if (VecTy || IntTy)
      assert(CanSROA);
    return CanSROA;
  }
2856
2857 /// Attempts to rewrite a partition using tree-structured merge optimization.
2858 ///
2859 /// This function analyzes a partition to determine if it can be optimized
2860 /// using a tree-structured merge pattern, where multiple non-overlapping
2861 /// stores completely fill an alloca. And there is no load from the alloca in
2862 /// the middle of the stores. Such patterns can be optimized by eliminating
2863 /// the intermediate stores and directly constructing the final vector by
2864 /// using shufflevectors.
2865 ///
2866 /// Example transformation:
2867 /// Before: (stores do not have to be in order)
2868 /// %alloca = alloca <8 x float>
2869 /// store <2 x float> %val0, ptr %alloca ; offset 0-1
2870 /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2871 /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2872 /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2873 ///
2874 /// After:
2875 /// %alloca = alloca <8 x float>
2876 /// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
2877 /// i32 3>
2878 /// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
2879 /// i32 3>
2880 /// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
2881 /// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2882 /// store %shuffle2, ptr %alloca
2883 ///
2884 /// The optimization looks for partitions that:
2885 /// 1. Have no overlapping split slice tails
2886 /// 2. Contain non-overlapping stores that cover the entire alloca
2887 /// 3. Have exactly one load that reads the complete alloca structure and not
2888 /// in the middle of the stores (TODO: maybe we can relax the constraint
2889 /// about reading the entire alloca structure)
2890 ///
2891 /// \param P The partition to analyze and potentially rewrite
2892 /// \return An optional vector of values that were deleted during the rewrite
2893 /// process, or std::nullopt if the partition cannot be optimized
2894 /// using tree-structured merge
  std::optional<SmallVector<Value *, 4>>
  rewriteTreeStructuredMerge(Partition &P) {
    // No tail slices that overlap with the partition
    if (P.splitSliceTails().size() > 0)
      return std::nullopt;

    SmallVector<Value *, 4> DeletedValues;
    LoadInst *TheLoad = nullptr;

    // Structure to hold store information
    struct StoreInfo {
      StoreInst *Store;     // The store instruction itself.
      uint64_t BeginOffset; // Slice begin offset within the alloca.
      uint64_t EndOffset;   // Slice end offset within the alloca.
      Value *StoredValue;   // The store's value operand.
      StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
          : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
    };

    SmallVector<StoreInfo, 4> StoreInfos;

    // If the new alloca is a fixed vector type, we use its element type as the
    // allocated element type, otherwise we use i8 as the allocated element
    Type *AllocatedEltTy =
        isa<FixedVectorType>(Val: NewAllocaTy)
            ? cast<FixedVectorType>(Val: NewAllocaTy)->getElementType()
            : Type::getInt8Ty(C&: NewAI.getContext());
    unsigned AllocatedEltTySize = DL.getTypeSizeInBits(Ty: AllocatedEltTy);

    // Helper to check if a type is
    // 1. A fixed vector type
    // 2. The element type is not a pointer
    // 3. The element type size is byte-aligned
    // We only handle the cases that the ld/st meet these conditions
    auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
      auto *FixedVecTy = dyn_cast<FixedVectorType>(Val: Ty);
      return FixedVecTy &&
             DL.getTypeSizeInBits(Ty: FixedVecTy->getElementType()) % 8 == 0 &&
             !FixedVecTy->getElementType()->isPointerTy();
    };

    // Classify every slice of the partition: exactly one whole-alloca load,
    // any number of candidate stores, and nothing else.
    for (Slice &S : P) {
      auto *User = cast<Instruction>(Val: S.getUse()->getUser());
      if (auto *LI = dyn_cast<LoadInst>(Val: User)) {
        // Do not handle the case if
        // 1. There is more than one load
        // 2. The load is volatile
        // 3. The load does not read the entire alloca structure
        // 4. The load does not meet the conditions in the helper function
        if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) ||
            S.beginOffset() != NewAllocaBeginOffset ||
            S.endOffset() != NewAllocaEndOffset || LI->isVolatile())
          return std::nullopt;
        TheLoad = LI;
      } else if (auto *SI = dyn_cast<StoreInst>(Val: User)) {
        // Do not handle the case if
        // 1. The store does not meet the conditions in the helper function
        // 2. The store is volatile
        // 3. The total store size is not a multiple of the allocated element
        // type size
        if (!IsTypeValidForTreeStructuredMerge(
                SI->getValueOperand()->getType()) ||
            SI->isVolatile())
          return std::nullopt;
        auto *VecTy = cast<FixedVectorType>(Val: SI->getValueOperand()->getType());
        unsigned NumElts = VecTy->getNumElements();
        unsigned EltSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
        if (NumElts * EltSize % AllocatedEltTySize != 0)
          return std::nullopt;
        StoreInfos.emplace_back(Args&: SI, Args: S.beginOffset(), Args: S.endOffset(),
                                Args: SI->getValueOperand());
      } else {
        // If we have instructions other than load and store, we cannot do the
        // tree structured merge
        return std::nullopt;
      }
    }
    // If we do not have any load, we cannot do the tree structured merge
    if (!TheLoad)
      return std::nullopt;

    // If we do not have multiple stores, we cannot do the tree structured merge
    if (StoreInfos.size() < 2)
      return std::nullopt;

    // Stores should not overlap and should cover the whole alloca
    // Sort by begin offset
    llvm::sort(C&: StoreInfos, Comp: [](const StoreInfo &A, const StoreInfo &B) {
      return A.BeginOffset < B.BeginOffset;
    });

    // Check for overlaps and coverage
    uint64_t ExpectedStart = NewAllocaBeginOffset;
    for (auto &StoreInfo : StoreInfos) {
      uint64_t BeginOff = StoreInfo.BeginOffset;
      uint64_t EndOff = StoreInfo.EndOffset;

      // Check for gap or overlap
      if (BeginOff != ExpectedStart)
        return std::nullopt;

      ExpectedStart = EndOff;
    }
    // Check that stores cover the entire alloca
    if (ExpectedStart != NewAllocaEndOffset)
      return std::nullopt;

    // Stores should be in the same basic block
    // The load should not be in the middle of the stores
    // Note:
    // If the load is in a different basic block with the stores, we can still
    // do the tree structured merge. This is because we do not have the
    // store->load forwarding here. The merged vector will be stored back to
    // NewAI and the new load will load from NewAI. The forwarding will be
    // handled later when we try to promote NewAI.
    BasicBlock *LoadBB = TheLoad->getParent();
    BasicBlock *StoreBB = StoreInfos[0].Store->getParent();

    for (auto &StoreInfo : StoreInfos) {
      if (StoreInfo.Store->getParent() != StoreBB)
        return std::nullopt;
      if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(Other: TheLoad))
        return std::nullopt;
    }

    // If we reach here, the partition can be merged with a tree structured
    // merge
    LLVM_DEBUG({
      dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
             << "\n Ordered stores:\n";
      for (auto [i, Info] : enumerate(StoreInfos))
        dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
               << Info.EndOffset << ") \tStore: " << *Info.Store
               << "\tValue: " << *Info.StoredValue << "\n";
    });

    // Instead of having these stores, we merge all the stored values into a
    // vector and store the merged value into the alloca
    std::queue<Value *> VecElements;
    // StoreInfos is sorted by offset, not by block order. Anchoring to
    // StoreInfos.back().Store (last by offset) can place shuffles before
    // operands that appear later in the block (invalid SSA). Insert before
    // TheLoad when it shares the store block (after all stores, before any
    // later IR in that block). Otherwise insert before the store block's
    // terminator so the merge runs after every store and any trailing
    // instructions in that block.
    IRBuilder<> Builder(LoadBB == StoreBB ? TheLoad : StoreBB->getTerminator());
    for (const auto &Info : StoreInfos) {
      DeletedValues.push_back(Elt: Info.Store);
      VecElements.push(x: Info.StoredValue);
    }

    LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
    // Pairwise-merge adjacent values until one vector remains; a leftover odd
    // element is rotated to the back and merged in a later round.
    while (VecElements.size() > 1) {
      const auto NumElts = VecElements.size();
      for ([[maybe_unused]] const auto _ : llvm::seq(Size: NumElts / 2)) {
        Value *V0 = VecElements.front();
        VecElements.pop();
        Value *V1 = VecElements.front();
        VecElements.pop();
        Value *Merged = mergeTwoVectors(V0, V1, DL, NewAIEltTy: AllocatedEltTy, Builder);
        LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
        VecElements.push(x: Merged);
      }
      if (NumElts % 2 == 1) {
        Value *V = VecElements.front();
        VecElements.pop();
        VecElements.push(x: V);
      }
    }

    // Store the merged value into the alloca
    Value *MergedValue = VecElements.front();
    Builder.CreateAlignedStore(Val: MergedValue, Ptr: &NewAI, Align: getSliceAlign());

    // Re-create the load right before the original one so later users read
    // the merged value from NewAI, then queue the old load for deletion.
    IRBuilder<> LoadBuilder(TheLoad);
    TheLoad->replaceAllUsesWith(V: LoadBuilder.CreateAlignedLoad(
        Ty: TheLoad->getType(), Ptr: &NewAI, Align: getSliceAlign(), isVolatile: TheLoad->isVolatile(),
        Name: TheLoad->getName() + ".sroa.new.load"));
    DeletedValues.push_back(Elt: TheLoad);

    return DeletedValues;
  }
3078
3079private:
3080 // Make sure the other visit overloads are visible.
3081 using Base::visit;
3082
  // Every instruction which can end up as a user must have a rewrite rule.
  // Reaching this fallback means the slice-building phase accepted a user the
  // rewriter does not know how to handle, which is a bug.
  bool visitInstruction(Instruction &I) {
    LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
    llvm_unreachable("No rewrite rule for this instruction!");
  }
3088
3089 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3090 // Note that the offset computation can use BeginOffset or NewBeginOffset
3091 // interchangeably for unsplit slices.
3092 assert(IsSplit || BeginOffset == NewBeginOffset);
3093 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3094
3095 StringRef OldName = OldPtr->getName();
3096 // Skip through the last '.sroa.' component of the name.
3097 size_t LastSROAPrefix = OldName.rfind(Str: ".sroa.");
3098 if (LastSROAPrefix != StringRef::npos) {
3099 OldName = OldName.substr(Start: LastSROAPrefix + strlen(s: ".sroa."));
3100 // Look for an SROA slice index.
3101 size_t IndexEnd = OldName.find_first_not_of(Chars: "0123456789");
3102 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3103 // Strip the index and look for the offset.
3104 OldName = OldName.substr(Start: IndexEnd + 1);
3105 size_t OffsetEnd = OldName.find_first_not_of(Chars: "0123456789");
3106 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3107 // Strip the offset.
3108 OldName = OldName.substr(Start: OffsetEnd + 1);
3109 }
3110 }
3111 // Strip any SROA suffixes as well.
3112 OldName = OldName.substr(Start: 0, N: OldName.find(Str: ".sroa_"));
3113
3114 return getAdjustedPtr(IRB, DL, Ptr: &NewAI,
3115 Offset: APInt(DL.getIndexTypeSizeInBits(Ty: PointerTy), Offset),
3116 PointerTy, NamePrefix: Twine(OldName) + ".");
3117 }
3118
  /// Compute suitable alignment to access this slice of the *new*
  /// alloca.
  ///
  /// This is the new alloca's alignment reduced as necessary by the slice's
  /// byte offset from the start of the new alloca.
  Align getSliceAlign() {
    return commonAlignment(A: NewAI.getAlign(),
                           Offset: NewBeginOffset - NewAllocaBeginOffset);
  }
3128
3129 unsigned getIndex(uint64_t Offset) {
3130 assert(VecTy && "Can only call getIndex when rewriting a vector");
3131 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3132 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3133 uint32_t Index = RelOffset / ElementSize;
3134 assert(Index * ElementSize == RelOffset);
3135 return Index;
3136 }
3137
3138 void deleteIfTriviallyDead(Value *V) {
3139 Instruction *I = cast<Instruction>(Val: V);
3140 if (isInstructionTriviallyDead(I))
3141 Pass.DeadInsts.push_back(Elt: I);
3142 }
3143
3144 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3145 unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3146 unsigned EndIndex = getIndex(Offset: NewEndOffset);
3147 assert(EndIndex > BeginIndex && "Empty vector!");
3148
3149 LoadInst *Load =
3150 IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3151
3152 Load->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3153 LLVMContext::MD_access_group});
3154 return extractVector(IRB, V: Load, BeginIndex, EndIndex, Name: "vec");
3155 }
3156
  /// Rewrite a load of a slice whose new alloca is being treated as a
  /// single wide integer: load the whole integer, then extract and/or
  /// zero-extend the bytes this slice covers.
  Value *rewriteIntegerLoad(LoadInst &LI) {
    assert(IntTy && "We cannot insert an integer to the alloca");
    assert(!LI.isVolatile());
    // Load the entire alloca-wide value and view it as the integer type.
    Value *V =
        IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
    V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: IntTy);
    assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
    // Only carve out a sub-range when the slice doesn't span the whole
    // alloca-wide integer.
    if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
      IntegerType *ExtractTy = Type::getIntNTy(C&: LI.getContext(), N: SliceSize * 8);
      V = extractInteger(DL, IRB, V, Ty: ExtractTy, Offset, Name: "extract");
    }
    // It is possible that the extracted type is not the load type. This
    // happens if there is a load past the end of the alloca, and as
    // a consequence the slice is narrower but still a candidate for integer
    // lowering. To handle this case, we just zero extend the extracted
    // integer.
    assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
           "Can only handle an extract for an overly wide load");
    if (cast<IntegerType>(Val: LI.getType())->getBitWidth() > SliceSize * 8)
      V = IRB.CreateZExt(V, DestTy: LI.getType());
    return V;
  }
3180
  /// Rewrite a load of the old alloca to operate on the new alloca.
  ///
  /// Dispatches to the vector/integer rewriting helpers when the new
  /// alloca has been given such a form; otherwise emits a direct load of
  /// the new alloca (whole-alloca case) or an adjusted-pointer load
  /// (partial case). Returns true only for a non-volatile load whose
  /// pointer did not need adjusting, i.e. when the new alloca may still
  /// be promotable.
  bool visitLoadInst(LoadInst &LI) {
    LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
    Value *OldOp = LI.getOperand(i_nocapture: 0);
    assert(OldOp == OldPtr);

    AAMDNodes AATags = LI.getAAMetadata();

    unsigned AS = LI.getPointerAddressSpace();

    // Split loads are rewritten as integers of exactly the slice's size;
    // unsplit loads keep their original type.
    Type *TargetTy = IsSplit ? Type::getIntNTy(C&: LI.getContext(), N: SliceSize * 8)
                             : LI.getType();
    bool IsPtrAdjusted = false;
    Value *V;
    if (VecTy) {
      V = rewriteVectorizedLoadInst(LI);
    } else if (IntTy && LI.getType()->isIntegerTy()) {
      V = rewriteIntegerLoad(LI);
    } else if (NewBeginOffset == NewAllocaBeginOffset &&
               NewEndOffset == NewAllocaEndOffset &&
               (canConvertValue(DL, OldTy: NewAllocaTy, NewTy: TargetTy) ||
                (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
                 DL.getTypeStoreSize(Ty: TargetTy).getFixedValue() > SliceSize &&
                 !LI.isVolatile()))) {
      // The slice covers the entire new alloca and the value types are
      // compatible: load the new alloca directly.
      Value *NewPtr =
          getPtrToNewAI(AddrSpace: LI.getPointerAddressSpace(), IsVolatile: LI.isVolatile());
      LoadInst *NewLI = IRB.CreateAlignedLoad(
          Ty: NewAllocaTy, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: LI.isVolatile(), Name: LI.getName());
      if (LI.isVolatile())
        NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
      if (NewLI->isAtomic())
        NewLI->setAlignment(LI.getAlign());

      // Copy any metadata that is valid for the new load. This may require
      // conversion to a different kind of metadata, e.g. !nonnull might change
      // to !range or vice versa.
      copyMetadataForLoad(Dest&: *NewLI, Source: LI);

      // Do this after copyMetadataForLoad() to preserve the TBAA shift.
      if (AATags)
        NewLI->setAAMetadata(AATags.adjustForAccess(
            Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));

      // Try to preserve nonnull metadata
      V = NewLI;

      // If this is an integer load past the end of the slice (which means the
      // bytes outside the slice are undef or this load is dead) just forcibly
      // fix the integer size with correct handling of endianness.
      if (auto *AITy = dyn_cast<IntegerType>(Val: NewAllocaTy))
        if (auto *TITy = dyn_cast<IntegerType>(Val: TargetTy))
          if (AITy->getBitWidth() < TITy->getBitWidth()) {
            V = IRB.CreateZExt(V, DestTy: TITy, Name: "load.ext");
            if (DL.isBigEndian())
              V = IRB.CreateShl(LHS: V, RHS: TITy->getBitWidth() - AITy->getBitWidth(),
                                Name: "endian_shift");
          }
    } else {
      // Partial overlap: compute a pointer to the slice within the new
      // alloca and load through it. This keeps the alloca escaped from
      // promotion (IsPtrAdjusted).
      Type *LTy = IRB.getPtrTy(AddrSpace: AS);
      LoadInst *NewLI =
          IRB.CreateAlignedLoad(Ty: TargetTy, Ptr: getNewAllocaSlicePtr(IRB, PointerTy: LTy),
                                Align: getSliceAlign(), isVolatile: LI.isVolatile(), Name: LI.getName());

      if (AATags)
        NewLI->setAAMetadata(AATags.adjustForAccess(
            Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));

      if (LI.isVolatile())
        NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
      NewLI->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
                               LLVMContext::MD_access_group});

      V = NewLI;
      IsPtrAdjusted = true;
    }
    V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: TargetTy);

    if (IsSplit) {
      assert(!LI.isVolatile());
      assert(LI.getType()->isIntegerTy() &&
             "Only integer type loads and stores are split");
      assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
             "Split load isn't smaller than original load");
      assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
             "Non-byte-multiple bit width");
      // Move the insertion point just past the load so that we can refer to it.
      BasicBlock::iterator LIIt = std::next(x: LI.getIterator());
      // Ensure the insertion point comes before any debug-info immediately
      // after the load, so that variable values referring to the load are
      // dominated by it.
      LIIt.setHeadBit(true);
      IRB.SetInsertPoint(TheBB: LI.getParent(), IP: LIIt);
      // Create a placeholder value with the same type as LI to use as the
      // basis for the new value. This allows us to replace the uses of LI with
      // the computed value, and then replace the placeholder with LI, leaving
      // LI only used for this computation.
      Value *Placeholder =
          new LoadInst(LI.getType(), PoisonValue::get(T: IRB.getPtrTy(AddrSpace: AS)), "",
                       false, Align(1));
      V = insertInteger(DL, IRB, Old: Placeholder, V, Offset: NewBeginOffset - BeginOffset,
                        Name: "insert");
      LI.replaceAllUsesWith(V);
      Placeholder->replaceAllUsesWith(V: &LI);
      Placeholder->deleteValue();
    } else {
      LI.replaceAllUsesWith(V);
    }

    Pass.DeadInsts.push_back(Elt: &LI);
    deleteIfTriviallyDead(V: OldOp);
    LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
    return !LI.isVolatile() && !IsPtrAdjusted;
  }
3293
  /// Rewrite a store to a slice backed by the vector form of the new
  /// alloca: cast the stored value to the covered element/sub-vector
  /// type, blend it into the existing vector, and store the whole vector
  /// back. Always returns true.
  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
                                  AAMDNodes AATags) {
    // Capture V for the purpose of debug-info accounting once it's converted
    // to a vector store.
    Value *OrigV = V;
    if (V->getType() != VecTy) {
      // Partial store: figure out which elements the slice covers.
      unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
      unsigned EndIndex = getIndex(Offset: NewEndOffset);
      assert(EndIndex > BeginIndex && "Empty vector!");
      unsigned NumElements = EndIndex - BeginIndex;
      assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
             "Too many elements!");
      Type *SliceTy = (NumElements == 1)
                          ? ElementTy
                          : FixedVectorType::get(ElementType: ElementTy, NumElts: NumElements);
      if (V->getType() != SliceTy)
        V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: SliceTy);

      // Mix in the existing elements.
      Value *Old =
          IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
      V = insertVector(IRB, Old, V, BeginIndex, Name: "vec");
    }
    StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
    Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
    if (AATags)
      Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
                                                  AccessTy: V->getType(), DL));
    Pass.DeadInsts.push_back(Elt: &SI);

    // NOTE: Careful to use OrigV rather than V.
    migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
                     Inst: Store, Dest: Store->getPointerOperand(), Value: OrigV, DL);
    LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
    return true;
  }
3331
  /// Rewrite a store to a slice whose new alloca is treated as a single
  /// wide integer: when the stored value is narrower than the alloca-wide
  /// integer, read-modify-write by inserting it at the slice's offset.
  /// Always returns true.
  bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
    assert(IntTy && "We cannot extract an integer from the alloca");
    assert(!SI.isVolatile());
    if (DL.getTypeSizeInBits(Ty: V->getType()).getFixedValue() !=
        IntTy->getBitWidth()) {
      // Narrower than the alloca-wide integer: load the existing value
      // and insert the (unsplit) stored operand at the right offset.
      Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
                                         Name: "oldload");
      Old = IRB.CreateBitPreservingCastChain(DL, V: Old, NewTy: IntTy);
      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
      V = insertInteger(DL, IRB, Old, V: SI.getValueOperand(), Offset, Name: "insert");
    }
    V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
    StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
    Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
    if (AATags)
      Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
                                                  AccessTy: V->getType(), DL));

    migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
                     Inst: Store, Dest: Store->getPointerOperand(),
                     Value: Store->getValueOperand(), DL);

    Pass.DeadInsts.push_back(Elt: &SI);
    LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
    return true;
  }
3360
  /// Rewrite a store to the old alloca to target the new alloca.
  ///
  /// Split stores first extract the slice's bytes from the stored
  /// integer; then the store is dispatched to the vector/integer helpers
  /// or emitted directly. Returns true only when the result is a
  /// non-volatile whole-alloca store of the alloca's type, i.e. the new
  /// alloca may still be promotable.
  bool visitStoreInst(StoreInst &SI) {
    LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
    Value *OldOp = SI.getOperand(i_nocapture: 1);
    assert(OldOp == OldPtr);

    AAMDNodes AATags = SI.getAAMetadata();
    Value *V = SI.getValueOperand();

    // Strip all inbounds GEPs and pointer casts to try to dig out any root
    // alloca that should be re-examined after promoting this alloca.
    if (V->getType()->isPointerTy())
      if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: V->stripInBoundsOffsets()))
        Pass.PostPromotionWorklist.insert(X: AI);

    TypeSize StoreSize = DL.getTypeStoreSize(Ty: V->getType());
    if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
      // This slice only covers part of the stored integer; extract just
      // those bytes.
      assert(!SI.isVolatile());
      assert(V->getType()->isIntegerTy() &&
             "Only integer type loads and stores are split");
      assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
             "Non-byte-multiple bit width");
      IntegerType *NarrowTy = Type::getIntNTy(C&: SI.getContext(), N: SliceSize * 8);
      V = extractInteger(DL, IRB, V, Ty: NarrowTy, Offset: NewBeginOffset - BeginOffset,
                         Name: "extract");
    }

    if (VecTy)
      return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
    if (IntTy && V->getType()->isIntegerTy())
      return rewriteIntegerStore(V, SI, AATags);

    StoreInst *NewSI;
    if (NewBeginOffset == NewAllocaBeginOffset &&
        NewEndOffset == NewAllocaEndOffset &&
        canConvertValue(DL, OldTy: V->getType(), NewTy: NewAllocaTy)) {
      // Whole-alloca store of a convertible value: store directly.
      V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
      Value *NewPtr =
          getPtrToNewAI(AddrSpace: SI.getPointerAddressSpace(), IsVolatile: SI.isVolatile());

      NewSI =
          IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: SI.isVolatile());
    } else {
      // Partial overlap: store through an adjusted slice pointer.
      unsigned AS = SI.getPointerAddressSpace();
      Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: IRB.getPtrTy(AddrSpace: AS));
      NewSI =
          IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: getSliceAlign(), isVolatile: SI.isVolatile());
    }
    NewSI->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
    if (AATags)
      NewSI->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
                                                  AccessTy: V->getType(), DL));
    if (SI.isVolatile())
      NewSI->setAtomic(Ordering: SI.getOrdering(), SSID: SI.getSyncScopeID());
    if (NewSI->isAtomic())
      NewSI->setAlignment(SI.getAlign());

    migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
                     Inst: NewSI, Dest: NewSI->getPointerOperand(),
                     Value: NewSI->getValueOperand(), DL);

    Pass.DeadInsts.push_back(Elt: &SI);
    deleteIfTriviallyDead(V: OldOp);

    LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
    return NewSI->getPointerOperand() == &NewAI &&
           NewSI->getValueOperand()->getType() == NewAllocaTy &&
           !SI.isVolatile();
  }
3430
3431 /// Compute an integer value from splatting an i8 across the given
3432 /// number of bytes.
3433 ///
3434 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3435 /// call this routine.
3436 /// FIXME: Heed the advice above.
3437 ///
3438 /// \param V The i8 value to splat.
3439 /// \param Size The number of bytes in the output (assuming i8 is one byte)
3440 Value *getIntegerSplat(Value *V, unsigned Size) {
3441 assert(Size > 0 && "Expected a positive number of bytes.");
3442 IntegerType *VTy = cast<IntegerType>(Val: V->getType());
3443 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3444 if (Size == 1)
3445 return V;
3446
3447 Type *SplatIntTy = Type::getIntNTy(C&: VTy->getContext(), N: Size * 8);
3448 V = IRB.CreateMul(
3449 LHS: IRB.CreateZExt(V, DestTy: SplatIntTy, Name: "zext"),
3450 RHS: IRB.CreateUDiv(LHS: Constant::getAllOnesValue(Ty: SplatIntTy),
3451 RHS: IRB.CreateZExt(V: Constant::getAllOnesValue(Ty: V->getType()),
3452 DestTy: SplatIntTy)),
3453 Name: "isplat");
3454 return V;
3455 }
3456
3457 /// Compute a vector splat for a given element value.
3458 Value *getVectorSplat(Value *V, unsigned NumElements) {
3459 V = IRB.CreateVectorSplat(NumElts: NumElements, V, Name: "vsplat");
3460 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3461 return V;
3462 }
3463
  /// Rewrite a memset over the old alloca.
  ///
  /// Variable-length memsets only have their destination pointer
  /// adjusted. Constant-length memsets are either re-emitted as a
  /// narrower memset, or turned into a store of a splatted value when the
  /// new alloca has a suitable scalar/vector/integer form. Returns true
  /// only when the memset was lowered to a non-volatile store of the new
  /// alloca.
  bool visitMemSetInst(MemSetInst &II) {
    LLVM_DEBUG(dbgs() << " original: " << II << "\n");
    assert(II.getRawDest() == OldPtr);

    AAMDNodes AATags = II.getAAMetadata();

    // If the memset has a variable size, it cannot be split, just adjust the
    // pointer to the new alloca.
    if (!isa<ConstantInt>(Val: II.getLength())) {
      assert(!IsSplit);
      assert(NewBeginOffset == BeginOffset);
      II.setDest(getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()));
      II.setDestAlignment(getSliceAlign());
      // In theory we should call migrateDebugInfo here. However, we do not
      // emit dbg.assign intrinsics for mem intrinsics storing through non-
      // constant geps, or storing a variable number of bytes.
      assert(at::getDVRAssignmentMarkers(&II).empty() &&
             "AT: Unexpected link to non-const GEP");
      deleteIfTriviallyDead(V: OldPtr);
      return false;
    }

    // Record this instruction for deletion.
    Pass.DeadInsts.push_back(Elt: &II);

    Type *ScalarTy = NewAllocaTy->getScalarType();

    // Decide whether the memset can be lowered to a store of the new
    // alloca's value type rather than re-emitted as a memset.
    const bool CanContinue = [&]() {
      if (VecTy || IntTy)
        return true;
      if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
        return false;
      // Length must be in range for FixedVectorType.
      auto *C = cast<ConstantInt>(Val: II.getLength());
      const uint64_t Len = C->getLimitedValue();
      if (Len > std::numeric_limits<unsigned>::max())
        return false;
      auto *Int8Ty = IntegerType::getInt8Ty(C&: NewAI.getContext());
      auto *SrcTy = FixedVectorType::get(ElementType: Int8Ty, NumElts: Len);
      return canConvertValue(DL, OldTy: SrcTy, NewTy: NewAllocaTy) &&
             DL.isLegalInteger(Width: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue());
    }();

    // If this doesn't map cleanly onto the alloca type, and that type isn't
    // a single value type, just emit a memset.
    if (!CanContinue) {
      Type *SizeTy = II.getLength()->getType();
      unsigned Sz = NewEndOffset - NewBeginOffset;
      Constant *Size = ConstantInt::get(Ty: SizeTy, V: Sz);
      MemIntrinsic *New = cast<MemIntrinsic>(Val: IRB.CreateMemSet(
          Ptr: getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()), Val: II.getValue(), Size,
          Align: MaybeAlign(getSliceAlign()), isVolatile: II.isVolatile()));
      if (AATags)
        New->setAAMetadata(
            AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset, AccessSize: Sz));

      migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
                       Inst: New, Dest: New->getRawDest(), Value: nullptr, DL);

      LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
      return false;
    }

    // If we can represent this as a simple value, we have to build the actual
    // value to store, which requires expanding the byte present in memset to
    // a sensible representation for the alloca type. This is essentially
    // splatting the byte to a sufficiently wide integer, splatting it across
    // any desired vector width, and bitcasting to the final type.
    Value *V;

    if (VecTy) {
      // If this is a memset of a vectorized alloca, insert it.
      assert(ElementTy == ScalarTy);

      unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
      unsigned EndIndex = getIndex(Offset: NewEndOffset);
      assert(EndIndex > BeginIndex && "Empty vector!");
      unsigned NumElements = EndIndex - BeginIndex;
      assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
             "Too many elements!");

      // Splat the byte into one element, widen to the covered elements,
      // then blend into the existing vector (read-modify-write).
      Value *Splat = getIntegerSplat(
          V: II.getValue(), Size: DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / 8);
      Splat = IRB.CreateBitPreservingCastChain(DL, V: Splat, NewTy: ElementTy);
      if (NumElements > 1)
        Splat = getVectorSplat(V: Splat, NumElements);

      Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
                                         Name: "oldload");
      V = insertVector(IRB, Old, V: Splat, BeginIndex, Name: "vec");
    } else if (IntTy) {
      // If this is a memset on an alloca where we can widen stores, insert the
      // set integer.
      assert(!II.isVolatile());

      uint64_t Size = NewEndOffset - NewBeginOffset;
      V = getIntegerSplat(V: II.getValue(), Size);

      if (IntTy && (NewBeginOffset != NewAllocaBeginOffset ||
                    NewEndOffset != NewAllocaEndOffset)) {
        // Partial cover of the alloca-wide integer: read-modify-write.
        Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI,
                                           Align: NewAI.getAlign(), Name: "oldload");
        Old = IRB.CreateBitPreservingCastChain(DL, V: Old, NewTy: IntTy);
        uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
        V = insertInteger(DL, IRB, Old, V, Offset, Name: "insert");
      } else {
        assert(V->getType() == IntTy &&
               "Wrong type for an alloca wide integer!");
      }
      V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
    } else {
      // Established these invariants above.
      assert(NewBeginOffset == NewAllocaBeginOffset);
      assert(NewEndOffset == NewAllocaEndOffset);

      V = getIntegerSplat(V: II.getValue(),
                          Size: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue() / 8);
      if (VectorType *AllocaVecTy = dyn_cast<VectorType>(Val: NewAllocaTy))
        V = getVectorSplat(
            V, NumElements: cast<FixedVectorType>(Val: AllocaVecTy)->getNumElements());

      V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
    }

    Value *NewPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
    StoreInst *New =
        IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: II.isVolatile());
    New->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
                           LLVMContext::MD_access_group});
    if (AATags)
      New->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
                                                AccessTy: V->getType(), DL));

    migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
                     Inst: New, Dest: New->getPointerOperand(), Value: V, DL);

    LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
    return !II.isVolatile();
  }
3603
  /// Rewrite a memcpy/memmove touching the old alloca.
  ///
  /// Unsplittable transfers only get their pointer on our side adjusted.
  /// Splittable transfers are either re-emitted as a narrower memcpy, or
  /// lowered to a load/store pair through the new alloca (with vector or
  /// integer read-modify-write for partial covers). Returns true only
  /// when the transfer was lowered to a non-volatile load/store of the
  /// new alloca.
  bool visitMemTransferInst(MemTransferInst &II) {
    // Rewriting of memory transfer instructions can be a bit tricky. We break
    // them into two categories: split intrinsics and unsplit intrinsics.

    LLVM_DEBUG(dbgs() << " original: " << II << "\n");

    AAMDNodes AATags = II.getAAMetadata();

    bool IsDest = &II.getRawDestUse() == OldUse;
    assert((IsDest && II.getRawDest() == OldPtr) ||
           (!IsDest && II.getRawSource() == OldPtr));

    Align SliceAlign = getSliceAlign();
    // For unsplit intrinsics, we simply modify the source and destination
    // pointers in place. This isn't just an optimization, it is a matter of
    // correctness. With unsplit intrinsics we may be dealing with transfers
    // within a single alloca before SROA ran, or with transfers that have
    // a variable length. We may also be dealing with memmove instead of
    // memcpy, and so simply updating the pointers is the necessary for us to
    // update both source and dest of a single call.
    if (!IsSplittable) {
      Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
      if (IsDest) {
        // Update the address component of linked dbg.assigns.
        for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(Inst: &II)) {
          if (llvm::is_contained(Range: DbgAssign->location_ops(), Element: II.getDest()) ||
              DbgAssign->getAddress() == II.getDest())
            DbgAssign->replaceVariableLocationOp(OldValue: II.getDest(), NewValue: AdjustedPtr);
        }
        II.setDest(AdjustedPtr);
        II.setDestAlignment(SliceAlign);
      } else {
        II.setSource(AdjustedPtr);
        II.setSourceAlignment(SliceAlign);
      }

      LLVM_DEBUG(dbgs() << " to: " << II << "\n");
      deleteIfTriviallyDead(V: OldPtr);
      return false;
    }
    // For split transfer intrinsics we have an incredibly useful assurance:
    // the source and destination do not reside within the same alloca, and at
    // least one of them does not escape. This means that we can replace
    // memmove with memcpy, and we don't need to worry about all manner of
    // downsides to splitting and transforming the operations.

    // If this doesn't map cleanly onto the alloca type, and that type isn't
    // a single value type, just emit a memcpy.
    bool EmitMemCpy =
        !VecTy && !IntTy &&
        (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
         SliceSize != DL.getTypeStoreSize(Ty: NewAllocaTy).getFixedValue() ||
         !DL.typeSizeEqualsStoreSize(Ty: NewAllocaTy) ||
         !NewAllocaTy->isSingleValueType());

    // If we're just going to emit a memcpy, the alloca hasn't changed, and the
    // size hasn't been shrunk based on analysis of the viable range, this is
    // a no-op.
    if (EmitMemCpy && &OldAI == &NewAI) {
      // Ensure the start lines up.
      assert(NewBeginOffset == BeginOffset);

      // Rewrite the size as needed.
      if (NewEndOffset != EndOffset)
        II.setLength(NewEndOffset - NewBeginOffset);
      return false;
    }
    // Record this instruction for deletion.
    Pass.DeadInsts.push_back(Elt: &II);

    // Strip all inbounds GEPs and pointer casts to try to dig out any root
    // alloca that should be re-examined after rewriting this instruction.
    Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
    if (AllocaInst *AI =
            dyn_cast<AllocaInst>(Val: OtherPtr->stripInBoundsOffsets())) {
      assert(AI != &OldAI && AI != &NewAI &&
             "Splittable transfers cannot reach the same alloca on both ends.");
      Pass.Worklist.insert(X: AI);
    }

    Type *OtherPtrTy = OtherPtr->getType();
    unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();

    // Compute the relative offset for the other pointer within the transfer.
    unsigned OffsetWidth = DL.getIndexSizeInBits(AS: OtherAS);
    APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
    Align OtherAlign =
        (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
    OtherAlign =
        commonAlignment(A: OtherAlign, Offset: OtherOffset.zextOrTrunc(width: 64).getZExtValue());

    if (EmitMemCpy) {
      // Compute the other pointer, folding as much as possible to produce
      // a single, simple GEP in most cases.
      OtherPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
                                NamePrefix: OtherPtr->getName() + ".");

      Value *OurPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
      Type *SizeTy = II.getLength()->getType();
      Constant *Size = ConstantInt::get(Ty: SizeTy, V: NewEndOffset - NewBeginOffset);

      Value *DestPtr, *SrcPtr;
      MaybeAlign DestAlign, SrcAlign;
      // Note: IsDest is true iff we're copying into the new alloca slice
      if (IsDest) {
        DestPtr = OurPtr;
        DestAlign = SliceAlign;
        SrcPtr = OtherPtr;
        SrcAlign = OtherAlign;
      } else {
        DestPtr = OtherPtr;
        DestAlign = OtherAlign;
        SrcPtr = OurPtr;
        SrcAlign = SliceAlign;
      }
      CallInst *New = IRB.CreateMemCpy(Dst: DestPtr, DstAlign: DestAlign, Src: SrcPtr, SrcAlign,
                                       Size, isVolatile: II.isVolatile());
      if (AATags)
        New->setAAMetadata(AATags.shift(Offset: NewBeginOffset - BeginOffset));

      APInt Offset(DL.getIndexTypeSizeInBits(Ty: DestPtr->getType()), 0);
      if (IsDest) {
        migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8,
                         OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
      } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
                     Val: DestPtr->stripAndAccumulateConstantOffsets(
                         DL, Offset, /*AllowNonInbounds*/ true))) {
        migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * 8,
                         SliceSizeInBits: SliceSize * 8, OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
      }
      LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
      return false;
    }

    bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
                         NewEndOffset == NewAllocaEndOffset;
    uint64_t Size = NewEndOffset - NewBeginOffset;
    unsigned BeginIndex = VecTy ? getIndex(Offset: NewBeginOffset) : 0;
    unsigned EndIndex = VecTy ? getIndex(Offset: NewEndOffset) : 0;
    unsigned NumElements = EndIndex - BeginIndex;
    IntegerType *SubIntTy =
        IntTy ? Type::getIntNTy(C&: IntTy->getContext(), N: Size * 8) : nullptr;

    // Reset the other pointer type to match the register type we're going to
    // use, but using the address space of the original other pointer.
    Type *OtherTy;
    if (VecTy && !IsWholeAlloca) {
      if (NumElements == 1)
        OtherTy = VecTy->getElementType();
      else
        OtherTy = FixedVectorType::get(ElementType: VecTy->getElementType(), NumElts: NumElements);
    } else if (IntTy && !IsWholeAlloca) {
      OtherTy = SubIntTy;
    } else {
      OtherTy = NewAllocaTy;
    }

    Value *AdjPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
                                   NamePrefix: OtherPtr->getName() + ".");
    MaybeAlign SrcAlign = OtherAlign;
    MaybeAlign DstAlign = SliceAlign;
    if (!IsDest)
      std::swap(a&: SrcAlign, b&: DstAlign);

    Value *SrcPtr;
    Value *DstPtr;

    if (IsDest) {
      DstPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
      SrcPtr = AdjPtr;
    } else {
      DstPtr = AdjPtr;
      SrcPtr = getPtrToNewAI(AddrSpace: II.getSourceAddressSpace(), IsVolatile: II.isVolatile());
    }

    // Produce the value to store: either a direct load from the source, or
    // (when our side is the source and only partially covered) an extract
    // from the new alloca's vector/integer form.
    Value *Src;
    if (VecTy && !IsWholeAlloca && !IsDest) {
      Src =
          IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
      Src = extractVector(IRB, V: Src, BeginIndex, EndIndex, Name: "vec");
    } else if (IntTy && !IsWholeAlloca && !IsDest) {
      Src =
          IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
      Src = IRB.CreateBitPreservingCastChain(DL, V: Src, NewTy: IntTy);
      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
      Src = extractInteger(DL, IRB, V: Src, Ty: SubIntTy, Offset, Name: "extract");
    } else {
      LoadInst *Load = IRB.CreateAlignedLoad(Ty: OtherTy, Ptr: SrcPtr, Align: SrcAlign,
                                             isVolatile: II.isVolatile(), Name: "copyload");
      Load->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
                              LLVMContext::MD_access_group});
      if (AATags)
        Load->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
                                                   AccessTy: Load->getType(), DL));
      Src = Load;
    }

    // When our side is the destination and only partially covered, blend the
    // loaded value into the existing contents (read-modify-write).
    if (VecTy && !IsWholeAlloca && IsDest) {
      Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
                                         Name: "oldload");
      Src = insertVector(IRB, Old, V: Src, BeginIndex, Name: "vec");
    } else if (IntTy && !IsWholeAlloca && IsDest) {
      Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
                                         Name: "oldload");
      Old = IRB.CreateBitPreservingCastChain(DL, V: Old, NewTy: IntTy);
      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
      Src = insertInteger(DL, IRB, Old, V: Src, Offset, Name: "insert");
      Src = IRB.CreateBitPreservingCastChain(DL, V: Src, NewTy: NewAllocaTy);
    }

    StoreInst *Store = cast<StoreInst>(
        Val: IRB.CreateAlignedStore(Val: Src, Ptr: DstPtr, Align: DstAlign, isVolatile: II.isVolatile()));
    Store->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
    if (AATags)
      Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
                                                  AccessTy: Src->getType(), DL));

    APInt Offset(DL.getIndexTypeSizeInBits(Ty: DstPtr->getType()), 0);
    if (IsDest) {

      migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
                       Inst: Store, Dest: DstPtr, Value: Src, DL);
    } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
                   Val: DstPtr->stripAndAccumulateConstantOffsets(
                       DL, Offset, /*AllowNonInbounds*/ true))) {
      migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * 8, SliceSizeInBits: SliceSize * 8,
                       OldInst: &II, Inst: Store, Dest: DstPtr, Value: Src, DL);
    }

    LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
    return !II.isVolatile();
  }
3837
  bool visitIntrinsicInst(IntrinsicInst &II) {
    // Only lifetime markers and droppable intrinsics reach this visitor; the
    // assert pins that invariant.
    assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
           "Unexpected intrinsic!");
    LLVM_DEBUG(dbgs() << " original: " << II << "\n");

    // Record this instruction for deletion.
    Pass.DeadInsts.push_back(Elt: &II);

    if (II.isDroppable()) {
      assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
      // TODO For now we forget assumed information, this can be improved.
      OldPtr->dropDroppableUsesIn(Usr&: II);
      return true;
    }

    // Rewrite a lifetime intrinsic to apply to the slice of the new alloca
    // rather than the old pointer.
    assert(II.getArgOperand(0) == OldPtr);
    Type *PointerTy = IRB.getPtrTy(AddrSpace: OldPtr->getType()->getPointerAddressSpace());
    Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
    Value *New;
    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
      New = IRB.CreateLifetimeStart(Ptr);
    else
      New = IRB.CreateLifetimeEnd(Ptr);

    (void)New; // Only referenced inside LLVM_DEBUG below.
    LLVM_DEBUG(dbgs() << " to: " << *New << "\n");

    return true;
  }
3867
  /// Walk all transitive pointer users of \p Root and clamp the alignment of
  /// every load or store found to the alignment the new slice can guarantee.
  void fixLoadStoreAlign(Instruction &Root) {
    // This algorithm implements the same visitor loop as
    // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
    // or store found.
    SmallPtrSet<Instruction *, 4> Visited;
    SmallVector<Instruction *, 4> Uses;
    Visited.insert(Ptr: &Root);
    Uses.push_back(Elt: &Root);
    do {
      Instruction *I = Uses.pop_back_val();

      if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
        // Never raise an access's alignment; only lower it to the slice's.
        LI->setAlignment(std::min(a: LI->getAlign(), b: getSliceAlign()));
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
        SI->setAlignment(std::min(a: SI->getAlign(), b: getSliceAlign()));
        continue;
      }

      // Anything else must be a pointer-transparent instruction; keep
      // walking its users.
      assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
             isa<PHINode>(I) || isa<SelectInst>(I) ||
             isa<GetElementPtrInst>(I));
      for (User *U : I->users())
        if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
          Uses.push_back(Elt: cast<Instruction>(Val: U));
    } while (!Uses.empty());
  }
3896
  bool visitPHINode(PHINode &PN) {
    LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
    assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
    assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");

    // We would like to compute a new pointer in only one place, but have it be
    // as local as possible to the PHI. To do that, we re-use the location of
    // the old pointer, which necessarily must be in the right position to
    // dominate the PHI.
    IRBuilderBase::InsertPointGuard Guard(IRB);
    if (isa<PHINode>(Val: OldPtr))
      // Can't insert at a PHI itself; use the block's first insertion point.
      IRB.SetInsertPoint(TheBB: OldPtr->getParent(),
                         IP: OldPtr->getParent()->getFirstInsertionPt());
    else
      IRB.SetInsertPoint(OldPtr);
    IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());

    Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
    // Replace the operands which were using the old pointer.
    std::replace(first: PN.op_begin(), last: PN.op_end(), old_value: cast<Value>(Val: OldPtr), new_value: NewPtr);

    LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
    deleteIfTriviallyDead(V: OldPtr);

    // Fix the alignment of any loads or stores using this PHI node.
    fixLoadStoreAlign(Root&: PN);

    // PHIs can't be promoted on their own, but often can be speculated. We
    // check the speculation outside of the rewriter so that we see the
    // fully-rewritten alloca.
    PHIUsers.insert(X: &PN);
    return true;
  }
3930
  bool visitSelectInst(SelectInst &SI) {
    LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
    assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
           "Pointer isn't an operand!");
    assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
    assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");

    Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
    // Replace the operands which were using the old pointer. Either (or
    // both) of the true and false arms may reference it.
    if (SI.getOperand(i_nocapture: 1) == OldPtr)
      SI.setOperand(i_nocapture: 1, Val_nocapture: NewPtr);
    if (SI.getOperand(i_nocapture: 2) == OldPtr)
      SI.setOperand(i_nocapture: 2, Val_nocapture: NewPtr);

    LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
    deleteIfTriviallyDead(V: OldPtr);

    // Fix the alignment of any loads or stores using this select.
    fixLoadStoreAlign(Root&: SI);

    // Selects can't be promoted on their own, but often can be speculated. We
    // check the speculation outside of the rewriter so that we see the
    // fully-rewritten alloca.
    SelectUsers.insert(X: &SI);
    return true;
  }
3957};
3958
/// Visitor to rewrite aggregate loads and stores as scalar.
///
/// This pass aggressively rewrites all aggregate loads and stores on
/// a particular pointer (or any pointer derived from it which we can identify)
/// with scalar loads and stores.
class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
  // Befriend the base class so it can delegate to private visit methods.
  friend class InstVisitor<AggLoadStoreRewriter, bool>;

  /// Queue of pointer uses to analyze and potentially rewrite.
  SmallVector<Use *, 8> Queue;

  /// Set to prevent us from cycling with phi nodes and loops.
  SmallPtrSet<User *, 8> Visited;

  /// The current pointer use being rewritten. This is used to dig up the used
  /// value (as opposed to the user).
  Use *U = nullptr;

  /// Used to calculate offsets, and hence alignment, of subobjects.
  const DataLayout &DL;

  /// Builder used to create all replacement instructions.
  IRBuilderTy &IRB;

public:
  AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
      : DL(DL), IRB(IRB) {}

  /// Rewrite loads and stores through a pointer and all pointers derived from
  /// it. Returns true if anything was changed.
  bool rewrite(Instruction &I) {
    LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
    enqueueUsers(I);
    bool Changed = false;
    while (!Queue.empty()) {
      U = Queue.pop_back_val();
      Changed |= visit(I: cast<Instruction>(Val: U->getUser()));
    }
    return Changed;
  }

private:
  /// Enqueue all the users of the given instruction for further processing.
  /// This uses a set to de-duplicate users.
  void enqueueUsers(Instruction &I) {
    for (Use &U : I.uses())
      if (Visited.insert(Ptr: U.getUser()).second)
        Queue.push_back(Elt: &U);
  }

  // Conservative default is to not rewrite anything.
  bool visitInstruction(Instruction &I) { return false; }

  /// Generic recursive split emission class.
  template <typename Derived> class OpSplitter {
  protected:
    /// The builder used to form new instructions.
    IRBuilderTy &IRB;

    /// The indices which to be used with insert- or extractvalue to select the
    /// appropriate value within the aggregate.
    SmallVector<unsigned, 4> Indices;

    /// The indices to a GEP instruction which will move Ptr to the correct slot
    /// within the aggregate.
    SmallVector<Value *, 4> GEPIndices;

    /// The base pointer of the original op, used as a base for GEPing the
    /// split operations.
    Value *Ptr;

    /// The base pointee type being GEPed into.
    Type *BaseTy;

    /// Known alignment of the base pointer.
    Align BaseAlign;

    /// To calculate offset of each component so we can correctly deduce
    /// alignments.
    const DataLayout &DL;

    /// Initialize the splitter with an insertion point, Ptr and start with a
    /// single zero GEP index.
    OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
               Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
        : IRB(IRB), GEPIndices(1, IRB.getInt32(C: 0)), Ptr(Ptr), BaseTy(BaseTy),
          BaseAlign(BaseAlign), DL(DL) {
      IRB.SetInsertPoint(InsertionPoint);
    }

  public:
    /// Generic recursive split emission routine.
    ///
    /// This method recursively splits an aggregate op (load or store) into
    /// scalar or vector ops. It splits recursively until it hits a single value
    /// and emits that single value operation via the template argument.
    ///
    /// The logic of this routine relies on GEPs and insertvalue and
    /// extractvalue all operating with the same fundamental index list, merely
    /// formatted differently (GEPs need actual values).
    ///
    /// \param Ty The type being split recursively into smaller ops.
    /// \param Agg The aggregate value being built up or stored, depending on
    /// whether this is splitting a load or a store respectively.
    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
      if (Ty->isSingleValueType()) {
        // Leaf case: compute the component's offset so the derived class can
        // deduce a correct alignment, then let it emit the actual operation.
        unsigned Offset = DL.getIndexedOffsetInType(ElemTy: BaseTy, Indices: GEPIndices);
        return static_cast<Derived *>(this)->emitFunc(
            Ty, Agg, commonAlignment(A: BaseAlign, Offset), Name);
      }

      if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Elt: Idx);
          GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
          emitSplitOps(Ty: ATy->getElementType(), Agg, Name: Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Elt: Idx);
          GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
          emitSplitOps(Ty: STy->getElementType(N: Idx), Agg, Name: Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      llvm_unreachable("Only arrays and structs are aggregate loadable types");
    }
  };

  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
    /// AA metadata to propagate (offset-adjusted) onto each split load.
    AAMDNodes AATags;
    // A vector to hold the split components that we want to emit
    // separate fake uses for.
    SmallVector<Value *, 4> Components;
    // A vector to hold all the fake uses of the struct that we are splitting.
    // Usually there should only be one, but we are handling the general case.
    SmallVector<Instruction *, 1> FakeUses;

    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
                   AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
                   IRBuilderTy &IRB)
        : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
                                     IRB),
          AATags(AATags) {}

    /// Emit a leaf load of a single value. This is called at the leaves of the
    /// recursive emission to actually load values.
    void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
      assert(Ty->isSingleValueType());
      // Load the single value and insert it using the indices.
      Value *GEP =
          IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
      LoadInst *Load =
          IRB.CreateAlignedLoad(Ty, Ptr: GEP, Align: Alignment, Name: Name + ".load");

      // Only attach AA metadata if the GEP offset is a compile-time constant;
      // otherwise we cannot adjust the tags for the access offset.
      APInt Offset(
          DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), 0);
      if (AATags &&
          GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset))
        Load->setAAMetadata(
            AATags.adjustForAccess(Offset: Offset.getZExtValue(), AccessTy: Load->getType(), DL));
      // Record the load so we can generate a fake use for this aggregate
      // component.
      Components.push_back(Elt: Load);

      Agg = IRB.CreateInsertValue(Agg, Val: Load, Idxs: Indices, Name: Name + ".insert");
      LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
    }

    // Stash the fake uses that use the value generated by this instruction.
    void recordFakeUses(LoadInst &LI) {
      for (Use &U : LI.uses())
        if (auto *II = dyn_cast<IntrinsicInst>(Val: U.getUser()))
          if (II->getIntrinsicID() == Intrinsic::fake_use)
            FakeUses.push_back(Elt: II);
    }

    // Replace all fake uses of the aggregate with a series of fake uses, one
    // for each split component.
    void emitFakeUses() {
      for (Instruction *I : FakeUses) {
        IRB.SetInsertPoint(I);
        for (auto *V : Components)
          IRB.CreateIntrinsic(ID: Intrinsic::fake_use, Args: {V});
        I->eraseFromParent();
      }
    }
  };

  /// Split a simple aggregate load into per-component scalar loads.
  bool visitLoadInst(LoadInst &LI) {
    assert(LI.getPointerOperand() == *U);
    if (!LI.isSimple() || LI.getType()->isSingleValueType())
      return false;

    // We have an aggregate being loaded, split it apart.
    LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
    LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
                            getAdjustedAlignment(I: &LI, Offset: 0), DL, IRB);
    Splitter.recordFakeUses(LI);
    Value *V = PoisonValue::get(T: LI.getType());
    Splitter.emitSplitOps(Ty: LI.getType(), Agg&: V, Name: LI.getName() + ".fca");
    Splitter.emitFakeUses();
    Visited.erase(Ptr: &LI);
    LI.replaceAllUsesWith(V);
    LI.eraseFromParent();
    return true;
  }

  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
                    AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
                    const DataLayout &DL, IRBuilderTy &IRB)
        : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
                                      DL, IRB),
          AATags(AATags), AggStore(AggStore) {}
    /// AA metadata to propagate (offset-adjusted) onto each split store.
    AAMDNodes AATags;
    /// The original aggregate store being split; used to migrate debug info.
    StoreInst *AggStore;
    /// Emit a leaf store of a single value. This is called at the leaves of the
    /// recursive emission to actually produce stores.
    void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
      assert(Ty->isSingleValueType());
      // Extract the single value and store it using the indices.
      //
      // The gep and extractvalue values are factored out of the CreateStore
      // call to make the output independent of the argument evaluation order.
      Value *ExtractValue =
          IRB.CreateExtractValue(Agg, Idxs: Indices, Name: Name + ".extract");
      Value *InBoundsGEP =
          IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
      StoreInst *Store =
          IRB.CreateAlignedStore(Val: ExtractValue, Ptr: InBoundsGEP, Align: Alignment);

      APInt Offset(
          DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), 0);
      GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset);
      if (AATags) {
        Store->setAAMetadata(AATags.adjustForAccess(
            Offset: Offset.getZExtValue(), AccessTy: ExtractValue->getType(), DL));
      }

      // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
      // If we cannot (because there's an intervening non-const or unbounded
      // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
      // this instruction.
      Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
      if (auto *OldAI = dyn_cast<AllocaInst>(Val: Base)) {
        uint64_t SizeInBits =
            DL.getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
        migrateDebugInfo(OldAlloca: OldAI, /*IsSplit*/ true, OldAllocaOffsetInBits: Offset.getZExtValue() * 8,
                         SliceSizeInBits: SizeInBits, OldInst: AggStore, Inst: Store,
                         Dest: Store->getPointerOperand(), Value: Store->getValueOperand(),
                         DL);
      } else {
        assert(at::getDVRAssignmentMarkers(Store).empty() &&
               "AT: unexpected debug.assign linked to store through "
               "unbounded GEP");
      }
      LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
    }
  };

  /// Split a simple aggregate store into per-component scalar stores.
  bool visitStoreInst(StoreInst &SI) {
    if (!SI.isSimple() || SI.getPointerOperand() != *U)
      return false;
    Value *V = SI.getValueOperand();
    if (V->getType()->isSingleValueType())
      return false;

    // We have an aggregate being stored, split it apart.
    LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
    StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
                             getAdjustedAlignment(I: &SI, Offset: 0), DL, IRB);
    Splitter.emitSplitOps(Ty: V->getType(), Agg&: V, Name: V->getName() + ".fca");
    Visited.erase(Ptr: &SI);
    // The stores replacing SI each have markers describing fragments of the
    // assignment so delete the assignment markers linked to SI.
    at::deleteAssignmentMarkers(Inst: &SI);
    SI.eraseFromParent();
    return true;
  }

  bool visitBitCastInst(BitCastInst &BC) {
    // Pointer-transparent; keep walking users.
    enqueueUsers(I&: BC);
    return false;
  }

  bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
    // Pointer-transparent; keep walking users.
    enqueueUsers(I&: ASC);
    return false;
  }

  // Unfold gep (select cond, ptr1, ptr2), idx
  //   => select cond, gep(ptr1, idx), gep(ptr2, idx)
  // and  gep ptr, (select cond, idx1, idx2)
  //   => select cond, gep(ptr, idx1), gep(ptr, idx2)
  // We also allow for i1 zext indices, which are equivalent to selects.
  bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
    // Check whether the GEP has exactly one select operand and all indices
    // will become constant after the transform.
    Instruction *Sel = dyn_cast<SelectInst>(Val: GEPI.getPointerOperand());
    for (Value *Op : GEPI.indices()) {
      if (auto *SI = dyn_cast<SelectInst>(Val: Op)) {
        // Bail if we already found a select/zext: at most one is unfolded.
        if (Sel)
          return false;

        Sel = SI;
        if (!isa<ConstantInt>(Val: SI->getTrueValue()) ||
            !isa<ConstantInt>(Val: SI->getFalseValue()))
          return false;
        continue;
      }
      if (auto *ZI = dyn_cast<ZExtInst>(Val: Op)) {
        if (Sel)
          return false;
        Sel = ZI;
        // Only zext of an i1 behaves like a select between 0 and 1.
        if (!ZI->getSrcTy()->isIntegerTy(Bitwidth: 1))
          return false;
        continue;
      }

      if (!isa<ConstantInt>(Val: Op))
        return false;
    }

    if (!Sel)
      return false;

    LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
               dbgs() << " original: " << *Sel << "\n";
               dbgs() << " " << GEPI << "\n";);

    // Clone the GEP operand list with the select replaced by one of its arms.
    auto GetNewOps = [&](Value *SelOp) {
      SmallVector<Value *> NewOps;
      for (Value *Op : GEPI.operands())
        if (Op == Sel)
          NewOps.push_back(Elt: SelOp);
        else
          NewOps.push_back(Elt: Op);
      return NewOps;
    };

    Value *Cond, *True, *False;
    Instruction *MDFrom = nullptr;
    if (auto *SI = dyn_cast<SelectInst>(Val: Sel)) {
      Cond = SI->getCondition();
      True = SI->getTrueValue();
      False = SI->getFalseValue();
      if (!ProfcheckDisableMetadataFixes)
        MDFrom = SI;
    } else {
      // The zext-of-i1 case: model it as a select between 1 and 0.
      Cond = Sel->getOperand(i: 0);
      True = ConstantInt::get(Ty: Sel->getType(), V: 1);
      False = ConstantInt::get(Ty: Sel->getType(), V: 0);
    }
    SmallVector<Value *> TrueOps = GetNewOps(True);
    SmallVector<Value *> FalseOps = GetNewOps(False);

    IRB.SetInsertPoint(&GEPI);
    GEPNoWrapFlags NW = GEPI.getNoWrapFlags();

    Type *Ty = GEPI.getSourceElementType();
    Value *NTrue = IRB.CreateGEP(Ty, Ptr: TrueOps[0], IdxList: ArrayRef(TrueOps).drop_front(),
                                 Name: True->getName() + ".sroa.gep", NW);

    Value *NFalse =
        IRB.CreateGEP(Ty, Ptr: FalseOps[0], IdxList: ArrayRef(FalseOps).drop_front(),
                      Name: False->getName() + ".sroa.gep", NW);

    Value *NSel = MDFrom
                      ? IRB.CreateSelect(C: Cond, True: NTrue, False: NFalse,
                                         Name: Sel->getName() + ".sroa.sel", MDFrom)
                      : IRB.CreateSelectWithUnknownProfile(
                            C: Cond, True: NTrue, False: NFalse, DEBUG_TYPE,
                            Name: Sel->getName() + ".sroa.sel");
    Visited.erase(Ptr: &GEPI);
    GEPI.replaceAllUsesWith(V: NSel);
    GEPI.eraseFromParent();
    Instruction *NSelI = cast<Instruction>(Val: NSel);
    Visited.insert(Ptr: NSelI);
    enqueueUsers(I&: *NSelI);

    LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
               dbgs() << " " << *NFalse << "\n";
               dbgs() << " " << *NSel << "\n";);

    return true;
  }

  // Unfold gep (phi ptr1, ptr2), idx
  //   => phi ((gep ptr1, idx), (gep ptr2, idx))
  // and  gep ptr, (phi idx1, idx2)
  //   => phi ((gep ptr, idx1), (gep ptr, idx2))
  bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
    // To prevent infinitely expanding recursive phis, bail if the GEP pointer
    // operand (looking through the phi if it is the phi we want to unfold) is
    // an instruction besides a static alloca.
    PHINode *Phi = dyn_cast<PHINode>(Val: GEPI.getPointerOperand());
    auto IsInvalidPointerOperand = [](Value *V) {
      if (!isa<Instruction>(Val: V))
        return false;
      if (auto *AI = dyn_cast<AllocaInst>(Val: V))
        return !AI->isStaticAlloca();
      return true;
    };
    if (Phi) {
      if (any_of(Range: Phi->operands(), P: IsInvalidPointerOperand))
        return false;
    } else {
      if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
        return false;
    }
    // Check whether the GEP has exactly one phi operand (including the pointer
    // operand) and all indices will become constant after the transform.
    for (Value *Op : GEPI.indices()) {
      if (auto *SI = dyn_cast<PHINode>(Val: Op)) {
        if (Phi)
          return false;

        Phi = SI;
        if (!all_of(Range: Phi->incoming_values(),
                    P: [](Value *V) { return isa<ConstantInt>(Val: V); }))
          return false;
        continue;
      }

      if (!isa<ConstantInt>(Val: Op))
        return false;
    }

    if (!Phi)
      return false;

    LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
               dbgs() << " original: " << *Phi << "\n";
               dbgs() << " " << GEPI << "\n";);

    // Clone the GEP operand list with the phi replaced by one incoming value.
    auto GetNewOps = [&](Value *PhiOp) {
      SmallVector<Value *> NewOps;
      for (Value *Op : GEPI.operands())
        if (Op == Phi)
          NewOps.push_back(Elt: PhiOp);
        else
          NewOps.push_back(Elt: Op);
      return NewOps;
    };

    IRB.SetInsertPoint(Phi);
    PHINode *NewPhi = IRB.CreatePHI(Ty: GEPI.getType(), NumReservedValues: Phi->getNumIncomingValues(),
                                    Name: Phi->getName() + ".sroa.phi");

    Type *SourceTy = GEPI.getSourceElementType();
    // We only handle arguments, constants, and static allocas here, so we can
    // insert GEPs at the end of the entry block.
    IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
    for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
      Value *Op = Phi->getIncomingValue(i: I);
      BasicBlock *BB = Phi->getIncomingBlock(i: I);
      Value *NewGEP;
      if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
        // Reuse the GEP already created for this predecessor block.
        NewGEP = NewPhi->getIncomingValue(i: NI);
      } else {
        SmallVector<Value *> NewOps = GetNewOps(Op);
        NewGEP =
            IRB.CreateGEP(Ty: SourceTy, Ptr: NewOps[0], IdxList: ArrayRef(NewOps).drop_front(),
                          Name: Phi->getName() + ".sroa.gep", NW: GEPI.getNoWrapFlags());
      }
      NewPhi->addIncoming(V: NewGEP, BB);
    }

    Visited.erase(Ptr: &GEPI);
    GEPI.replaceAllUsesWith(V: NewPhi);
    GEPI.eraseFromParent();
    Visited.insert(Ptr: NewPhi);
    enqueueUsers(I&: *NewPhi);

    LLVM_DEBUG(dbgs() << " to: ";
               for (Value *In
                    : NewPhi->incoming_values()) dbgs()
               << "\n " << *In;
               dbgs() << "\n " << *NewPhi << '\n');

    return true;
  }

  /// Try to unfold a GEP of a select or phi; otherwise keep walking users.
  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    if (unfoldGEPSelect(GEPI))
      return true;

    if (unfoldGEPPhi(GEPI))
      return true;

    enqueueUsers(I&: GEPI);
    return false;
  }

  bool visitPHINode(PHINode &PN) {
    // Pointer-transparent; keep walking users.
    enqueueUsers(I&: PN);
    return false;
  }

  bool visitSelectInst(SelectInst &SI) {
    // Pointer-transparent; keep walking users.
    enqueueUsers(I&: SI);
    return false;
  }
};
4480
4481} // end anonymous namespace
4482
4483/// Strip aggregate type wrapping.
4484///
4485/// This removes no-op aggregate types wrapping an underlying type. It will
4486/// strip as many layers of types as it can without changing either the type
4487/// size or the allocated size.
4488static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4489 if (Ty->isSingleValueType())
4490 return Ty;
4491
4492 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4493 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4494
4495 Type *InnerTy;
4496 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Val: Ty)) {
4497 InnerTy = ArrTy->getElementType();
4498 } else if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
4499 const StructLayout *SL = DL.getStructLayout(Ty: STy);
4500 unsigned Index = SL->getElementContainingOffset(FixedOffset: 0);
4501 InnerTy = STy->getElementType(N: Index);
4502 } else {
4503 return Ty;
4504 }
4505
4506 if (AllocSize > DL.getTypeAllocSize(Ty: InnerTy).getFixedValue() ||
4507 TypeSize > DL.getTypeSizeInBits(Ty: InnerTy).getFixedValue())
4508 return Ty;
4509
4510 return stripAggregateTypeWrapping(DL, Ty: InnerTy);
4511}
4512
/// Try to find a partition of the aggregate type passed in for a given
/// offset and size.
///
/// This recurses through the aggregate type and tries to compute a subtype
/// based on the offset and size. When the offset and size span a sub-section
/// of an array, it will even compute a new array type for that sub-section,
/// and the same for structs.
///
/// Note that this routine is very strict and tries to find a partition of the
/// type which produces the *exact* right offset and size. It is not forgiving
/// when the size or offset cause either end of type-based partition to be off.
/// Also, this is a best-effort routine. It is reasonable to give up and not
/// return a type if necessary.
///
/// \returns the partition type, or nullptr when no exact match exists.
static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
                              uint64_t Size) {
  // Exact match for the whole type: just peel no-op aggregate wrappers.
  if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
    return stripAggregateTypeWrapping(DL, Ty);
  // Give up if the requested range does not fit inside the type.
  if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
      (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
    return nullptr;

  if (isa<ArrayType>(Val: Ty) || isa<VectorType>(Val: Ty)) {
    Type *ElementTy;
    uint64_t TyNumElements;
    if (auto *AT = dyn_cast<ArrayType>(Val: Ty)) {
      ElementTy = AT->getElementType();
      TyNumElements = AT->getNumElements();
    } else {
      // FIXME: This isn't right for vectors with non-byte-sized or
      // non-power-of-two sized elements.
      auto *VT = cast<FixedVectorType>(Val: Ty);
      ElementTy = VT->getElementType();
      TyNumElements = VT->getNumElements();
    }
    uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
    uint64_t NumSkippedElements = Offset / ElementSize;
    if (NumSkippedElements >= TyNumElements)
      return nullptr;
    Offset -= NumSkippedElements * ElementSize;

    // First check if we need to recurse.
    if (Offset > 0 || Size < ElementSize) {
      // Bail if the partition ends in a different array element.
      if ((Offset + Size) > ElementSize)
        return nullptr;
      // Recurse through the element type trying to peel off offset bytes.
      return getTypePartition(DL, Ty: ElementTy, Offset, Size);
    }
    assert(Offset == 0);

    if (Size == ElementSize)
      return stripAggregateTypeWrapping(DL, Ty: ElementTy);
    assert(Size > ElementSize);
    uint64_t NumElements = Size / ElementSize;
    // Only an exact whole number of elements forms a valid sub-array.
    if (NumElements * ElementSize != Size)
      return nullptr;
    return ArrayType::get(ElementType: ElementTy, NumElements);
  }

  StructType *STy = dyn_cast<StructType>(Val: Ty);
  if (!STy)
    return nullptr;

  const StructLayout *SL = DL.getStructLayout(Ty: STy);

  // Scalable-sized structs cannot be partitioned by fixed byte offsets.
  if (SL->getSizeInBits().isScalable())
    return nullptr;

  if (Offset >= SL->getSizeInBytes())
    return nullptr;
  uint64_t EndOffset = Offset + Size;
  if (EndOffset > SL->getSizeInBytes())
    return nullptr;

  unsigned Index = SL->getElementContainingOffset(FixedOffset: Offset);
  Offset -= SL->getElementOffset(Idx: Index);

  Type *ElementTy = STy->getElementType(N: Index);
  uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
  if (Offset >= ElementSize)
    return nullptr; // The offset points into alignment padding.

  // See if any partition must be contained by the element.
  if (Offset > 0 || Size < ElementSize) {
    if ((Offset + Size) > ElementSize)
      return nullptr;
    return getTypePartition(DL, Ty: ElementTy, Offset, Size);
  }
  assert(Offset == 0);

  if (Size == ElementSize)
    return stripAggregateTypeWrapping(DL, Ty: ElementTy);

  // The range spans multiple elements; try to cover [Index, EndIndex) exactly.
  StructType::element_iterator EI = STy->element_begin() + Index,
                               EE = STy->element_end();
  if (EndOffset < SL->getSizeInBytes()) {
    unsigned EndIndex = SL->getElementContainingOffset(FixedOffset: EndOffset);
    if (Index == EndIndex)
      return nullptr; // Within a single element and its padding.

    // Don't try to form "natural" types if the elements don't line up with the
    // expected size.
    // FIXME: We could potentially recurse down through the last element in the
    // sub-struct to find a natural end point.
    if (SL->getElementOffset(Idx: EndIndex) != EndOffset)
      return nullptr;

    assert(Index < EndIndex);
    EE = STy->element_begin() + EndIndex;
  }

  // Try to build up a sub-structure.
  StructType *SubTy =
      StructType::get(Context&: STy->getContext(), Elements: ArrayRef(EI, EE), isPacked: STy->isPacked());
  const StructLayout *SubSL = DL.getStructLayout(Ty: SubTy);
  if (Size != SubSL->getSizeInBytes())
    return nullptr; // The sub-struct doesn't have quite the size needed.

  return SubTy;
}
4633
4634/// Pre-split loads and stores to simplify rewriting.
4635///
4636/// We want to break up the splittable load+store pairs as much as
4637/// possible. This is important to do as a preprocessing step, as once we
4638/// start rewriting the accesses to partitions of the alloca we lose the
4639/// necessary information to correctly split apart paired loads and stores
4640/// which both point into this alloca. The case to consider is something like
4641/// the following:
4642///
4643/// %a = alloca [12 x i8]
4644/// %gep1 = getelementptr i8, ptr %a, i32 0
4645/// %gep2 = getelementptr i8, ptr %a, i32 4
4646/// %gep3 = getelementptr i8, ptr %a, i32 8
4647/// store float 0.0, ptr %gep1
4648/// store float 1.0, ptr %gep2
4649/// %v = load i64, ptr %gep1
4650/// store i64 %v, ptr %gep2
4651/// %f1 = load float, ptr %gep2
4652/// %f2 = load float, ptr %gep3
4653///
4654/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4655/// promote everything so we recover the 2 SSA values that should have been
4656/// there all along.
4657///
4658/// \returns true if any changes are made.
bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
  LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");

  // Track the loads and stores which are candidates for pre-splitting here, in
  // the order they first appear during the partition scan. These give stable
  // iteration order and a basis for tracking which loads and stores we
  // actually split.
  SmallVector<LoadInst *, 4> Loads;
  SmallVector<StoreInst *, 4> Stores;

  // We need to accumulate the splits required of each load or store where we
  // can find them via a direct lookup. This is important to cross-check loads
  // and stores against each other. We also track the slice so that we can kill
  // all the slices that end up split.
  struct SplitOffsets {
    Slice *S;
    std::vector<uint64_t> Splits;
  };
  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;

  // Track loads out of this alloca which cannot, for any reason, be pre-split.
  // This is important as we also cannot pre-split stores of those loads!
  // FIXME: This is all pretty gross. It means that we can be more aggressive
  // in pre-splitting when the load feeding the store happens to come from
  // a separate alloca. Put another way, the effectiveness of SROA would be
  // decreased by a frontend which just concatenated all of its local allocas
  // into one big flat alloca. But defeating such patterns is exactly the job
  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
  // change store pre-splitting to actually force pre-splitting of the load
  // that feeds it *and all stores*. That makes pre-splitting much harder, but
  // maybe it would make it more principled?
  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;

  LLVM_DEBUG(dbgs() << "  Searching for candidate loads and stores\n");
  for (auto &P : AS.partitions()) {
    for (Slice &S : P) {
      Instruction *I = cast<Instruction>(Val: S.getUse()->getUser());
      if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
        // If this is a load we have to track that it can't participate in any
        // pre-splitting. If this is a store of a load we have to track that
        // that load also can't participate in any pre-splitting.
        if (auto *LI = dyn_cast<LoadInst>(Val: I))
          UnsplittableLoads.insert(Ptr: LI);
        else if (auto *SI = dyn_cast<StoreInst>(Val: I))
          if (auto *LI = dyn_cast<LoadInst>(Val: SI->getValueOperand()))
            UnsplittableLoads.insert(Ptr: LI);
        continue;
      }
      assert(P.endOffset() > S.beginOffset() &&
             "Empty or backwards partition!");

      // Determine if this is a pre-splittable slice.
      if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
        assert(!LI->isVolatile() && "Cannot split volatile loads!");

        // The load must be used exclusively to store into other pointers for
        // us to be able to arbitrarily pre-split it. The stores must also be
        // simple to avoid changing semantics.
        auto IsLoadSimplyStored = [](LoadInst *LI) {
          for (User *LU : LI->users()) {
            auto *SI = dyn_cast<StoreInst>(Val: LU);
            if (!SI || !SI->isSimple())
              return false;
          }
          return true;
        };
        if (!IsLoadSimplyStored(LI)) {
          UnsplittableLoads.insert(Ptr: LI);
          continue;
        }

        Loads.push_back(Elt: LI);
      } else if (auto *SI = dyn_cast<StoreInst>(Val: I)) {
        if (S.getUse() != &SI->getOperandUse(i: SI->getPointerOperandIndex()))
          // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
          continue;
        auto *StoredLoad = dyn_cast<LoadInst>(Val: SI->getValueOperand());
        if (!StoredLoad || !StoredLoad->isSimple())
          continue;
        assert(!SI->isVolatile() && "Cannot split volatile stores!");

        Stores.push_back(Elt: SI);
      } else {
        // Other uses cannot be pre-split.
        continue;
      }

      // Record the initial split.
      LLVM_DEBUG(dbgs() << "    Candidate: " << *I << "\n");
      auto &Offsets = SplitOffsetsMap[I];
      assert(Offsets.Splits.empty() &&
             "Should not have splits the first time we see an instruction!");
      Offsets.S = &S;
      Offsets.Splits.push_back(x: P.endOffset() - S.beginOffset());
    }

    // Now scan the already split slices, and add a split for any of them which
    // we're going to pre-split.
    for (Slice *S : P.splitSliceTails()) {
      auto SplitOffsetsMapI =
          SplitOffsetsMap.find(Val: cast<Instruction>(Val: S->getUse()->getUser()));
      if (SplitOffsetsMapI == SplitOffsetsMap.end())
        continue;
      auto &Offsets = SplitOffsetsMapI->second;

      assert(Offsets.S == S && "Found a mismatched slice!");
      assert(!Offsets.Splits.empty() &&
             "Cannot have an empty set of splits on the second partition!");
      assert(Offsets.Splits.back() ==
                 P.beginOffset() - Offsets.S->beginOffset() &&
             "Previous split does not end where this one begins!");

      // Record each split. The last partition's end isn't needed as the size
      // of the slice dictates that.
      if (S->endOffset() > P.endOffset())
        Offsets.Splits.push_back(x: P.endOffset() - Offsets.S->beginOffset());
    }
  }

  // We may have split loads where some of their stores are split stores. For
  // such loads and stores, we can only pre-split them if their splits exactly
  // match relative to their starting offset. We have to verify this prior to
  // any rewriting.
  llvm::erase_if(C&: Stores, P: [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
    // Lookup the load we are storing in our map of split
    // offsets.
    auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
    // If it was completely unsplittable, then we're done,
    // and this store can't be pre-split.
    if (UnsplittableLoads.count(Ptr: LI))
      return true;

    auto LoadOffsetsI = SplitOffsetsMap.find(Val: LI);
    if (LoadOffsetsI == SplitOffsetsMap.end())
      return false; // Unrelated loads are definitely safe.
    auto &LoadOffsets = LoadOffsetsI->second;

    // Now lookup the store's offsets.
    auto &StoreOffsets = SplitOffsetsMap[SI];

    // If the relative offsets of each split in the load and
    // store match exactly, then we can split them and we
    // don't need to remove them here.
    if (LoadOffsets.Splits == StoreOffsets.Splits)
      return false;

    LLVM_DEBUG(dbgs() << "    Mismatched splits for load and store:\n"
                      << "      " << *LI << "\n"
                      << "      " << *SI << "\n");

    // We've found a store and load that we need to split
    // with mismatched relative splits. Just give up on them
    // and remove both instructions from our list of
    // candidates.
    UnsplittableLoads.insert(Ptr: LI);
    return true;
  });
  // Now we have to go *back* through all the stores, because a later store may
  // have caused an earlier store's load to become unsplittable and if it is
  // unsplittable for the later store, then we can't rely on it being split in
  // the earlier store either.
  llvm::erase_if(C&: Stores, P: [&UnsplittableLoads](StoreInst *SI) {
    auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
    return UnsplittableLoads.count(Ptr: LI);
  });
  // Once we've established all the loads that can't be split for some reason,
  // filter any that made it into our list out.
  llvm::erase_if(C&: Loads, P: [&UnsplittableLoads](LoadInst *LI) {
    return UnsplittableLoads.count(Ptr: LI);
  });

  // If no loads or stores are left, there is no pre-splitting to be done for
  // this alloca.
  if (Loads.empty() && Stores.empty())
    return false;

  // From here on, we can't fail and will be building new accesses, so rig up
  // an IR builder.
  IRBuilderTy IRB(&AI);

  // Collect the new slices which we will merge into the alloca slices.
  SmallVector<Slice, 4> NewSlices;

  // Track any allocas we end up splitting loads and stores for so we iterate
  // on them.
  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;

  // At this point, we have collected all of the loads and stores we can
  // pre-split, and the specific splits needed for them. We actually do the
  // splitting in a specific order to correctly handle the case when one of
  // the loads is the value operand to one of the stores.
  //
  // First, we rewrite all of the split loads, and just accumulate each split
  // load in a parallel structure. We also build the slices for them and append
  // them to the alloca slices.
  SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
  std::vector<LoadInst *> SplitLoads;
  const DataLayout &DL = AI.getDataLayout();
  for (LoadInst *LI : Loads) {
    SplitLoads.clear();

    auto &Offsets = SplitOffsetsMap[LI];
    unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
    assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
           "Load must have type size equal to store size");
    assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
           "Load must be >= slice size");

    uint64_t BaseOffset = Offsets.S->beginOffset();
    assert(BaseOffset + SliceSize > BaseOffset &&
           "Cannot represent alloca access size using 64-bit integers!");

    Instruction *BasePtr = cast<Instruction>(Val: LI->getPointerOperand());
    IRB.SetInsertPoint(LI);

    LLVM_DEBUG(dbgs() << "  Splitting load: " << *LI << "\n");

    // Walk the split points, emitting one narrow integer load per piece.
    // Split offsets are relative to the slice start; the final piece ends at
    // SliceSize.
    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
    int Idx = 0, Size = Offsets.Splits.size();
    for (;;) {
      auto *PartTy = Type::getIntNTy(C&: LI->getContext(), N: PartSize * 8);
      auto AS = LI->getPointerAddressSpace();
      auto *PartPtrTy = LI->getPointerOperandType();
      LoadInst *PLoad = IRB.CreateAlignedLoad(
          Ty: PartTy,
          Ptr: getAdjustedPtr(IRB, DL, Ptr: BasePtr,
                         Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
                         PointerTy: PartPtrTy, NamePrefix: BasePtr->getName() + "."),
          Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
          /*IsVolatile*/ isVolatile: false, Name: LI->getName());
      PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
                                LLVMContext::MD_access_group});

      // Append this load onto the list of split loads so we can find it later
      // to rewrite the stores.
      SplitLoads.push_back(x: PLoad);

      // Now build a new slice for the alloca.
      NewSlices.push_back(
          Elt: Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                &PLoad->getOperandUse(i: PLoad->getPointerOperandIndex()),
                /*IsSplittable*/ false, nullptr));
      LLVM_DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                        << ", " << NewSlices.back().endOffset()
                        << "): " << *PLoad << "\n");

      // See if we've handled all the splits.
      if (Idx >= Size)
        break;

      // Setup the next partition.
      PartOffset = Offsets.Splits[Idx];
      ++Idx;
      PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
    }

    // Now that we have the split loads, do the slow walk over all uses of the
    // load and rewrite them as split stores, or save the split loads to use
    // below if the store is going to be split there anyways.
    bool DeferredStores = false;
    for (User *LU : LI->users()) {
      StoreInst *SI = cast<StoreInst>(Val: LU);
      if (!Stores.empty() && SplitOffsetsMap.count(Val: SI)) {
        DeferredStores = true;
        LLVM_DEBUG(dbgs() << "    Deferred splitting of store: " << *SI
                          << "\n");
        continue;
      }

      Value *StoreBasePtr = SI->getPointerOperand();
      IRB.SetInsertPoint(SI);
      AAMDNodes AATags = SI->getAAMetadata();

      LLVM_DEBUG(dbgs() << "    Splitting store of load: " << *SI << "\n");

      for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
        LoadInst *PLoad = SplitLoads[Idx];
        uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
        auto *PartPtrTy = SI->getPointerOperandType();

        auto AS = SI->getPointerAddressSpace();
        StoreInst *PStore = IRB.CreateAlignedStore(
            Val: PLoad,
            Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
                           Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
                           PointerTy: PartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
            Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
            /*IsVolatile*/ isVolatile: false);
        PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
                                   LLVMContext::MD_access_group,
                                   LLVMContext::MD_DIAssignID});

        if (AATags)
          PStore->setAAMetadata(
              AATags.adjustForAccess(Offset: PartOffset, AccessTy: PLoad->getType(), DL));
        LLVM_DEBUG(dbgs() << "      +" << PartOffset << ":" << *PStore << "\n");
      }

      // We want to immediately iterate on any allocas impacted by splitting
      // this store, and we have to track any promotable alloca (indicated by
      // a direct store) as needing to be resplit because it is no longer
      // promotable.
      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: StoreBasePtr)) {
        ResplitPromotableAllocas.insert(Ptr: OtherAI);
        Worklist.insert(X: OtherAI);
      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
                     Val: StoreBasePtr->stripInBoundsOffsets())) {
        Worklist.insert(X: OtherAI);
      }

      // Mark the original store as dead.
      DeadInsts.push_back(Elt: SI);
    }

    // Save the split loads if there are deferred stores among the users.
    if (DeferredStores)
      SplitLoadsMap.insert(KV: std::make_pair(x&: LI, y: std::move(SplitLoads)));

    // Mark the original load as dead and kill the original slice.
    DeadInsts.push_back(Elt: LI);
    Offsets.S->kill();
  }

  // Second, we rewrite all of the split stores. At this point, we know that
  // all loads from this alloca have been split already. For stores of such
  // loads, we can simply look up the pre-existing split loads. For stores of
  // other loads, we split those loads first and then write split stores of
  // them.
  for (StoreInst *SI : Stores) {
    auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
    IntegerType *Ty = cast<IntegerType>(Val: LI->getType());
    assert(Ty->getBitWidth() % 8 == 0);
    uint64_t StoreSize = Ty->getBitWidth() / 8;
    assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");

    auto &Offsets = SplitOffsetsMap[SI];
    assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
           "Slice size should always match load size exactly!");
    uint64_t BaseOffset = Offsets.S->beginOffset();
    assert(BaseOffset + StoreSize > BaseOffset &&
           "Cannot represent alloca access size using 64-bit integers!");

    Value *LoadBasePtr = LI->getPointerOperand();
    Instruction *StoreBasePtr = cast<Instruction>(Val: SI->getPointerOperand());

    LLVM_DEBUG(dbgs() << "  Splitting store: " << *SI << "\n");

    // Check whether we have an already split load.
    auto SplitLoadsMapI = SplitLoadsMap.find(Val: LI);
    std::vector<LoadInst *> *SplitLoads = nullptr;
    if (SplitLoadsMapI != SplitLoadsMap.end()) {
      SplitLoads = &SplitLoadsMapI->second;
      assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
             "Too few split loads for the number of splits in the store!");
    } else {
      LLVM_DEBUG(dbgs() << "          of load: " << *LI << "\n");
    }

    // Walk the split points, emitting one narrow load (unless we can reuse a
    // pre-split load from above) and one narrow store per piece.
    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
    int Idx = 0, Size = Offsets.Splits.size();
    for (;;) {
      auto *PartTy = Type::getIntNTy(C&: Ty->getContext(), N: PartSize * 8);
      auto *LoadPartPtrTy = LI->getPointerOperandType();
      auto *StorePartPtrTy = SI->getPointerOperandType();

      // Either lookup a split load or create one.
      LoadInst *PLoad;
      if (SplitLoads) {
        PLoad = (*SplitLoads)[Idx];
      } else {
        IRB.SetInsertPoint(LI);
        auto AS = LI->getPointerAddressSpace();
        PLoad = IRB.CreateAlignedLoad(
            Ty: PartTy,
            Ptr: getAdjustedPtr(IRB, DL, Ptr: LoadBasePtr,
                           Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
                           PointerTy: LoadPartPtrTy, NamePrefix: LoadBasePtr->getName() + "."),
            Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
            /*IsVolatile*/ isVolatile: false, Name: LI->getName());
        PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
                                  LLVMContext::MD_access_group});
      }

      // And store this partition.
      IRB.SetInsertPoint(SI);
      auto AS = SI->getPointerAddressSpace();
      StoreInst *PStore = IRB.CreateAlignedStore(
          Val: PLoad,
          Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
                         Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
                         PointerTy: StorePartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
          Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
          /*IsVolatile*/ isVolatile: false);
      PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
                                 LLVMContext::MD_access_group});

      // Now build a new slice for the alloca.
      // ProtectedFieldDisc==nullptr is a lie, but it doesn't matter because we
      // already determined that all accesses are consistent.
      NewSlices.push_back(
          Elt: Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                &PStore->getOperandUse(i: PStore->getPointerOperandIndex()),
                /*IsSplittable*/ false, nullptr));
      LLVM_DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                        << ", " << NewSlices.back().endOffset()
                        << "): " << *PStore << "\n");
      if (!SplitLoads) {
        LLVM_DEBUG(dbgs() << "      of split load: " << *PLoad << "\n");
      }

      // See if we've finished all the splits.
      if (Idx >= Size)
        break;

      // Setup the next partition.
      PartOffset = Offsets.Splits[Idx];
      ++Idx;
      PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
    }

    // We want to immediately iterate on any allocas impacted by splitting
    // this load, which is only relevant if it isn't a load of this alloca and
    // thus we didn't already split the loads above. We also have to keep track
    // of any promotable allocas we split loads on as they can no longer be
    // promoted.
    if (!SplitLoads) {
      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: LoadBasePtr)) {
        assert(OtherAI != &AI && "We can't re-split our own alloca!");
        ResplitPromotableAllocas.insert(Ptr: OtherAI);
        Worklist.insert(X: OtherAI);
      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
                     Val: LoadBasePtr->stripInBoundsOffsets())) {
        assert(OtherAI != &AI && "We can't re-split our own alloca!");
        Worklist.insert(X: OtherAI);
      }
    }

    // Mark the original store as dead now that we've split it up and kill its
    // slice. Note that we leave the original load in place unless this store
    // was its only use. It may in turn be split up if it is an alloca load
    // for some other alloca, but it may be a normal load. This may introduce
    // redundant loads, but where those can be merged the rest of the optimizer
    // should handle the merging, and this uncovers SSA splits which is more
    // important. In practice, the original loads will almost always be fully
    // split and removed eventually, and the splits will be merged by any
    // trivial CSE, including instcombine.
    if (LI->hasOneUse()) {
      assert(*LI->user_begin() == SI && "Single use isn't this store!");
      DeadInsts.push_back(Elt: LI);
    }
    DeadInsts.push_back(Elt: SI);
    Offsets.S->kill();
  }

  // Remove the killed slices that have been pre-split.
  llvm::erase_if(C&: AS, P: [](const Slice &S) { return S.isDead(); });

  // Insert our new slices. This will sort and merge them into the sorted
  // sequence.
  AS.insert(NewSlices);

  LLVM_DEBUG(dbgs() << "  Pre-split slices:\n");
#ifndef NDEBUG
  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
    LLVM_DEBUG(AS.print(dbgs(), I, "    "));
#endif

  // Finally, don't try to promote any allocas that now require re-splitting.
  // They have already been added to the worklist above.
  PromotableAllocas.set_subtract(ResplitPromotableAllocas);

  return true;
}
5132
5133/// Select a partition type for an alloca partition.
5134///
5135/// Try to compute a friendly type for this partition of the alloca. This
5136/// won't always succeed, in which case we fall back to a legal integer type
5137/// or an i8 array of an appropriate size.
5138///
5139/// \returns A tuple with the following elements:
5140/// - PartitionType: The computed type for this partition.
5141/// - IsIntegerWideningViable: True if integer widening promotion is used.
5142/// - VectorType: The vector type if vector promotion is used, otherwise
5143/// nullptr.
static std::tuple<Type *, bool, VectorType *>
selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI,
                    LLVMContext &C) {
  // First check if the partition is viable for vector promotion.
  //
  // We prefer vector promotion over integer widening promotion when:
  // - The vector element type is a floating-point type.
  // - All the loads/stores to the alloca are vector loads/stores to the
  //   entire alloca or load/store a single element of the vector.
  //
  // Otherwise when there is an integer vector with mixed type loads/stores we
  // prefer integer widening promotion because it's more likely the user is
  // doing bitwise arithmetic and we generate better code.
  VectorType *VecTy =
      isVectorPromotionViable(P, DL, VScale: AI.getFunction()->getVScaleValue());
  // If the vector element type is a floating-point type, we prefer vector
  // promotion. If the vector has one element, let the below code select
  // whether we promote with the vector or scalar.
  if (VecTy && VecTy->getElementType()->isFloatingPointTy() &&
      VecTy->getElementCount().getFixedValue() > 1)
    return {VecTy, false, VecTy};

  // Check if there is a common type that all slices of the partition use that
  // spans the partition.
  auto [CommonUseTy, LargestIntTy] =
      findCommonType(B: P.begin(), E: P.end(), EndOffset: P.endOffset());
  if (CommonUseTy) {
    TypeSize CommonUseSize = DL.getTypeAllocSize(Ty: CommonUseTy);
    // Only usable if the common type is fixed-size and covers the partition.
    if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
      // We prefer vector promotion here because if vector promotion is viable
      // and there is a common type used, then it implies the second listed
      // condition for preferring vector promotion is true.
      if (VecTy)
        return {VecTy, false, VecTy};
      return {CommonUseTy, isIntegerWideningViable(P, AllocaTy: CommonUseTy, DL),
              nullptr};
    }
  }

  // Can we find an appropriate subtype in the original allocated
  // type?
  if (Type *TypePartitionTy = getTypePartition(DL, Ty: AI.getAllocatedType(),
                                               Offset: P.beginOffset(), Size: P.size())) {
    // If the partition is an integer array that can be spanned by a legal
    // integer type, prefer to represent it as a legal integer type because
    // it's more likely to be promotable.
    if (TypePartitionTy->isArrayTy() &&
        TypePartitionTy->getArrayElementType()->isIntegerTy() &&
        DL.isLegalInteger(Width: P.size() * 8))
      TypePartitionTy = Type::getIntNTy(C, N: P.size() * 8);
    // There was no common type used, so we prefer integer widening promotion.
    if (isIntegerWideningViable(P, AllocaTy: TypePartitionTy, DL))
      return {TypePartitionTy, true, nullptr};
    if (VecTy)
      return {VecTy, false, VecTy};
    // If we couldn't promote with TypePartitionTy, try with the largest
    // integer type used.
    if (LargestIntTy &&
        DL.getTypeAllocSize(Ty: LargestIntTy).getFixedValue() >= P.size() &&
        isIntegerWideningViable(P, AllocaTy: LargestIntTy, DL))
      return {LargestIntTy, true, nullptr};

    // Fallback to TypePartitionTy and we probably won't promote.
    return {TypePartitionTy, false, nullptr};
  }

  // Select the largest integer type used if it spans the partition.
  if (LargestIntTy &&
      DL.getTypeAllocSize(Ty: LargestIntTy).getFixedValue() >= P.size())
    return {LargestIntTy, false, nullptr};

  // Select a legal integer type if it spans the partition.
  if (DL.isLegalInteger(Width: P.size() * 8))
    return {Type::getIntNTy(C, N: P.size() * 8), false, nullptr};

  // Fallback to an i8 array.
  return {ArrayType::get(ElementType: Type::getInt8Ty(C), NumElements: P.size()), false, nullptr};
}
5222
5223/// Rewrite an alloca partition's users.
5224///
5225/// This routine drives both of the rewriting goals of the SROA pass. It tries
5226/// to rewrite uses of an alloca partition to be conducive for SSA value
5227/// promotion. If the partition needs a new, more refined alloca, this will
5228/// build that new alloca, preserving as much type information as possible, and
5229/// rewrite the uses of the old alloca to point at the new one and have the
5230/// appropriate new offsets. It also evaluates how successful the rewrite was
5231/// at enabling promotion and if it was successful queues the alloca to be
5232/// promoted.
std::pair<AllocaInst *, uint64_t>
SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) {
  const DataLayout &DL = AI.getDataLayout();
  // Select the type for the new alloca that spans the partition.
  auto [PartitionTy, IsIntegerWideningViable, VecTy] =
      selectPartitionType(P, DL, AI, C&: *C);

  // Check for the case where we're going to rewrite to a new alloca of the
  // exact same type as the original, and with the same access offsets. In that
  // case, re-use the existing alloca, but still run through the rewriter to
  // perform phi and select speculation.
  // P.beginOffset() can be non-zero even with the same type in a case with
  // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
  AllocaInst *NewAI;
  if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == 0) {
    NewAI = &AI;
    // FIXME: We should be able to bail at this point with "nothing changed".
    // FIXME: We might want to defer PHI speculation until after here.
    // FIXME: return nullptr;
  } else {
    // Make sure the alignment is compatible with P.beginOffset().
    const Align Alignment = commonAlignment(A: AI.getAlign(), Offset: P.beginOffset());
    // If we will get at least this much alignment from the type alone, leave
    // the alloca's alignment unconstrained.
    const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(Ty: PartitionTy);
    NewAI = new AllocaInst(
        PartitionTy, AI.getAddressSpace(), nullptr,
        IsUnconstrained ? DL.getPrefTypeAlign(Ty: PartitionTy) : Alignment,
        AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
        AI.getIterator());
    // Copy the old AI debug location over to the new one.
    NewAI->setDebugLoc(AI.getDebugLoc());
    ++NumNewAllocas;
  }

  LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
                    << "," << P.endOffset() << ") to: " << *NewAI << "\n");

  // Track the high watermark on the worklist as it is only relevant for
  // promoted allocas. We will reset it to this point if the alloca is not in
  // fact scheduled for promotion.
  unsigned PPWOldSize = PostPromotionWorklist.size();
  unsigned NumUses = 0;
  SmallSetVector<PHINode *, 8> PHIUsers;
  SmallSetVector<SelectInst *, 8> SelectUsers;

  AllocaSliceRewriter Rewriter(
      DL, AS, *this, AI, *NewAI, PartitionTy, P.beginOffset(), P.endOffset(),
      IsIntegerWideningViable, VecTy, PHIUsers, SelectUsers);
  bool Promotable = true;
  // Check whether we can have tree-structured merge.
  if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
    // The merge rewrite consumed the whole partition; count the merged value
    // plus everything it made dead.
    NumUses += DeletedValues->size() + 1;
    for (Value *V : *DeletedValues)
      DeadInsts.push_back(Elt: V);
  } else {
    // Otherwise rewrite every slice (split tails first, then whole slices)
    // through the standard rewriter.
    for (Slice *S : P.splitSliceTails()) {
      Promotable &= Rewriter.visit(I: S);
      ++NumUses;
    }
    for (Slice &S : P) {
      Promotable &= Rewriter.visit(I: &S);
      ++NumUses;
    }
  }

  NumAllocaPartitionUses += NumUses;
  MaxUsesPerAllocaPartition.updateMax(V: NumUses);

  // Now that we've processed all the slices in the new partition, check if any
  // PHIs or Selects would block promotion.
  for (PHINode *PHI : PHIUsers)
    if (!isSafePHIToSpeculate(PN&: *PHI)) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
      break;
    }

  SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
      NewSelectsToRewrite;
  NewSelectsToRewrite.reserve(N: SelectUsers.size());
  for (SelectInst *Sel : SelectUsers) {
    std::optional<RewriteableMemOps> Ops =
        isSafeSelectToSpeculate(SI&: *Sel, PreserveCFG);
    if (!Ops) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
      NewSelectsToRewrite.clear();
      break;
    }
    NewSelectsToRewrite.emplace_back(Args: std::make_pair(x&: Sel, y&: *Ops));
  }

  if (Promotable) {
    for (Use *U : AS.getDeadUsesIfPromotable()) {
      auto *OldInst = dyn_cast<Instruction>(Val: U->get());
      Value::dropDroppableUse(U&: *U);
      if (OldInst)
        if (isInstructionTriviallyDead(I: OldInst))
          DeadInsts.push_back(Elt: OldInst);
    }
    if (PHIUsers.empty() && SelectUsers.empty()) {
      // Promote the alloca.
      PromotableAllocas.insert(X: NewAI);
    } else {
      // If we have either PHIs or Selects to speculate, add them to those
      // worklists and re-queue the new alloca so that we promote it on the
      // next iteration.
      SpeculatablePHIs.insert_range(R&: PHIUsers);
      SelectsToRewrite.reserve(NumEntries: SelectsToRewrite.size() +
                               NewSelectsToRewrite.size());
      for (auto &&KV : llvm::make_range(
               x: std::make_move_iterator(i: NewSelectsToRewrite.begin()),
               y: std::make_move_iterator(i: NewSelectsToRewrite.end())))
        SelectsToRewrite.insert(KV: std::move(KV));
      Worklist.insert(X: NewAI);
    }
  } else {
    // Drop any post-promotion work items if promotion didn't happen.
    while (PostPromotionWorklist.size() > PPWOldSize)
      PostPromotionWorklist.pop_back();

    // We couldn't promote and we didn't create a new partition, nothing
    // happened.
    if (NewAI == &AI)
      return {nullptr, 0};

    // If we can't promote the alloca, iterate on it to check for new
    // refinements exposed by splitting the current alloca. Don't iterate on an
    // alloca which didn't actually change and didn't get promoted.
    Worklist.insert(X: NewAI);
  }

  return {NewAI, DL.getTypeSizeInBits(Ty: PartitionTy).getFixedValue()};
}
5370
5371// There isn't a shared interface to get the "address" parts out of a
5372// dbg.declare and dbg.assign, so provide some wrappers.
5373bool isKillAddress(const DbgVariableRecord *DVR) {
5374 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5375 return DVR->isKillAddress();
5376 return DVR->isKillLocation();
5377}
5378
5379const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5380 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5381 return DVR->getAddressExpression();
5382 return DVR->getExpression();
5383}
5384
5385/// Create or replace an existing fragment in a DIExpression with \p Frag.
5386/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5387/// operation, add \p BitExtractOffset to the offset part.
5388///
5389/// Returns the new expression, or nullptr if this fails (see details below).
5390///
5391/// This function is similar to DIExpression::createFragmentExpression except
5392/// for 3 important distinctions:
5393/// 1. The new fragment isn't relative to an existing fragment.
5394/// 2. It assumes the computed location is a memory location. This means we
5395/// don't need to perform checks that creating the fragment preserves the
5396/// expression semantics.
5397/// 3. Existing extract_bits are modified independently of fragment changes
5398/// using \p BitExtractOffset. A change to the fragment offset or size
5399/// may affect a bit extract. But a bit extract offset can change
5400/// independently of the fragment dimensions.
5401///
5402/// Returns the new expression, or nullptr if one couldn't be created.
5403/// Ideally this is only used to signal that a bit-extract has become
5404/// zero-sized (and thus the new debug record has no size and can be
5405/// dropped), however, it fails for other reasons too - see the FIXME below.
5406///
5407/// FIXME: To keep the change that introduces this function NFC it bails
/// in some situations unnecessarily, e.g. when fragment and bit extract
5409/// sizes differ.
5410static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5411 DIExpression::FragmentInfo Frag,
5412 int64_t BitExtractOffset) {
5413 SmallVector<uint64_t, 8> Ops;
5414 bool HasFragment = false;
5415 bool HasBitExtract = false;
5416
5417 for (auto &Op : Expr->expr_ops()) {
5418 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5419 HasFragment = true;
5420 continue;
5421 }
5422 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5423 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5424 HasBitExtract = true;
5425 int64_t ExtractOffsetInBits = Op.getArg(I: 0);
5426 int64_t ExtractSizeInBits = Op.getArg(I: 1);
5427
5428 // DIExpression::createFragmentExpression doesn't know how to handle
5429 // a fragment that is smaller than the extract. Copy the behaviour
5430 // (bail) to avoid non-NFC changes.
5431 // FIXME: Don't do this.
5432 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5433 return nullptr;
5434
5435 assert(BitExtractOffset <= 0);
5436 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5437
5438 // DIExpression::createFragmentExpression doesn't know what to do
5439 // if the new extract starts "outside" the existing one. Copy the
5440 // behaviour (bail) to avoid non-NFC changes.
5441 // FIXME: Don't do this.
5442 if (AdjustedOffset < 0)
5443 return nullptr;
5444
5445 Ops.push_back(Elt: Op.getOp());
5446 Ops.push_back(Elt: std::max<int64_t>(a: 0, b: AdjustedOffset));
5447 Ops.push_back(Elt: ExtractSizeInBits);
5448 continue;
5449 }
5450 Op.appendToVector(V&: Ops);
5451 }
5452
5453 // Unsupported by createFragmentExpression, so don't support it here yet to
5454 // preserve NFC-ness.
5455 if (HasFragment && HasBitExtract)
5456 return nullptr;
5457
5458 if (!HasBitExtract) {
5459 Ops.push_back(Elt: dwarf::DW_OP_LLVM_fragment);
5460 Ops.push_back(Elt: Frag.OffsetInBits);
5461 Ops.push_back(Elt: Frag.SizeInBits);
5462 }
5463 return DIExpression::get(Context&: Expr->getContext(), Elements: Ops);
5464}
5465
/// Insert a new DbgRecord.
/// \p Orig Original to copy record type, debug loc and variable from, and
///     additionally value and value expression for dbg_assign records.
/// \p NewAddr Location's new base address.
/// \p NewAddrExpr New expression to apply to address.
/// \p BeforeInst Insert position.
/// \p NewFragment New fragment (absolute, non-relative).
/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
static void
insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
                 DIExpression *NewAddrExpr, Instruction *BeforeInst,
                 std::optional<DIExpression::FragmentInfo> NewFragment,
                 int64_t BitExtractAdjustment) {
  // Unused in this overload; cast silences the unused-parameter warning.
  (void)DIB;

  // A dbg_assign puts fragment info in the value expression only. The address
  // expression has already been built: NewAddrExpr. A dbg_declare puts the
  // new fragment info into NewAddrExpr (as it only has one expression).
  DIExpression *NewFragmentExpr =
      Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
  if (NewFragment)
    NewFragmentExpr = createOrReplaceFragment(Expr: NewFragmentExpr, Frag: *NewFragment,
                                              BitExtractOffset: BitExtractAdjustment);
  // createOrReplaceFragment can fail (returns nullptr); in that case the
  // debug record is simply dropped rather than emitted with a bad expression.
  if (!NewFragmentExpr)
    return;

  // dbg_declare: describe the new alloca's storage directly.
  if (Orig->isDbgDeclare()) {
    DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
        Address: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
    BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
                                                   Here: BeforeInst->getIterator());
    return;
  }

  if (Orig->isDbgValue()) {
    DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
        Location: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
    // Drop debug information if the expression doesn't start with a
    // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
    // describes the address of alloca rather than the value inside the alloca.
    if (!NewFragmentExpr->startsWithDeref())
      DVR->setKillAddress();
    BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
                                                   Here: BeforeInst->getIterator());
    return;
  }

  // Remaining case: dbg_assign.
  // Apply a DIAssignID to the store if it doesn't already have it.
  if (!NewAddr->hasMetadata(KindID: LLVMContext::MD_DIAssignID)) {
    NewAddr->setMetadata(KindID: LLVMContext::MD_DIAssignID,
                         Node: DIAssignID::getDistinct(Context&: NewAddr->getContext()));
  }

  // Link the new assign record to NewAddr; value and variable come from the
  // original record, the fragment lives in the value expression.
  DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
      LinkedInstr: NewAddr, Val: Orig->getValue(), Variable: Orig->getVariable(), Expression: NewFragmentExpr, Address: NewAddr,
      AddressExpression: NewAddrExpr, DI: Orig->getDebugLoc());
  LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
  (void)NewAssign;
}
5525
/// Walks the slices of an alloca and form partitions based on them,
/// rewriting each of their uses.
///
/// Three phases: (1) pre-split loads/stores and mark the remaining
/// mis-aligned mem ops unsplittable, (2) rewrite each partition (possibly
/// into a fresh alloca), (3) migrate debug records from the old alloca onto
/// the new ones. Returns true if anything changed.
bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
  if (AS.begin() == AS.end())
    return false;

  unsigned NumPartitions = 0;
  bool Changed = false;
  const DataLayout &DL = AI.getModule()->getDataLayout();

  // First try to pre-split loads and stores.
  Changed |= presplitLoadsAndStores(AI, AS);

  // Now that we have identified any pre-splitting opportunities,
  // mark loads and stores unsplittable except for the following case.
  // We leave a slice splittable if all other slices are disjoint or fully
  // included in the slice, such as whole-alloca loads and stores.
  // If we fail to split these during pre-splitting, we want to force them
  // to be rewritten into a partition.
  bool IsSorted = true;

  uint64_t AllocaSize = AI.getAllocationSize(DL)->getFixedValue();
  const uint64_t MaxBitVectorSize = 1024;
  if (AllocaSize <= MaxBitVectorSize) {
    // If a byte boundary is included in any load or store, a slice starting or
    // ending at the boundary is not splittable.
    SmallBitVector SplittableOffset(AllocaSize + 1, true);
    for (Slice &S : AS)
      for (unsigned O = S.beginOffset() + 1;
           O < S.endOffset() && O < AllocaSize; O++)
        SplittableOffset.reset(Idx: O);

    for (Slice &S : AS) {
      if (!S.isSplittable())
        continue;

      if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
          (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
        continue;

      if (isa<LoadInst>(Val: S.getUse()->getUser()) ||
          isa<StoreInst>(Val: S.getUse()->getUser())) {
        S.makeUnsplittable();
        IsSorted = false;
      }
    }
  } else {
    // We only allow whole-alloca splittable loads and stores
    // for a large alloca to avoid creating too large BitVector.
    for (Slice &S : AS) {
      if (!S.isSplittable())
        continue;

      if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
        continue;

      if (isa<LoadInst>(Val: S.getUse()->getUser()) ||
          isa<StoreInst>(Val: S.getUse()->getUser())) {
        S.makeUnsplittable();
        IsSorted = false;
      }
    }
  }

  // Slices may no longer be in order after makeUnsplittable; restore the
  // sorted invariant the partition iterator relies on.
  if (!IsSorted)
    llvm::stable_sort(Range&: AS);

  /// Describes the allocas introduced by rewritePartition in order to migrate
  /// the debug info.
  struct Fragment {
    AllocaInst *Alloca;   // The new alloca backing this partition.
    uint64_t Offset;      // Bit offset of the partition within the old alloca.
    uint64_t Size;        // Size in bits, padding excluded.
    Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
        : Alloca(AI), Offset(O), Size(S) {}
  };
  SmallVector<Fragment, 4> Fragments;

  // Rewrite each partition.
  for (auto &P : AS.partitions()) {
    auto [NewAI, ActiveBits] = rewritePartition(AI, AS, P);
    if (NewAI) {
      Changed = true;
      // Only record a Fragment when a genuinely new alloca was created.
      if (NewAI != &AI) {
        uint64_t SizeOfByte = 8;
        // Don't include any padding.
        uint64_t Size = std::min(a: ActiveBits, b: P.size() * SizeOfByte);
        Fragments.push_back(
            Elt: Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
      }
    }
    ++NumPartitions;
  }

  NumAllocaPartitions += NumPartitions;
  MaxPartitionsPerAlloca.updateMax(V: NumPartitions);

  // Migrate debug information from the old alloca to the new alloca(s)
  // and the individual partitions.
  auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
    // Can't overlap with undef memory.
    if (isKillAddress(DVR: DbgVariable))
      return;

    const Value *DbgPtr = DbgVariable->getAddress();
    DIExpression::FragmentInfo VarFrag =
        DbgVariable->getFragmentOrEntireVariable();
    // Get the address expression constant offset if one exists and the ops
    // that come after it.
    int64_t CurrentExprOffsetInBytes = 0;
    SmallVector<uint64_t> PostOffsetOps;
    if (!getAddressExpression(DVR: DbgVariable)
             ->extractLeadingOffset(OffsetInBytes&: CurrentExprOffsetInBytes, RemainingOps&: PostOffsetOps))
      return; // Couldn't interpret this DIExpression - drop the var.

    // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
    int64_t ExtractOffsetInBits = 0;
    for (auto Op : getAddressExpression(DVR: DbgVariable)->expr_ops()) {
      if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
          Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
        ExtractOffsetInBits = Op.getArg(I: 0);
        break;
      }
    }

    // Try to migrate this variable onto every new alloca it overlaps.
    DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
    for (auto Fragment : Fragments) {
      int64_t OffsetFromLocationInBits;
      std::optional<DIExpression::FragmentInfo> NewDbgFragment;
      // Find the variable fragment that the new alloca slice covers.
      // Drop debug info for this variable fragment if we can't compute an
      // intersect between it and the alloca slice.
      if (!DIExpression::calculateFragmentIntersect(
              DL, SliceStart: &AI, SliceOffsetInBits: Fragment.Offset, SliceSizeInBits: Fragment.Size, DbgPtr,
              DbgPtrOffsetInBits: CurrentExprOffsetInBytes * 8, DbgExtractOffsetInBits: ExtractOffsetInBits, VarFrag,
              Result&: NewDbgFragment, OffsetFromLocationInBits))
        continue; // Do not migrate this fragment to this slice.

      // Zero sized fragment indicates there's no intersect between the variable
      // fragment and the alloca slice. Skip this slice for this variable
      // fragment.
      if (NewDbgFragment && !NewDbgFragment->SizeInBits)
        continue; // Do not migrate this fragment to this slice.

      // No fragment indicates DbgVariable's variable or fragment exactly
      // overlaps the slice; copy its fragment (or nullopt if there isn't one).
      if (!NewDbgFragment)
        NewDbgFragment = DbgVariable->getFragment();

      // Reduce the new expression offset by the bit-extract offset since
      // we'll be keeping that.
      int64_t OffestFromNewAllocaInBits =
          OffsetFromLocationInBits - ExtractOffsetInBits;
      // We need to adjust an existing bit extract if the offset expression
      // can't eat the slack (i.e., if the new offset would be negative).
      int64_t BitExtractOffset =
          std::min<int64_t>(a: 0, b: OffestFromNewAllocaInBits);
      // The magnitude of a negative value indicates the number of bits into
      // the existing variable fragment that the memory region begins. The new
      // variable fragment already excludes those bits - the new DbgPtr offset
      // only needs to be applied if it's positive.
      OffestFromNewAllocaInBits =
          std::max(a: int64_t(0), b: OffestFromNewAllocaInBits);

      // Rebuild the expression:
      // {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
      // Add NewDbgFragment later, because dbg.assigns don't want it in the
      // address expression but the value expression instead.
      DIExpression *NewExpr = DIExpression::get(Context&: AI.getContext(), Elements: PostOffsetOps);
      if (OffestFromNewAllocaInBits > 0) {
        int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8;
        NewExpr = DIExpression::prepend(Expr: NewExpr, /*flags=*/Flags: 0, Offset: OffsetInBytes);
      }

      // Remove any existing intrinsics on the new alloca describing
      // the variable fragment.
      auto RemoveOne = [DbgVariable](auto *OldDII) {
        // Match on variable + inlined-at scope, the identity of a debug user
        // variable instance.
        auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
          return LHS->getVariable() == RHS->getVariable() &&
                 LHS->getDebugLoc()->getInlinedAt() ==
                     RHS->getDebugLoc()->getInlinedAt();
        };
        if (SameVariableFragment(OldDII, DbgVariable))
          OldDII->eraseFromParent();
      };
      for_each(Range: findDVRDeclares(V: Fragment.Alloca), F: RemoveOne);
      for_each(Range: findDVRValues(V: Fragment.Alloca), F: RemoveOne);
      insertNewDbgInst(DIB, Orig: DbgVariable, NewAddr: Fragment.Alloca, NewAddrExpr: NewExpr, BeforeInst: &AI,
                       NewFragment: NewDbgFragment, BitExtractAdjustment: BitExtractOffset);
    }
  };

  // Migrate debug information from the old alloca to the new alloca(s)
  // and the individual partitions.
  for_each(Range: findDVRDeclares(V: &AI), F: MigrateOne);
  for_each(Range: findDVRValues(V: &AI), F: MigrateOne);
  for_each(Range: at::getDVRAssignmentMarkers(Inst: &AI), F: MigrateOne);

  return Changed;
}
5726
5727/// Clobber a use with poison, deleting the used value if it becomes dead.
5728void SROA::clobberUse(Use &U) {
5729 Value *OldV = U;
5730 // Replace the use with an poison value.
5731 U = PoisonValue::get(T: OldV->getType());
5732
5733 // Check for this making an instruction dead. We have to garbage collect
5734 // all the dead instructions to ensure the uses of any alloca end up being
5735 // minimal.
5736 if (Instruction *OldI = dyn_cast<Instruction>(Val: OldV))
5737 if (isInstructionTriviallyDead(I: OldI)) {
5738 DeadInsts.push_back(Elt: OldI);
5739 }
5740}
5741
/// A basic LoadAndStorePromoter that does not remove store nodes.
class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
public:
  BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
                            Type *ZeroType)
      : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
  // Keep stores and the alloca itself in place; everything else (the loads)
  // may be deleted once values have been propagated.
  bool shouldDelete(Instruction *I) const override {
    return !isa<StoreInst>(Val: I) && !isa<AllocaInst>(Val: I);
  }

  // Stand-in value when the promoter needs the alloca's initial contents:
  // an undef of the common promoted type.
  Value *getValueToUseForAlloca(Instruction *I) const override {
    return UndefValue::get(T: ZeroType);
  }

private:
  // The single type shared by all loads/stores being promoted (see
  // propagateStoredValuesToLoads, which requires all types to match).
  Type *ZeroType;
};
5759
/// Forward stored values to loads for an alloca whose address escapes
/// read-only, by running SSA-based promotion over each group of exactly
/// matching slices.
///
/// Slices are scanned in beginOffset order and accumulated into groups that
/// share identical [begin, end) ranges and do not overlap with any earlier
/// group; each complete group with a single common simple-load/store type is
/// handed to BasicLoadAndStorePromoter (stores are kept, loads rewritten).
bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
  // Look through each "partition", looking for slices with the same start/end
  // that do not overlap with any before them. The slices are sorted by
  // increasing beginOffset. We don't use AS.partitions(), as it will use a more
  // sophisticated algorithm that takes splittable slices into account.
  LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
  bool AllSameAndValid = true;
  Type *PartitionType = nullptr;
  SmallVector<Instruction *> Insts;
  uint64_t BeginOffset = 0;
  uint64_t EndOffset = 0;

  // Promote the currently accumulated group (if eligible), then reset the
  // accumulator state for the next group.
  auto Flush = [&]() {
    if (AllSameAndValid && !Insts.empty()) {
      LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
                        << EndOffset << ")\n");
      SmallVector<PHINode *, 4> NewPHIs;
      SSAUpdater SSA(&NewPHIs);
      // The alloca is included so the promoter can treat it as the root.
      Insts.push_back(Elt: &AI);
      BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
      Promoter.run(Insts);
    }
    AllSameAndValid = true;
    PartitionType = nullptr;
    Insts.clear();
  };

  for (Slice &S : AS) {
    auto *User = cast<Instruction>(Val: S.getUse()->getUser());
    // Assume-like intrinsics don't affect the stored values; skip them.
    if (isAssumeLikeIntrinsic(I: User)) {
      LLVM_DEBUG({
        dbgs() << "Ignoring slice: ";
        AS.print(dbgs(), &S);
      });
      continue;
    }
    if (S.beginOffset() >= EndOffset) {
      // Disjoint from the current group: flush it and start a new one.
      Flush();
      BeginOffset = S.beginOffset();
      EndOffset = S.endOffset();
    } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
      // Overlaps the current group but isn't an exact range match - the
      // whole group becomes ineligible, and the group range is widened so
      // later slices compare against the full extent.
      if (AllSameAndValid) {
        LLVM_DEBUG({
          dbgs() << "Slice does not match range [" << BeginOffset << ", "
                 << EndOffset << ")";
          AS.print(dbgs(), &S);
        });
        AllSameAndValid = false;
      }
      EndOffset = std::max(a: EndOffset, b: S.endOffset());
      continue;
    }

    if (auto *LI = dyn_cast<LoadInst>(Val: User)) {
      Type *UserTy = LI->getType();
      // LoadAndStorePromoter requires all the types to be the same.
      if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
        AllSameAndValid = false;
      PartitionType = UserTy;
      Insts.push_back(Elt: User);
    } else if (auto *SI = dyn_cast<StoreInst>(Val: User)) {
      Type *UserTy = SI->getValueOperand()->getType();
      if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
        AllSameAndValid = false;
      PartitionType = UserTy;
      Insts.push_back(Elt: User);
    } else {
      // Any other user disqualifies the group.
      AllSameAndValid = false;
    }
  }

  // Flush the final group.
  Flush();
  return true;
}
5834
//// Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
///
/// Returns {Changed, CFGChanged}; CFGChanged can only become true via select
/// rewriting at the end.
std::pair<bool /*Changed*/, bool /*CFGChanged*/>
SROA::runOnAlloca(AllocaInst &AI) {
  bool Changed = false;
  bool CFGChanged = false;

  LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
  ++NumAllocasAnalyzed;

  // Special case dead allocas, as they're trivial.
  if (AI.use_empty()) {
    AI.eraseFromParent();
    Changed = true;
    return {Changed, CFGChanged};
  }
  const DataLayout &DL = AI.getDataLayout();

  // Skip alloca forms that this analysis can't handle: runtime-sized array
  // allocations, and unknown / scalable / zero-sized allocations.
  std::optional<TypeSize> Size = AI.getAllocationSize(DL);
  if (AI.isArrayAllocation() || !Size || Size->isScalable() || Size->isZero())
    return {Changed, CFGChanged};

  // First, split any FCA loads and stores touching this alloca to promote
  // better splitting and promotion opportunities.
  IRBuilderTy IRB(&AI);
  AggLoadStoreRewriter AggRewriter(DL, IRB);
  Changed |= AggRewriter.rewrite(I&: AI);

  // Build the slices using a recursive instruction-visiting builder.
  AllocaSlices AS(DL, AI);
  LLVM_DEBUG(AS.print(dbgs()));
  if (AS.isEscaped())
    return {Changed, CFGChanged};

  // A read-only escape still permits forwarding stored values to loads.
  if (AS.isEscapedReadOnly()) {
    Changed |= propagateStoredValuesToLoads(AI, AS);
    return {Changed, CFGChanged};
  }

  for (auto &P : AS.partitions()) {
    // For now, we can't split if a field is accessed both via protected field
    // and not, because that would mean that we would need to introduce sign and
    // auth operations to convert between the protected and non-protected uses,
    // and this pass doesn't know how to do that. Also, this case is unlikely to
    // occur in normal code.
    std::optional<Value *> ProtectedFieldDisc;
    auto SliceHasMismatch = [&](Slice &S) {
      // Lifetime markers don't access the memory, so they never mismatch.
      if (auto *II = dyn_cast<IntrinsicInst>(Val: S.getUse()->getUser()))
        if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
            II->getIntrinsicID() == Intrinsic::lifetime_end)
          return false;
      // The first relevant slice fixes the discriminator; the rest must agree.
      if (!ProtectedFieldDisc)
        ProtectedFieldDisc = S.ProtectedFieldDisc;
      return *ProtectedFieldDisc != S.ProtectedFieldDisc;
    };
    for (Slice &S : P)
      if (SliceHasMismatch(S))
        return {Changed, CFGChanged};
    for (Slice *S : P.splitSliceTails())
      if (SliceHasMismatch(*S))
        return {Changed, CFGChanged};
  }

  // Delete all the dead users of this alloca before splitting and rewriting it.
  for (Instruction *DeadUser : AS.getDeadUsers()) {
    // Free up everything used by this instruction.
    for (Use &DeadOp : DeadUser->operands())
      clobberUse(U&: DeadOp);

    // Now replace the uses of this instruction.
    DeadUser->replaceAllUsesWith(V: PoisonValue::get(T: DeadUser->getType()));

    // And mark it for deletion.
    DeadInsts.push_back(Elt: DeadUser);
    Changed = true;
  }
  for (Use *DeadOp : AS.getDeadOperands()) {
    clobberUse(U&: *DeadOp);
    Changed = true;
  }
  // Strip protected-field intrinsic wrappers: forward their pointer operand
  // and queue the call for deletion.
  for (IntrinsicInst *PFPUser : AS.getPFPUsers()) {
    PFPUser->replaceAllUsesWith(V: PFPUser->getArgOperand(i: 0));

    DeadInsts.push_back(Elt: PFPUser);
    Changed = true;
  }

  // No slices to split. Leave the dead alloca for a later pass to clean up.
  if (AS.begin() == AS.end())
    return {Changed, CFGChanged};

  Changed |= splitAlloca(AI, AS);

  LLVM_DEBUG(dbgs() << "  Speculating PHIs\n");
  while (!SpeculatablePHIs.empty())
    speculatePHINodeLoads(IRB, PN&: *SpeculatablePHIs.pop_back_val());

  LLVM_DEBUG(dbgs() << "  Rewriting Selects\n");
  // Take the map's contents so rewrites queued during this run don't leak
  // into the next alloca's processing.
  auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
  while (!RemainingSelectsToRewrite.empty()) {
    const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
    CFGChanged |=
        rewriteSelectInstMemOps(SI&: *K, Ops: V, IRB, DTU: PreserveCFG ? nullptr : DTU);
  }

  return {Changed, CFGChanged};
}
5946
/// Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
/// minimize the problems of invalidated instruction pointers as such pointers
/// are used heavily in the intermediate stages of the algorithm.
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
bool SROA::deleteDeadInstructions(
    SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
  bool Changed = false;
  while (!DeadInsts.empty()) {
    // Entries may have been RAUW'd away; dyn_cast_or_null filters those out.
    Instruction *I = dyn_cast_or_null<Instruction>(Val: DeadInsts.pop_back_val());
    if (!I)
      continue;
    LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");

    // If the instruction is an alloca, find the possible dbg.declare connected
    // to it, and remove it too. We must do this before calling RAUW or we will
    // not be able to find it.
    if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: I)) {
      DeletedAllocas.insert(Ptr: AI);
      for (DbgVariableRecord *OldDII : findDVRDeclares(V: AI))
        OldDII->eraseFromParent();
    }

    at::deleteAssignmentMarkers(Inst: I);
    I->replaceAllUsesWith(V: UndefValue::get(T: I->getType()));

    for (Use &Operand : I->operands())
      if (Instruction *U = dyn_cast<Instruction>(Val&: Operand)) {
        // Zero out the operand and see if it becomes trivially dead.
        Operand = nullptr;
        if (isInstructionTriviallyDead(I: U))
          DeadInsts.push_back(Elt: U);
      }

    ++NumDeleted;
    I->eraseFromParent();
    Changed = true;
  }
  return Changed;
}
5991/// Promote the allocas, using the best available technique.
5992///
5993/// This attempts to promote whatever allocas have been identified as viable in
5994/// the PromotableAllocas list. If that list is empty, there is nothing to do.
5995/// This function returns whether any promotion occurred.
5996bool SROA::promoteAllocas() {
5997 if (PromotableAllocas.empty())
5998 return false;
5999
6000 if (SROASkipMem2Reg) {
6001 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
6002 } else {
6003 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
6004 NumPromoted += PromotableAllocas.size();
6005 PromoteMemToReg(Allocas: PromotableAllocas.getArrayRef(), DT&: DTU->getDomTree(), AC);
6006 }
6007
6008 PromotableAllocas.clear();
6009 return true;
6010}
6011
/// Top-level driver: run SROA over \p F to a fixed point.
///
/// Seeds the worklist from entry-block allocas, then alternates between
/// rewriting allocas and promoting the resulting candidates with mem2reg
/// until no new work is produced. Returns {Changed, CFGChanged}.
std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
  LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");

  const DataLayout &DL = F.getDataLayout();
  BasicBlock &EntryBB = F.getEntryBlock();
  // Scan the entry block, stopping before its final instruction (which can
  // never be an alloca of interest).
  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(x: EntryBB.end());
       I != E; ++I) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(Val&: I)) {
      std::optional<TypeSize> Size = AI->getAllocationSize(DL);
      // Scalable-sized allocas can't be sliced (runOnAlloca bails on
      // Size->isScalable()), so promotable ones go straight to mem2reg;
      // everything else is queued for full analysis.
      if (Size && Size->isScalable() && isAllocaPromotable(AI))
        PromotableAllocas.insert(X: AI);
      else
        Worklist.insert(X: AI);
    }
  }

  bool Changed = false;
  bool CFGChanged = false;
  // A set of deleted alloca instruction pointers which should be removed from
  // the list of promotable allocas.
  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;

  do {
    while (!Worklist.empty()) {
      auto [IterationChanged, IterationCFGChanged] =
          runOnAlloca(AI&: *Worklist.pop_back_val());
      Changed |= IterationChanged;
      CFGChanged |= IterationCFGChanged;

      Changed |= deleteDeadInstructions(DeletedAllocas);

      // Remove the deleted allocas from various lists so that we don't try to
      // continue processing them.
      if (!DeletedAllocas.empty()) {
        Worklist.set_subtract(DeletedAllocas);
        PostPromotionWorklist.set_subtract(DeletedAllocas);
        PromotableAllocas.set_subtract(DeletedAllocas);
        DeletedAllocas.clear();
      }
    }

    Changed |= promoteAllocas();

    // Promotion may expose new work; iterate until a fixed point.
    Worklist = PostPromotionWorklist;
    PostPromotionWorklist.clear();
  } while (!Worklist.empty());

  assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
  assert((!CFGChanged || !PreserveCFG) &&
         "Should not have modified the CFG when told to preserve it.");

  // Assignment tracking can leave redundant debug records behind; clean them
  // up once per function if anything changed.
  if (Changed && isAssignmentTrackingEnabled(M: *F.getParent())) {
    for (auto &BB : F) {
      RemoveRedundantDbgInstrs(BB: &BB);
    }
  }

  return {Changed, CFGChanged};
}
6071
6072PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
6073 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
6074 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
6075 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6076 auto [Changed, CFGChanged] =
6077 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
6078 if (!Changed)
6079 return PreservedAnalyses::all();
6080 PreservedAnalyses PA;
6081 if (!CFGChanged)
6082 PA.preserveSet<CFGAnalyses>();
6083 PA.preserve<DominatorTreeAnalysis>();
6084 return PA;
6085}
6086
6087void SROAPass::printPipeline(
6088 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6089 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
6090 OS, MapClassName2PassName);
6091 OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
6092 : "<modify-cfg>");
6093}
6094
6095SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
6096
namespace {

/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
class SROALegacyPass : public FunctionPass {
  // Whether the wrapped SROA run is allowed to modify the CFG.
  SROAOptions PreserveCFG;

public:
  static char ID;

  SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
      : FunctionPass(ID), PreserveCFG(PreserveCFG) {
    initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    AssumptionCache &AC =
        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
    // CFGChanged is irrelevant for the legacy interface; only report Changed.
    auto [Changed, _] =
        SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

  StringRef getPassName() const override { return "SROA"; }
};

} // end anonymous namespace
6135
6136char SROALegacyPass::ID = 0;
6137
6138FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
6139 return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
6140 : SROAOptions::ModifyCFG);
6141}
6142
// Register the legacy pass and its analysis dependencies with the legacy
// pass registry under the command-line name "sroa".
INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
                      "Scalar Replacement Of Aggregates", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
                    false, false)
6149