1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
19/// Because this also performs alloca promotion, it can be thought of as also
20/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
22///
23//===----------------------------------------------------------------------===//
24
25#include "llvm/Transforms/Scalar/SROA.h"
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
30#include "llvm/ADT/PointerIntPair.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
33#include "llvm/ADT/SmallBitVector.h"
34#include "llvm/ADT/SmallPtrSet.h"
35#include "llvm/ADT/SmallVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
40#include "llvm/ADT/iterator_range.h"
41#include "llvm/Analysis/AssumptionCache.h"
42#include "llvm/Analysis/DomTreeUpdater.h"
43#include "llvm/Analysis/GlobalsModRef.h"
44#include "llvm/Analysis/Loads.h"
45#include "llvm/Analysis/PtrUseVisitor.h"
46#include "llvm/Analysis/ValueTracking.h"
47#include "llvm/Config/llvm-config.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
50#include "llvm/IR/ConstantFolder.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
55#include "llvm/IR/DebugInfoMetadata.h"
56#include "llvm/IR/DerivedTypes.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Instructions.h"
64#include "llvm/IR/IntrinsicInst.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/LLVMContext.h"
67#include "llvm/IR/Metadata.h"
68#include "llvm/IR/Module.h"
69#include "llvm/IR/Operator.h"
70#include "llvm/IR/PassManager.h"
71#include "llvm/IR/Type.h"
72#include "llvm/IR/Use.h"
73#include "llvm/IR/User.h"
74#include "llvm/IR/Value.h"
75#include "llvm/IR/ValueHandle.h"
76#include "llvm/InitializePasses.h"
77#include "llvm/Pass.h"
78#include "llvm/Support/Casting.h"
79#include "llvm/Support/CommandLine.h"
80#include "llvm/Support/Compiler.h"
81#include "llvm/Support/Debug.h"
82#include "llvm/Support/ErrorHandling.h"
83#include "llvm/Support/raw_ostream.h"
84#include "llvm/Transforms/Scalar.h"
85#include "llvm/Transforms/Utils/BasicBlockUtils.h"
86#include "llvm/Transforms/Utils/Local.h"
87#include "llvm/Transforms/Utils/PromoteMemToReg.h"
88#include "llvm/Transforms/Utils/SSAUpdater.h"
89#include <algorithm>
90#include <cassert>
91#include <cstddef>
92#include <cstdint>
93#include <cstring>
94#include <iterator>
95#include <queue>
96#include <string>
97#include <tuple>
98#include <utility>
99#include <variant>
100#include <vector>
101
102using namespace llvm;
103
104#define DEBUG_TYPE "sroa"
105
106STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
107STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
108STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
109STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
110STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
111STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
112STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
113STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
114STATISTIC(NumLoadsPredicated,
115 "Number of loads rewritten into predicated loads to allow promotion");
116STATISTIC(
117 NumStoresPredicated,
    "Number of stores rewritten into predicated stores to allow promotion");
119STATISTIC(NumDeleted, "Number of instructions deleted");
120STATISTIC(NumVectorized, "Number of vectorized aggregates");
121
122namespace llvm {
123/// Disable running mem2reg during SROA in order to test or debug SROA.
124static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(Val: false),
125 cl::Hidden);
126extern cl::opt<bool> ProfcheckDisableMetadataFixes;
127} // namespace llvm
128
129namespace {
130
131class AllocaSliceRewriter;
132class AllocaSlices;
133class Partition;
134
135class SelectHandSpeculativity {
136 unsigned char Storage = 0; // None are speculatable by default.
137 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
138 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
139public:
140 SelectHandSpeculativity() = default;
141 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
142 bool isSpeculatable(bool isTrueVal) const;
143 bool areAllSpeculatable() const;
144 bool areAnySpeculatable() const;
145 bool areNoneSpeculatable() const;
146 // For interop as int half of PointerIntPair.
147 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
148 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
149};
150static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
151
152using PossiblySpeculatableLoad =
153 PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
154using UnspeculatableStore = StoreInst *;
155using RewriteableMemOp =
156 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
157using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
158
159/// An optimization pass providing Scalar Replacement of Aggregates.
160///
161/// This pass takes allocations which can be completely analyzed (that is, they
162/// don't escape) and tries to turn them into scalar SSA values. There are
163/// a few steps to this process.
164///
165/// 1) It takes allocations of aggregates and analyzes the ways in which they
166/// are used to try to split them into smaller allocations, ideally of
167/// a single scalar data type. It will split up memcpy and memset accesses
168/// as necessary and try to isolate individual scalar accesses.
169/// 2) It will transform accesses into forms which are suitable for SSA value
170/// promotion. This can be replacing a memset with a scalar store of an
171/// integer value, or it can involve speculating operations on a PHI or
172/// select to be a PHI or select of the results.
173/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
174/// onto insert and extract operations on a vector value, and convert them to
175/// this form. By doing so, it will enable promotion of vector aggregates to
176/// SSA vector values.
177class SROA {
178 LLVMContext *const C;
179 DomTreeUpdater *const DTU;
180 AssumptionCache *const AC;
181 const bool PreserveCFG;
182
183 /// Worklist of alloca instructions to simplify.
184 ///
185 /// Each alloca in the function is added to this. Each new alloca formed gets
186 /// added to it as well to recursively simplify unless that alloca can be
  /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if not
189 /// already present to ensure it is re-visited.
190 SmallSetVector<AllocaInst *, 16> Worklist;
191
192 /// A collection of instructions to delete.
193 /// We try to batch deletions to simplify code and make things a bit more
  /// efficient. We also make sure there are no dangling pointers.
195 SmallVector<WeakVH, 8> DeadInsts;
196
197 /// Post-promotion worklist.
198 ///
199 /// Sometimes we discover an alloca which has a high probability of becoming
200 /// viable for SROA after a round of promotion takes place. In those cases,
201 /// the alloca is enqueued here for re-processing.
202 ///
203 /// Note that we have to be very careful to clear allocas out of this list in
204 /// the event they are deleted.
205 SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;
206
207 /// A collection of alloca instructions we can directly promote.
208 SetVector<AllocaInst *, SmallVector<AllocaInst *>,
209 SmallPtrSet<AllocaInst *, 16>, 16>
210 PromotableAllocas;
211
212 /// A worklist of PHIs to speculate prior to promoting allocas.
213 ///
214 /// All of these PHIs have been checked for the safety of speculation and by
215 /// being speculated will allow promoting allocas currently in the promotable
216 /// queue.
217 SmallSetVector<PHINode *, 8> SpeculatablePHIs;
218
219 /// A worklist of select instructions to rewrite prior to promoting
220 /// allocas.
221 SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
222
223 /// Select instructions that use an alloca and are subsequently loaded can be
  /// rewritten to load both input pointers and then select between the results,
225 /// allowing the load of the alloca to be promoted.
226 /// From this:
227 /// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
228 /// %V = load <type>, ptr %P2
229 /// to:
230 /// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
231 /// %V2 = load <type>, ptr %Other
232 /// %V = select i1 %cond, <type> %V1, <type> %V2
233 ///
234 /// We can do this to a select if its only uses are loads
235 /// and if either the operand to the select can be loaded unconditionally,
236 /// or if we are allowed to perform CFG modifications.
  /// If an intervening bitcast with a single use of the load is found,
  /// the promotion is still allowed.
239 static std::optional<RewriteableMemOps>
240 isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
241
242public:
243 SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
244 SROAOptions PreserveCFG_)
245 : C(C), DTU(DTU), AC(AC),
246 PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
247
248 /// Main run method used by both the SROAPass and by the legacy pass.
249 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);
250
251private:
252 friend class AllocaSliceRewriter;
253
254 bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
255 std::pair<AllocaInst *, uint64_t>
256 rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
257 bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
258 bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
259 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
260 void clobberUse(Use &U);
261 bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
262 bool promoteAllocas();
263};
264
265} // end anonymous namespace
266
267/// Calculate the fragment of a variable to use when slicing a store
268/// based on the slice dimensions, existing fragment, and base storage
269/// fragment.
270/// Results:
271/// UseFrag - Use Target as the new fragment.
272/// UseNoFrag - The new slice already covers the whole variable.
273/// Skip - The new alloca slice doesn't include this variable.
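/// For example (illustrative numbers only): for a 64-bit variable with no
/// pre-existing fragment and no base storage fragment, a new 32-bit slice at
/// bit offset 0 yields Target = {OffsetInBits: 0, SizeInBits: 32} and
/// UseFrag, whereas a new 64-bit slice at offset 0 covers the whole variable
/// and yields UseNoFrag.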
274/// FIXME: Can we use calculateFragmentIntersect instead?
275namespace {
276enum FragCalcResult { UseFrag, UseNoFrag, Skip };
277}
278static FragCalcResult
279calculateFragment(DILocalVariable *Variable,
280 uint64_t NewStorageSliceOffsetInBits,
281 uint64_t NewStorageSliceSizeInBits,
282 std::optional<DIExpression::FragmentInfo> StorageFragment,
283 std::optional<DIExpression::FragmentInfo> CurrentFragment,
284 DIExpression::FragmentInfo &Target) {
  // If the base storage describes part of the variable, apply the offset and
  // the size constraint.
287 if (StorageFragment) {
288 Target.SizeInBits =
289 std::min(a: NewStorageSliceSizeInBits, b: StorageFragment->SizeInBits);
290 Target.OffsetInBits =
291 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
292 } else {
293 Target.SizeInBits = NewStorageSliceSizeInBits;
294 Target.OffsetInBits = NewStorageSliceOffsetInBits;
295 }
296
297 // If this slice extracts the entirety of an independent variable from a
298 // larger alloca, do not produce a fragment expression, as the variable is
299 // not fragmented.
300 if (!CurrentFragment) {
301 if (auto Size = Variable->getSizeInBits()) {
302 // Treat the current fragment as covering the whole variable.
303 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
304 if (Target == CurrentFragment)
305 return UseNoFrag;
306 }
307 }
308
309 // No additional work to do if there isn't a fragment already, or there is
310 // but it already exactly describes the new assignment.
311 if (!CurrentFragment || *CurrentFragment == Target)
312 return UseFrag;
313
314 // Reject the target fragment if it doesn't fit wholly within the current
315 // fragment. TODO: We could instead chop up the target to fit in the case of
316 // a partial overlap.
317 if (Target.startInBits() < CurrentFragment->startInBits() ||
318 Target.endInBits() > CurrentFragment->endInBits())
319 return Skip;
320
321 // Target fits within the current fragment, return it.
322 return UseFrag;
323}
324
325static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
326 return DebugVariable(DVR->getVariable(), std::nullopt,
327 DVR->getDebugLoc().getInlinedAt());
328}
329
330/// Find linked dbg.assign and generate a new one with the correct
331/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
332/// value component is copied from the old dbg.assign to the new.
333/// \param OldAlloca Alloca for the variable before splitting.
334/// \param IsSplit True if the store (not necessarily alloca)
335/// is being split.
336/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
337/// \param SliceSizeInBits New number of bits being written to.
338/// \param OldInst Instruction that is being split.
339/// \param Inst New instruction performing this part of the
340/// split store.
341/// \param Dest Store destination.
342/// \param Value Stored value.
343/// \param DL Datalayout.
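///
/// As a worked example with made-up numbers: splitting a 64-bit store to an
/// unfragmented 64-bit variable into two 32-bit stores at bit offsets 0 and
/// 32 produces two new dbg_assign records, carrying fragments
/// (offset 0, size 32) and (offset 32, size 32) respectively.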
344static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
345 uint64_t OldAllocaOffsetInBits,
346 uint64_t SliceSizeInBits, Instruction *OldInst,
347 Instruction *Inst, Value *Dest, Value *Value,
348 const DataLayout &DL) {
349 // If we want allocas to be migrated using this helper then we need to ensure
350 // that the BaseFragments map code still works. A simple solution would be
351 // to choose to always clone alloca dbg_assigns (rather than sometimes
352 // "stealing" them).
353 assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");
354
355 auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(Inst: OldInst);
356 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
357 if (DVRAssignMarkerRange.empty())
358 return;
359
360 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
361 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
362 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
363 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
364 << "\n");
365 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
366 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
367 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
368 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
369 if (Value)
370 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
371
372 /// Map of aggregate variables to their fragment associated with OldAlloca.
373 DenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
374 BaseFragments;
375 for (auto *DVR : at::getDVRAssignmentMarkers(Inst: OldAlloca))
376 BaseFragments[getAggregateVariable(DVR)] =
377 DVR->getExpression()->getFragmentInfo();
378
379 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
380 // one). It shouldn't already have one: assert this assumption.
381 assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
382 DIAssignID *NewID = nullptr;
383 auto &Ctx = Inst->getContext();
384 DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
385 assert(OldAlloca->isStaticAlloca());
386
387 auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
388 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
389 << "\n");
390 auto *Expr = DbgAssign->getExpression();
391 bool SetKillLocation = false;
392
393 if (IsSplit) {
394 std::optional<DIExpression::FragmentInfo> BaseFragment;
395 {
396 auto R = BaseFragments.find(Val: getAggregateVariable(DVR: DbgAssign));
397 if (R == BaseFragments.end())
398 return;
399 BaseFragment = R->second;
400 }
401 std::optional<DIExpression::FragmentInfo> CurrentFragment =
402 Expr->getFragmentInfo();
403 DIExpression::FragmentInfo NewFragment;
404 FragCalcResult Result = calculateFragment(
405 Variable: DbgAssign->getVariable(), NewStorageSliceOffsetInBits: OldAllocaOffsetInBits, NewStorageSliceSizeInBits: SliceSizeInBits,
406 StorageFragment: BaseFragment, CurrentFragment, Target&: NewFragment);
407
408 if (Result == Skip)
409 return;
410 if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
411 if (CurrentFragment) {
412 // Rewrite NewFragment to be relative to the existing one (this is
413 // what createFragmentExpression wants). CalculateFragment has
414 // already resolved the size for us. FIXME: Should it return the
415 // relative fragment too?
416 NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
417 }
418 // Add the new fragment info to the existing expression if possible.
419 if (auto E = DIExpression::createFragmentExpression(
420 Expr, OffsetInBits: NewFragment.OffsetInBits, SizeInBits: NewFragment.SizeInBits)) {
421 Expr = *E;
422 } else {
423 // Otherwise, add the new fragment info to an empty expression and
424 // discard the value component of this dbg.assign as the value cannot
425 // be computed with the new fragment.
426 Expr = *DIExpression::createFragmentExpression(
427 Expr: DIExpression::get(Context&: Expr->getContext(), Elements: {}),
428 OffsetInBits: NewFragment.OffsetInBits, SizeInBits: NewFragment.SizeInBits);
429 SetKillLocation = true;
430 }
431 }
432 }
433
434 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
435 if (!NewID) {
436 NewID = DIAssignID::getDistinct(Context&: Ctx);
437 Inst->setMetadata(KindID: LLVMContext::MD_DIAssignID, Node: NewID);
438 }
439
440 DbgVariableRecord *NewAssign;
441 if (IsSplit) {
442 ::Value *NewValue = Value ? Value : DbgAssign->getValue();
443 NewAssign = cast<DbgVariableRecord>(Val: cast<DbgRecord *>(
444 Val: DIB.insertDbgAssign(LinkedInstr: Inst, Val: NewValue, SrcVar: DbgAssign->getVariable(), ValExpr: Expr,
445 Addr: Dest, AddrExpr: DIExpression::get(Context&: Expr->getContext(), Elements: {}),
446 DL: DbgAssign->getDebugLoc())));
447 } else {
      // The store is not split; simply steal the existing dbg_assign.
449 NewAssign = DbgAssign;
450 NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
451 NewAssign->setAddress(Dest);
452 if (Value)
453 NewAssign->replaceVariableLocationOp(OpIdx: 0u, NewValue: Value);
454 assert(Expr == NewAssign->getExpression());
455 }
456
457 // If we've updated the value but the original dbg.assign has an arglist
458 // then kill it now - we can't use the requested new value.
459 // We can't replace the DIArgList with the new value as it'd leave
460 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
461 // an arglist). And we can't keep the DIArgList in case the linked store
462 // is being split - in which case the DIArgList + expression may no longer
463 // be computing the correct value.
464 // This should be a very rare situation as it requires the value being
465 // stored to differ from the dbg.assign (i.e., the value has been
466 // represented differently in the debug intrinsic for some reason).
467 SetKillLocation |=
468 Value && (DbgAssign->hasArgList() ||
469 !DbgAssign->getExpression()->isSingleLocationExpression());
470 if (SetKillLocation)
471 NewAssign->setKillLocation();
472
473 // We could use more precision here at the cost of some additional (code)
474 // complexity - if the original dbg.assign was adjacent to its store, we
475 // could position this new dbg.assign adjacent to its store rather than the
    // old dbg.assign. That would result in interleaved dbg.assigns rather than
477 // what we get now:
478 // split store !1
479 // split store !2
480 // dbg.assign !1
481 // dbg.assign !2
    // This (current behaviour) results in debug assignments being
483 // noted as slightly offset (in code) from the store. In practice this
484 // should have little effect on the debugging experience due to the fact
485 // that all the split stores should get the same line number.
486 if (NewAssign != DbgAssign) {
487 NewAssign->moveBefore(MoveBefore: DbgAssign->getIterator());
488 NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
489 }
490 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
491 };
492
493 for_each(Range&: DVRAssignMarkerRange, F: MigrateDbgAssign);
494}
495
496namespace {
497
498/// A custom IRBuilder inserter which prefixes all names, but only in
499/// Assert builds.
500class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
501 std::string Prefix;
502
503 Twine getNameWithPrefix(const Twine &Name) const {
504 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
505 }
506
507public:
508 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
509
510 void InsertHelper(Instruction *I, const Twine &Name,
511 BasicBlock::iterator InsertPt) const override {
512 IRBuilderDefaultInserter::InsertHelper(I, Name: getNameWithPrefix(Name),
513 InsertPt);
514 }
515};
516
517/// Provide a type for IRBuilder that drops names in release builds.
518using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
519
520/// A used slice of an alloca.
521///
522/// This structure represents a slice of an alloca used by some instruction. It
523/// stores both the begin and end offsets of this use, a pointer to the use
524/// itself, and a flag indicating whether we can classify the use as splittable
525/// or not when forming partitions of the alloca.
526class Slice {
527 /// The beginning offset of the range.
528 uint64_t BeginOffset = 0;
529
530 /// The ending offset, not included in the range.
531 uint64_t EndOffset = 0;
532
533 /// Storage for both the use of this slice and whether it can be
534 /// split.
535 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
536
537public:
538 Slice() = default;
539
540 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable,
541 Value *ProtectedFieldDisc)
542 : BeginOffset(BeginOffset), EndOffset(EndOffset),
543 UseAndIsSplittable(U, IsSplittable),
544 ProtectedFieldDisc(ProtectedFieldDisc) {}
545
546 uint64_t beginOffset() const { return BeginOffset; }
547 uint64_t endOffset() const { return EndOffset; }
548
549 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
550 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
551
552 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
553
554 bool isDead() const { return getUse() == nullptr; }
555 void kill() { UseAndIsSplittable.setPointer(nullptr); }
556
557 // When this access is via an llvm.protected.field.ptr intrinsic, contains
558 // the second argument to the intrinsic, the discriminator.
559 Value *ProtectedFieldDisc;
560
561 /// Support for ordering ranges.
562 ///
563 /// This provides an ordering over ranges such that start offsets are
564 /// always increasing, and within equal start offsets, the end offsets are
565 /// decreasing. Thus the spanning range comes first in a cluster with the
566 /// same start position.
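  ///
  /// For example (illustrative offsets): three splittable slices covering
  /// [0,8), [0,4) and [4,8) sort as [0,8), [0,4), [4,8); the slice spanning
  /// the whole cluster at offset 0 comes first.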
567 bool operator<(const Slice &RHS) const {
568 if (beginOffset() < RHS.beginOffset())
569 return true;
570 if (beginOffset() > RHS.beginOffset())
571 return false;
572 if (isSplittable() != RHS.isSplittable())
573 return !isSplittable();
574 if (endOffset() > RHS.endOffset())
575 return true;
576 return false;
577 }
578
579 /// Support comparison with a single offset to allow binary searches.
580 [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
581 return LHS.beginOffset() < RHSOffset;
582 }
583 [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
584 return LHSOffset < RHS.beginOffset();
585 }
586
587 bool operator==(const Slice &RHS) const {
588 return isSplittable() == RHS.isSplittable() &&
589 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
590 }
591 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
592};
593
594/// Representation of the alloca slices.
595///
596/// This class represents the slices of an alloca which are formed by its
597/// various uses. If a pointer escapes, we can't fully build a representation
598/// for the slices used and we reflect that in this structure. The uses are
599/// stored, sorted by increasing beginning offset and with unsplittable slices
600/// starting at a particular offset before splittable slices.
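///
/// As an illustrative example (invented for this comment), for
///   %a = alloca { i32, i32 }
/// with an i32 store to the first field and a memcpy of 8 constant bytes over
/// the whole alloca, two slices are formed: [0,4) for the store and [0,8) for
/// the memcpy, with the spanning [0,8) slice sorting first.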
601class AllocaSlices {
602public:
603 /// Construct the slices of a particular alloca.
604 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
605
606 /// Test whether a pointer to the allocation escapes our analysis.
607 ///
608 /// If this is true, the slices are never fully built and should be
609 /// ignored.
610 bool isEscaped() const { return PointerEscapingInstr; }
611 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
612
613 /// Support for iterating over the slices.
614 /// @{
615 using iterator = SmallVectorImpl<Slice>::iterator;
616 using range = iterator_range<iterator>;
617
618 iterator begin() { return Slices.begin(); }
619 iterator end() { return Slices.end(); }
620
621 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
622 using const_range = iterator_range<const_iterator>;
623
624 const_iterator begin() const { return Slices.begin(); }
625 const_iterator end() const { return Slices.end(); }
626 /// @}
627
628 /// Erase a range of slices.
629 void erase(iterator Start, iterator Stop) { Slices.erase(CS: Start, CE: Stop); }
630
631 /// Insert new slices for this alloca.
632 ///
633 /// This moves the slices into the alloca's slices collection, and re-sorts
634 /// everything so that the usual ordering properties of the alloca's slices
635 /// hold.
636 void insert(ArrayRef<Slice> NewSlices) {
637 int OldSize = Slices.size();
638 Slices.append(in_start: NewSlices.begin(), in_end: NewSlices.end());
639 auto SliceI = Slices.begin() + OldSize;
640 std::stable_sort(first: SliceI, last: Slices.end());
641 std::inplace_merge(first: Slices.begin(), middle: SliceI, last: Slices.end());
642 }
643
644 // Forward declare the iterator and range accessor for walking the
645 // partitions.
646 class partition_iterator;
647 iterator_range<partition_iterator> partitions();
648
649 /// Access the dead users for this alloca.
650 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
651
652 /// Access the users for this alloca that are llvm.protected.field.ptr
653 /// intrinsics.
654 ArrayRef<IntrinsicInst *> getPFPUsers() const { return PFPUsers; }
655
656 /// Access Uses that should be dropped if the alloca is promotable.
657 ArrayRef<Use *> getDeadUsesIfPromotable() const {
658 return DeadUseIfPromotable;
659 }
660
661 /// Access the dead operands referring to this alloca.
662 ///
  /// These are operands which cannot actually be used to refer to the
664 /// alloca as they are outside its range and the user doesn't correct for
665 /// that. These mostly consist of PHI node inputs and the like which we just
666 /// need to replace with undef.
667 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
668
669#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
670 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
671 void printSlice(raw_ostream &OS, const_iterator I,
672 StringRef Indent = " ") const;
673 void printUse(raw_ostream &OS, const_iterator I,
674 StringRef Indent = " ") const;
675 void print(raw_ostream &OS) const;
676 void dump(const_iterator I) const;
677 void dump() const;
678#endif
679
680private:
681 template <typename DerivedT, typename RetT = void> class BuilderBase;
682 class SliceBuilder;
683
684 friend class AllocaSlices::SliceBuilder;
685
686#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
687 /// Handle to alloca instruction to simplify method interfaces.
688 AllocaInst &AI;
689#endif
690
691 /// The instruction responsible for this alloca not having a known set
692 /// of slices.
693 ///
694 /// When an instruction (potentially) escapes the pointer to the alloca, we
695 /// store a pointer to that here and abort trying to form slices of the
696 /// alloca. This will be null if the alloca slices are analyzed successfully.
697 Instruction *PointerEscapingInstr;
698 Instruction *PointerEscapingInstrReadOnly;
699
700 /// The slices of the alloca.
701 ///
702 /// We store a vector of the slices formed by uses of the alloca here. This
703 /// vector is sorted by increasing begin offset, and then the unsplittable
704 /// slices before the splittable ones. See the Slice inner class for more
705 /// details.
706 SmallVector<Slice, 8> Slices;
707
708 /// Instructions which will become dead if we rewrite the alloca.
709 ///
710 /// Note that these are not separated by slice. This is because we expect an
711 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
712 /// all these instructions can simply be removed and replaced with poison as
713 /// they come from outside of the allocated space.
714 SmallVector<Instruction *, 8> DeadUsers;
715
716 /// Users that are llvm.protected.field.ptr intrinsics. These will be RAUW'd
717 /// to their first argument if we rewrite the alloca.
718 SmallVector<IntrinsicInst *, 0> PFPUsers;
719
  /// Uses which will become dead if we can promote the alloca.
721 SmallVector<Use *, 8> DeadUseIfPromotable;
722
723 /// Operands which will become dead if we rewrite the alloca.
724 ///
725 /// These are operands that in their particular use can be replaced with
726 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
727 /// to PHI nodes and the like. They aren't entirely dead (there might be
  /// a GEP back into the bounds using it elsewhere), nor is the PHI, but we
729 /// want to swap this particular input for poison to simplify the use lists of
730 /// the alloca.
731 SmallVector<Use *, 8> DeadOperands;
732};
733
734/// A partition of the slices.
735///
736/// An ephemeral representation for a range of slices which can be viewed as
737/// a partition of the alloca. This range represents a span of the alloca's
738/// memory which cannot be split, and provides access to all of the slices
739/// overlapping some part of the partition.
740///
741/// Objects of this type are produced by traversing the alloca's slices, but
742/// are only ephemeral and not persistent.
743class Partition {
744private:
745 friend class AllocaSlices;
746 friend class AllocaSlices::partition_iterator;
747
748 using iterator = AllocaSlices::iterator;
749
750 /// The beginning and ending offsets of the alloca for this
751 /// partition.
752 uint64_t BeginOffset = 0, EndOffset = 0;
753
754 /// The start and end iterators of this partition.
755 iterator SI, SJ;
756
757 /// A collection of split slice tails overlapping the partition.
758 SmallVector<Slice *, 4> SplitTails;
759
760 /// Raw constructor builds an empty partition starting and ending at
761 /// the given iterator.
762 Partition(iterator SI) : SI(SI), SJ(SI) {}
763
764public:
765 /// The start offset of this partition.
766 ///
767 /// All of the contained slices start at or after this offset.
768 uint64_t beginOffset() const { return BeginOffset; }
769
770 /// The end offset of this partition.
771 ///
772 /// All of the contained slices end at or before this offset.
773 uint64_t endOffset() const { return EndOffset; }
774
775 /// The size of the partition.
776 ///
777 /// Note that this can never be zero.
778 uint64_t size() const {
779 assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
780 return EndOffset - BeginOffset;
781 }
782
783 /// Test whether this partition contains no slices, and merely spans
784 /// a region occupied by split slices.
785 bool empty() const { return SI == SJ; }
786
787 /// \name Iterate slices that start within the partition.
788 /// These may be splittable or unsplittable. They have a begin offset >= the
789 /// partition begin offset.
790 /// @{
791 // FIXME: We should probably define a "concat_iterator" helper and use that
792 // to stitch together pointee_iterators over the split tails and the
793 // contiguous iterators of the partition. That would give a much nicer
794 // interface here. We could then additionally expose filtered iterators for
  // split, unsplit, and unsplittable slices based on the usage patterns.
796 iterator begin() const { return SI; }
797 iterator end() const { return SJ; }
798 /// @}
799
800 /// Get the sequence of split slice tails.
801 ///
802 /// These tails are of slices which start before this partition but are
803 /// split and overlap into the partition. We accumulate these while forming
804 /// partitions.
805 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
806};
807
808} // end anonymous namespace
809
810/// An iterator over partitions of the alloca's slices.
811///
812/// This iterator implements the core algorithm for partitioning the alloca's
813/// slices. It is a forward iterator as we don't support backtracking for
814/// efficiency reasons, and re-use a single storage area to maintain the
815/// current set of split slices.
816///
817/// It is templated on the slice iterator type to use so that it can operate
818/// with either const or non-const slice iterators.
819class AllocaSlices::partition_iterator
820 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
821 Partition> {
822 friend class AllocaSlices;
823
824 /// Most of the state for walking the partitions is held in a class
825 /// with a nice interface for examining them.
826 Partition P;
827
828 /// We need to keep the end of the slices to know when to stop.
829 AllocaSlices::iterator SE;
830
831 /// We also need to keep track of the maximum split end offset seen.
832 /// FIXME: Do we really?
833 uint64_t MaxSplitSliceEndOffset = 0;
834
  /// Sets the partition to be empty at the given iterator, and sets the
836 /// end iterator.
837 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
838 : P(SI), SE(SE) {
839 // If not already at the end, advance our state to form the initial
840 // partition.
841 if (SI != SE)
842 advance();
843 }
844
845 /// Advance the iterator to the next partition.
846 ///
847 /// Requires that the iterator not be at the end of the slices.
848 void advance() {
849 assert((P.SI != SE || !P.SplitTails.empty()) &&
850 "Cannot advance past the end of the slices!");
851
852 // Clear out any split uses which have ended.
853 if (!P.SplitTails.empty()) {
854 if (P.EndOffset >= MaxSplitSliceEndOffset) {
855 // If we've finished all splits, this is easy.
856 P.SplitTails.clear();
857 MaxSplitSliceEndOffset = 0;
858 } else {
859 // Remove the uses which have ended in the prior partition. This
860 // cannot change the max split slice end because we just checked that
861 // the prior partition ended prior to that max.
862 llvm::erase_if(C&: P.SplitTails,
863 P: [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
864 assert(llvm::any_of(P.SplitTails,
865 [&](Slice *S) {
866 return S->endOffset() == MaxSplitSliceEndOffset;
867 }) &&
868 "Could not find the current max split slice offset!");
869 assert(llvm::all_of(P.SplitTails,
870 [&](Slice *S) {
871 return S->endOffset() <= MaxSplitSliceEndOffset;
872 }) &&
873 "Max split slice end offset is not actually the max!");
874 }
875 }
876
877 // If P.SI is already at the end, then we've cleared the split tail and
878 // now have an end iterator.
879 if (P.SI == SE) {
880 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
881 return;
882 }
883
884 // If we had a non-empty partition previously, set up the state for
885 // subsequent partitions.
886 if (P.SI != P.SJ) {
887 // Accumulate all the splittable slices which started in the old
888 // partition into the split list.
889 for (Slice &S : P)
890 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
891 P.SplitTails.push_back(Elt: &S);
892 MaxSplitSliceEndOffset =
893 std::max(a: S.endOffset(), b: MaxSplitSliceEndOffset);
894 }
895
896 // Start from the end of the previous partition.
897 P.SI = P.SJ;
898
899 // If P.SI is now at the end, we at most have a tail of split slices.
900 if (P.SI == SE) {
901 P.BeginOffset = P.EndOffset;
902 P.EndOffset = MaxSplitSliceEndOffset;
903 return;
904 }
905
      // If we have split slices and the next slice is after a gap and is
      // not splittable, immediately form an empty partition for the split
      // slices up until the next slice begins.
909 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
910 !P.SI->isSplittable()) {
911 P.BeginOffset = P.EndOffset;
912 P.EndOffset = P.SI->beginOffset();
913 return;
914 }
915 }
916
917 // OK, we need to consume new slices. Set the end offset based on the
918 // current slice, and step SJ past it. The beginning offset of the
919 // partition is the beginning offset of the next slice unless we have
920 // pre-existing split slices that are continuing, in which case we begin
921 // at the prior end offset.
922 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
923 P.EndOffset = P.SI->endOffset();
924 ++P.SJ;
925
926 // There are two strategies to form a partition based on whether the
927 // partition starts with an unsplittable slice or a splittable slice.
928 if (!P.SI->isSplittable()) {
929 // When we're forming an unsplittable region, it must always start at
930 // the first slice and will extend through its end.
931 assert(P.BeginOffset == P.SI->beginOffset());
932
933 // Form a partition including all of the overlapping slices with this
934 // unsplittable slice.
935 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
936 if (!P.SJ->isSplittable())
937 P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
938 ++P.SJ;
939 }
940
      // We have a partition across a set of overlapping unsplittable
      // slices.
943 return;
944 }
945
946 // If we're starting with a splittable slice, then we need to form
947 // a synthetic partition spanning it and any other overlapping splittable
    // slices.
949 assert(P.SI->isSplittable() && "Forming a splittable partition!");
950
951 // Collect all of the overlapping splittable slices.
952 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
953 P.SJ->isSplittable()) {
954 P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
955 ++P.SJ;
956 }
957
    // Back up P.EndOffset if we ended the span early when encountering an
959 // unsplittable slice. This synthesizes the early end offset of
960 // a partition spanning only splittable slices.
961 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
962 assert(!P.SJ->isSplittable());
963 P.EndOffset = P.SJ->beginOffset();
964 }
965 }
966
967public:
968 bool operator==(const partition_iterator &RHS) const {
969 assert(SE == RHS.SE &&
970 "End iterators don't match between compared partition iterators!");
971
    // The observed positions of partitions are marked by the P.SI iterator and
973 // the emptiness of the split slices. The latter is only relevant when
974 // P.SI == SE, as the end iterator will additionally have an empty split
975 // slices list, but the prior may have the same P.SI and a tail of split
976 // slices.
977 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
978 assert(P.SJ == RHS.P.SJ &&
979 "Same set of slices formed two different sized partitions!");
980 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
981 "Same slice position with differently sized non-empty split "
982 "slice tails!");
983 return true;
984 }
985 return false;
986 }
987
988 partition_iterator &operator++() {
989 advance();
990 return *this;
991 }
992
993 Partition &operator*() { return P; }
994};
995
996/// A forward range over the partitions of the alloca's slices.
997///
998/// This accesses an iterator range over the partitions of the alloca's
999/// slices. It computes these partitions on the fly based on the overlapping
1000/// offsets of the slices and the ability to split them. It will visit "empty"
1001/// partitions to cover regions of the alloca only accessed via split
1002/// slices.
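/// For example (illustrative offsets): given an unsplittable slice [0,4), a
/// splittable slice [0,8) and an unsplittable slice [4,8), two partitions are
/// produced: [0,4), whose slice range contains both slices starting at offset
/// 0, and [4,8), which additionally reports the tail of the [0,8) slice via
/// splitSliceTails().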
1003iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
1004 return make_range(x: partition_iterator(begin(), end()),
1005 y: partition_iterator(end(), end()));
1006}
1007
1008static Value *foldSelectInst(SelectInst &SI) {
1009 // If the condition being selected on is a constant or the same value is
1010 // being selected between, fold the select. Yes this does (rarely) happen
1011 // early on.
1012 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: SI.getCondition()))
1013 return SI.getOperand(i_nocapture: 1 + CI->isZero());
1014 if (SI.getOperand(i_nocapture: 1) == SI.getOperand(i_nocapture: 2))
1015 return SI.getOperand(i_nocapture: 1);
1016
1017 return nullptr;
1018}
1019
1020/// A helper that folds a PHI node or a select.
1021static Value *foldPHINodeOrSelectInst(Instruction &I) {
1022 if (PHINode *PN = dyn_cast<PHINode>(Val: &I)) {
1023 // If PN merges together the same value, return that value.
1024 return PN->hasConstantValue();
1025 }
1026 return foldSelectInst(SI&: cast<SelectInst>(Val&: I));
1027}
1028
1029/// Builder for the alloca slices.
1030///
1031/// This class builds a set of alloca slices by recursively visiting the uses
1032/// of an alloca and making a slice for each load and store at each offset.
1033class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1034 friend class PtrUseVisitor<SliceBuilder>;
1035 friend class InstVisitor<SliceBuilder>;
1036
1037 using Base = PtrUseVisitor<SliceBuilder>;
1038
1039 const uint64_t AllocSize;
1040 AllocaSlices &AS;
1041
1042 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1043 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1044
1045 /// Set to de-duplicate dead instructions found in the use walk.
1046 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1047
1048 // When this access is via an llvm.protected.field.ptr intrinsic, contains
1049 // the second argument to the intrinsic, the discriminator.
1050 Value *ProtectedFieldDisc = nullptr;
1051
1052public:
1053 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1054 : PtrUseVisitor<SliceBuilder>(DL),
1055 AllocSize(AI.getAllocationSize(DL)->getFixedValue()), AS(AS) {}
1056
1057private:
1058 void markAsDead(Instruction &I) {
1059 if (VisitedDeadInsts.insert(Ptr: &I).second)
1060 AS.DeadUsers.push_back(Elt: &I);
1061 }
1062
1063 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1064 bool IsSplittable = false) {
1065 // Completely skip uses which have a zero size or start either before or
1066 // past the end of the allocation.
1067 if (Size == 0 || Offset.uge(RHS: AllocSize)) {
1068 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1069 << Offset
1070 << " which has zero size or starts outside of the "
1071 << AllocSize << " byte alloca:\n"
1072 << " alloca: " << AS.AI << "\n"
1073 << " use: " << I << "\n");
1074 return markAsDead(I);
1075 }
1076
1077 uint64_t BeginOffset = Offset.getZExtValue();
1078 uint64_t EndOffset = BeginOffset + Size;
1079
1080 // Clamp the end offset to the end of the allocation. Note that this is
1081 // formulated to handle even the case where "BeginOffset + Size" overflows.
1082 // This may appear superficially to be something we could ignore entirely,
1083 // but that is not so! There may be widened loads or PHI-node uses where
1084 // some instructions are dead but not others. We can't completely ignore
1085 // them, and so have to record at least the information here.
1086 assert(AllocSize >= BeginOffset); // Established above.
1087 if (Size > AllocSize - BeginOffset) {
1088 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1089 << Offset << " to remain within the " << AllocSize
1090 << " byte alloca:\n"
1091 << " alloca: " << AS.AI << "\n"
1092 << " use: " << I << "\n");
1093 EndOffset = AllocSize;
1094 }
1095
1096 AS.Slices.push_back(
1097 Elt: Slice(BeginOffset, EndOffset, U, IsSplittable, ProtectedFieldDisc));
1098 }
1099
1100 void visitBitCastInst(BitCastInst &BC) {
1101 if (BC.use_empty())
1102 return markAsDead(I&: BC);
1103
1104 return Base::visitBitCastInst(BC);
1105 }
1106
1107 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1108 if (ASC.use_empty())
1109 return markAsDead(I&: ASC);
1110
1111 return Base::visitAddrSpaceCastInst(ASC);
1112 }
1113
1114 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1115 if (GEPI.use_empty())
1116 return markAsDead(I&: GEPI);
1117
1118 return Base::visitGetElementPtrInst(GEPI);
1119 }
1120
1121 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1122 uint64_t Size, bool IsVolatile) {
1123 // We allow splitting of non-volatile loads and stores where the type is an
1124 // integer type. These may be used to implement 'memcpy' or other "transfer
1125 // of bits" patterns.
1126 bool IsSplittable =
1127 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1128
1129 insertUse(I, Offset, Size, IsSplittable);
1130 }
1131
1132 void visitLoadInst(LoadInst &LI) {
1133 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1134 "All simple FCA loads should have been pre-split");
1135
1136 // If there is a load with an unknown offset, we can still perform store
1137 // to load forwarding for other known-offset loads.
1138 if (!IsOffsetKnown)
1139 return PI.setEscapedReadOnly(&LI);
1140
1141 TypeSize Size = DL.getTypeStoreSize(Ty: LI.getType());
1142 if (Size.isScalable()) {
1143 unsigned VScale = LI.getFunction()->getVScaleValue();
1144 if (!VScale)
1145 return PI.setAborted(&LI);
1146
1147 Size = TypeSize::getFixed(ExactSize: Size.getKnownMinValue() * VScale);
1148 }
1149
1150 return handleLoadOrStore(Ty: LI.getType(), I&: LI, Offset, Size: Size.getFixedValue(),
1151 IsVolatile: LI.isVolatile());
1152 }
1153
1154 void visitStoreInst(StoreInst &SI) {
1155 Value *ValOp = SI.getValueOperand();
1156 if (ValOp == *U)
1157 return PI.setEscapedAndAborted(&SI);
1158 if (!IsOffsetKnown)
1159 return PI.setAborted(&SI);
1160
1161 TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValOp->getType());
1162 if (StoreSize.isScalable()) {
1163 unsigned VScale = SI.getFunction()->getVScaleValue();
1164 if (!VScale)
1165 return PI.setAborted(&SI);
1166
1167 StoreSize = TypeSize::getFixed(ExactSize: StoreSize.getKnownMinValue() * VScale);
1168 }
1169
1170 uint64_t Size = StoreSize.getFixedValue();
1171
1172 // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
1174 // ignore it. Note that this is more strict than the generic clamping
1175 // behavior of insertUse. We also try to handle cases which might run the
1176 // risk of overflow.
1177 // FIXME: We should instead consider the pointer to have escaped if this
1178 // function is being instrumented for addressing bugs or race conditions.
1179 if (Size > AllocSize || Offset.ugt(RHS: AllocSize - Size)) {
1180 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1181 << Offset << " which extends past the end of the "
1182 << AllocSize << " byte alloca:\n"
1183 << " alloca: " << AS.AI << "\n"
1184 << " use: " << SI << "\n");
1185 return markAsDead(I&: SI);
1186 }
1187
1188 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1189 "All simple FCA stores should have been pre-split");
1190 handleLoadOrStore(Ty: ValOp->getType(), I&: SI, Offset, Size, IsVolatile: SI.isVolatile());
1191 }
1192
1193 void visitMemSetInst(MemSetInst &II) {
1194 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1195 ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1196 if ((Length && Length->getValue() == 0) ||
1197 (IsOffsetKnown && Offset.uge(RHS: AllocSize)))
1198 // Zero-length mem transfer intrinsics can be ignored entirely.
1199 return markAsDead(I&: II);
1200
1201 if (!IsOffsetKnown)
1202 return PI.setAborted(&II);
1203
1204 insertUse(I&: II, Offset,
1205 Size: Length ? Length->getLimitedValue()
1206 : AllocSize - Offset.getLimitedValue(),
1207 IsSplittable: (bool)Length);
1208 }
1209
1210 void visitMemTransferInst(MemTransferInst &II) {
1211 ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1212 if (Length && Length->getValue() == 0)
1213 // Zero-length mem transfer intrinsics can be ignored entirely.
1214 return markAsDead(I&: II);
1215
1216 // Because we can visit these intrinsics twice, also check to see if the
1217 // first time marked this instruction as dead. If so, skip it.
1218 if (VisitedDeadInsts.count(Ptr: &II))
1219 return;
1220
1221 if (!IsOffsetKnown)
1222 return PI.setAborted(&II);
1223
1224 // This side of the transfer is completely out-of-bounds, and so we can
1225 // nuke the entire transfer. However, we also need to nuke the other side
1226 // if already added to our partitions.
1227 // FIXME: Yet another place we really should bypass this when
1228 // instrumenting for ASan.
1229 if (Offset.uge(RHS: AllocSize)) {
1230 SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
1231 MemTransferSliceMap.find(Val: &II);
1232 if (MTPI != MemTransferSliceMap.end())
1233 AS.Slices[MTPI->second].kill();
1234 return markAsDead(I&: II);
1235 }
1236
1237 uint64_t RawOffset = Offset.getLimitedValue();
1238 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1239
1240 // Check for the special case where the same exact value is used for both
1241 // source and dest.
1242 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1243 // For non-volatile transfers this is a no-op.
1244 if (!II.isVolatile())
1245 return markAsDead(I&: II);
1246
1247 return insertUse(I&: II, Offset, Size, /*IsSplittable=*/false);
1248 }
1249
1250 // If we have seen both source and destination for a mem transfer, then
1251 // they both point to the same alloca.
1252 bool Inserted;
1253 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1254 std::tie(args&: MTPI, args&: Inserted) =
1255 MemTransferSliceMap.insert(KV: std::make_pair(x: &II, y: AS.Slices.size()));
1256 unsigned PrevIdx = MTPI->second;
1257 if (!Inserted) {
1258 Slice &PrevP = AS.Slices[PrevIdx];
1259
1260 // Check if the begin offsets match and this is a non-volatile transfer.
1261 // In that case, we can completely elide the transfer.
1262 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1263 PrevP.kill();
1264 return markAsDead(I&: II);
1265 }
1266
1267 // Otherwise we have an offset transfer within the same alloca. We can't
1268 // split those.
1269 PrevP.makeUnsplittable();
1270 }
1271
1272 // Insert the use now that we've fixed up the splittable nature.
1273 insertUse(I&: II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1274
1275 // Check that we ended up with a valid index in the map.
1276 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1277 "Map index doesn't point back to a slice with this user.");
1278 }
1279
1280 // Disable SRoA for any intrinsics except for lifetime invariants.
1281 // FIXME: What about debug intrinsics? This matches old behavior, but
1282 // doesn't make sense.
1283 void visitIntrinsicInst(IntrinsicInst &II) {
1284 if (II.isDroppable()) {
1285 AS.DeadUseIfPromotable.push_back(Elt: U);
1286 return;
1287 }
1288
1289 if (!IsOffsetKnown)
1290 return PI.setAborted(&II);
1291
1292 if (II.isLifetimeStartOrEnd()) {
1293 insertUse(I&: II, Offset, Size: AllocSize, IsSplittable: true);
1294 return;
1295 }
1296
1297 if (II.getIntrinsicID() == Intrinsic::protected_field_ptr) {
1298 // We only handle loads and stores as users of llvm.protected.field.ptr.
1299 // Other uses may add items to the worklist, which will cause
1300 // ProtectedFieldDisc to be tracked incorrectly.
1301 AS.PFPUsers.push_back(Elt: &II);
1302 ProtectedFieldDisc = II.getArgOperand(i: 1);
1303 for (Use &U : II.uses()) {
1304 this->U = &U;
1305 if (auto *LI = dyn_cast<LoadInst>(Val: U.getUser()))
1306 visitLoadInst(LI&: *LI);
1307 else if (auto *SI = dyn_cast<StoreInst>(Val: U.getUser()))
1308 visitStoreInst(SI&: *SI);
1309 else
1310 PI.setAborted(&II);
1311 if (PI.isAborted())
1312 break;
1313 }
1314 ProtectedFieldDisc = nullptr;
1315 return;
1316 }
1317
1318 Base::visitIntrinsicInst(II);
1319 }
1320
1321 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1322 // We consider any PHI or select that results in a direct load or store of
1323 // the same offset to be a viable use for slicing purposes. These uses
1324 // are considered unsplittable and the size is the maximum loaded or stored
1325 // size.
1326 SmallPtrSet<Instruction *, 4> Visited;
1327 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1328 Visited.insert(Ptr: Root);
1329 Uses.push_back(Elt: std::make_pair(x: cast<Instruction>(Val&: *U), y&: Root));
1330 const DataLayout &DL = Root->getDataLayout();
1331 // If there are no loads or stores, the access is dead. We mark that as
1332 // a size zero access.
1333 Size = 0;
1334 do {
1335 Instruction *I, *UsedI;
1336 std::tie(args&: UsedI, args&: I) = Uses.pop_back_val();
1337
1338 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
1339 TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
1340 if (LoadSize.isScalable()) {
1341 PI.setAborted(LI);
1342 return nullptr;
1343 }
1344 Size = std::max(a: Size, b: LoadSize.getFixedValue());
1345 continue;
1346 }
1347 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
1348 Value *Op = SI->getOperand(i_nocapture: 0);
1349 if (Op == UsedI)
1350 return SI;
1351 TypeSize StoreSize = DL.getTypeStoreSize(Ty: Op->getType());
1352 if (StoreSize.isScalable()) {
1353 PI.setAborted(SI);
1354 return nullptr;
1355 }
1356 Size = std::max(a: Size, b: StoreSize.getFixedValue());
1357 continue;
1358 }
1359
1360 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: I)) {
1361 if (!GEP->hasAllZeroIndices())
1362 return GEP;
1363 } else if (!isa<BitCastInst>(Val: I) && !isa<PHINode>(Val: I) &&
1364 !isa<SelectInst>(Val: I) && !isa<AddrSpaceCastInst>(Val: I)) {
1365 return I;
1366 }
1367
1368 for (User *U : I->users())
1369 if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
1370 Uses.push_back(Elt: std::make_pair(x&: I, y: cast<Instruction>(Val: U)));
1371 } while (!Uses.empty());
1372
1373 return nullptr;
1374 }
1375
1376 void visitPHINodeOrSelectInst(Instruction &I) {
1377 assert(isa<PHINode>(I) || isa<SelectInst>(I));
1378 if (I.use_empty())
1379 return markAsDead(I);
1380
1381 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1382 // instructions in this BB, which may be required during rewriting. Bail out
1383 // on these cases.
1384 if (isa<PHINode>(Val: I) && !I.getParent()->hasInsertionPt())
1385 return PI.setAborted(&I);
1386
1387 // TODO: We could use simplifyInstruction here to fold PHINodes and
1388 // SelectInsts. However, doing so requires to change the current
1389 // dead-operand-tracking mechanism. For instance, suppose neither loading
1390 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1391 // trap either. However, if we simply replace %U with undef using the
1392 // current dead-operand-tracking mechanism, "load (select undef, undef,
1393 // %other)" may trap because the select may return the first operand
1394 // "undef".
1395 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1396 if (Result == *U)
1397 // If the result of the constant fold will be the pointer, recurse
1398 // through the PHI/select as if we had RAUW'ed it.
1399 enqueueUsers(I);
1400 else
1401 // Otherwise the operand to the PHI/select is dead, and we can replace
1402 // it with poison.
1403 AS.DeadOperands.push_back(Elt: U);
1404
1405 return;
1406 }
1407
1408 if (!IsOffsetKnown)
1409 return PI.setAborted(&I);
1410
1411 // See if we already have computed info on this node.
1412 uint64_t &Size = PHIOrSelectSizes[&I];
1413 if (!Size) {
1414 // This is a new PHI/Select, check for an unsafe use of it.
1415 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(Root: &I, Size))
1416 return PI.setAborted(UnsafeI);
1417 }
1418
1419 // For PHI and select operands outside the alloca, we can't nuke the entire
1420 // phi or select -- the other side might still be relevant, so we special
1421 // case them here and use a separate structure to track the operands
1422 // themselves which should be replaced with poison.
1423 // FIXME: This should instead be escaped in the event we're instrumenting
1424 // for address sanitization.
1425 if (Offset.uge(RHS: AllocSize)) {
1426 AS.DeadOperands.push_back(Elt: U);
1427 return;
1428 }
1429
1430 insertUse(I, Offset, Size);
1431 }
1432
1433 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(I&: PN); }
1434
1435 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(I&: SI); }
1436
1437 /// Disable SROA entirely if there are unhandled users of the alloca.
1438 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1439
1440 void visitCallBase(CallBase &CB) {
1441    // If the call only reads memory through this operand and does not capture
1442    // its full provenance, then we mark it as EscapedReadOnly.
1443 if (CB.isDataOperand(U) &&
1444 !capturesFullProvenance(CC: CB.getCaptureInfo(OpNo: U->getOperandNo())) &&
1445 CB.onlyReadsMemory(OpNo: U->getOperandNo())) {
1446 PI.setEscapedReadOnly(&CB);
1447 return;
1448 }
1449
1450 Base::visitCallBase(CB);
1451 }
1452};
1453
1454AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1455 :
1456#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1457 AI(AI),
1458#endif
1459 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1460 SliceBuilder PB(DL, AI, *this);
1461 SliceBuilder::PtrInfo PtrI = PB.visitPtr(I&: AI);
1462 if (PtrI.isEscaped() || PtrI.isAborted()) {
1463 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1464 // possibly by just storing the PtrInfo in the AllocaSlices.
1465 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1466 : PtrI.getAbortingInst();
1467 assert(PointerEscapingInstr && "Did not track a bad instruction");
1468 return;
1469 }
1470 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1471
1472 llvm::erase_if(C&: Slices, P: [](const Slice &S) { return S.isDead(); });
1473
1474 // Sort the uses. This arranges for the offsets to be in ascending order,
1475 // and the sizes to be in descending order.
1476 llvm::stable_sort(Range&: Slices);
1477}
1478
1479#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1480
1481void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1482 StringRef Indent) const {
1483 printSlice(OS, I, Indent);
1484 OS << "\n";
1485 printUse(OS, I, Indent);
1486}
1487
1488void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1489 StringRef Indent) const {
1490 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1491 << " slice #" << (I - begin())
1492 << (I->isSplittable() ? " (splittable)" : "");
1493}
1494
1495void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1496 StringRef Indent) const {
1497 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1498}
1499
1500void AllocaSlices::print(raw_ostream &OS) const {
1501 if (PointerEscapingInstr) {
1502 OS << "Can't analyze slices for alloca: " << AI << "\n"
1503 << " A pointer to this alloca escaped by:\n"
1504 << " " << *PointerEscapingInstr << "\n";
1505 return;
1506 }
1507
1508 if (PointerEscapingInstrReadOnly)
1509 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1510
1511 OS << "Slices of alloca: " << AI << "\n";
1512 for (const_iterator I = begin(), E = end(); I != E; ++I)
1513 print(OS, I);
1514}
1515
1516LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1517 print(dbgs(), I);
1518}
1519LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1520
1521#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1522
1523/// Walk the range of a partitioning looking for a common type to cover this
1524/// sequence of slices.
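///
/// For illustration (not an exhaustive description): if every slice covering
/// the whole range loads or stores an i32, the common type is i32; if one such
/// slice uses float and another uses i32, there is no common type, but i32 is
/// still reported as the largest integer type seen, which callers may use as a
/// fallback.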
1525static std::pair<Type *, IntegerType *>
1526findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1527 uint64_t EndOffset) {
1528 Type *Ty = nullptr;
1529 bool TyIsCommon = true;
1530 IntegerType *ITy = nullptr;
1531
1532 // Note that we need to look at *every* alloca slice's Use to ensure we
1533 // always get consistent results regardless of the order of slices.
1534 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1535 Use *U = I->getUse();
1536 if (isa<IntrinsicInst>(Val: *U->getUser()))
1537 continue;
1538 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1539 continue;
1540
1541 Type *UserTy = nullptr;
1542 if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
1543 UserTy = LI->getType();
1544 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
1545 UserTy = SI->getValueOperand()->getType();
1546 }
1547
1548 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(Val: UserTy)) {
1549 // If the type is larger than the partition, skip it. We only encounter
1550 // this for split integer operations where we want to use the type of the
1551 // entity causing the split. Also skip if the type is not a byte width
1552 // multiple.
1553 if (UserITy->getBitWidth() % 8 != 0 ||
1554 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1555 continue;
1556
1557 // Track the largest bitwidth integer type used in this way in case there
1558 // is no common type.
1559 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1560 ITy = UserITy;
1561 }
1562
1563 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1564 // depend on types skipped above.
1565 if (!UserTy || (Ty && Ty != UserTy))
1566 TyIsCommon = false; // Give up on anything but an iN type.
1567 else
1568 Ty = UserTy;
1569 }
1570
1571 return {TyIsCommon ? Ty : nullptr, ITy};
1572}
1573
1574/// PHI instructions that use an alloca and are subsequently loaded can be
1575/// rewritten to load both input pointers in the pred blocks and then PHI the
1576/// results, allowing the load of the alloca to be promoted.
1577/// From this:
1578/// %P2 = phi [i32* %Alloca, i32* %Other]
1579/// %V = load i32* %P2
1580/// to:
1581/// %V1 = load i32* %Alloca -> will be mem2reg'd
1582/// ...
1583/// %V2 = load i32* %Other
1584/// ...
1585/// %V = phi [i32 %V1, i32 %V2]
1586///
1587/// We can do this to a select if its only uses are loads and if the operands
1588/// to the select can be loaded unconditionally.
1589///
1590/// FIXME: This should be hoisted into a generic utility, likely in
1591/// Transforms/Util/Local.h
1592static bool isSafePHIToSpeculate(PHINode &PN) {
1593 const DataLayout &DL = PN.getDataLayout();
1594
1595 // For now, we can only do this promotion if the load is in the same block
1596 // as the PHI, and if there are no stores between the phi and load.
1597 // TODO: Allow recursive phi users.
1598 // TODO: Allow stores.
1599 BasicBlock *BB = PN.getParent();
1600 Align MaxAlign;
1601 uint64_t APWidth = DL.getIndexTypeSizeInBits(Ty: PN.getType());
1602 Type *LoadType = nullptr;
1603 for (User *U : PN.users()) {
1604 LoadInst *LI = dyn_cast<LoadInst>(Val: U);
1605 if (!LI || !LI->isSimple())
1606 return false;
1607
1608 // For now we only allow loads in the same block as the PHI. This is
1609 // a common case that happens when instcombine merges two loads through
1610 // a PHI.
1611 if (LI->getParent() != BB)
1612 return false;
1613
1614 if (LoadType) {
1615 if (LoadType != LI->getType())
1616 return false;
1617 } else {
1618 LoadType = LI->getType();
1619 }
1620
1621 // Ensure that there are no instructions between the PHI and the load that
1622 // could store.
1623 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1624 if (BBI->mayWriteToMemory())
1625 return false;
1626
1627 MaxAlign = std::max(a: MaxAlign, b: LI->getAlign());
1628 }
1629
1630 if (!LoadType)
1631 return false;
1632
1633 APInt LoadSize =
1634 APInt(APWidth, DL.getTypeStoreSize(Ty: LoadType).getFixedValue());
1635
1636 // We can only transform this if it is safe to push the loads into the
1637 // predecessor blocks. The only thing to watch out for is that we can't put
1638 // a possibly trapping load in the predecessor if it is a critical edge.
1639 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1640 Instruction *TI = PN.getIncomingBlock(i: Idx)->getTerminator();
1641 Value *InVal = PN.getIncomingValue(i: Idx);
1642
1643 // If the value is produced by the terminator of the predecessor (an
1644 // invoke) or it has side-effects, there is no valid place to put a load
1645 // in the predecessor.
1646 if (TI == InVal || TI->mayHaveSideEffects())
1647 return false;
1648
1649 // If the predecessor has a single successor, then the edge isn't
1650 // critical.
1651 if (TI->getNumSuccessors() == 1)
1652 continue;
1653
1654 // If this pointer is always safe to load, or if we can prove that there
1655 // is already a load in the block, then we can move the load to the pred
1656 // block.
1657 if (isSafeToLoadUnconditionally(V: InVal, Alignment: MaxAlign, Size: LoadSize, DL, ScanFrom: TI))
1658 continue;
1659
1660 return false;
1661 }
1662
1663 return true;
1664}
1665
1666static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1667 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1668
1669 LoadInst *SomeLoad = cast<LoadInst>(Val: PN.user_back());
1670 Type *LoadTy = SomeLoad->getType();
1671 IRB.SetInsertPoint(&PN);
1672 PHINode *NewPN = IRB.CreatePHI(Ty: LoadTy, NumReservedValues: PN.getNumIncomingValues(),
1673 Name: PN.getName() + ".sroa.speculated");
1674
1675 // Get the AA tags and alignment to use from one of the loads. It does not
1676  // matter which one we pick or whether any of them differ.
1677 AAMDNodes AATags = SomeLoad->getAAMetadata();
1678 Align Alignment = SomeLoad->getAlign();
1679
1680 // Rewrite all loads of the PN to use the new PHI.
1681 while (!PN.use_empty()) {
1682 LoadInst *LI = cast<LoadInst>(Val: PN.user_back());
1683 LI->replaceAllUsesWith(V: NewPN);
1684 LI->eraseFromParent();
1685 }
1686
1687 // Inject loads into all of the pred blocks.
1688 DenseMap<BasicBlock *, Value *> InjectedLoads;
1689 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1690 BasicBlock *Pred = PN.getIncomingBlock(i: Idx);
1691 Value *InVal = PN.getIncomingValue(i: Idx);
1692
1693 // A PHI node is allowed to have multiple (duplicated) entries for the same
1694 // basic block, as long as the value is the same. So if we already injected
1695 // a load in the predecessor, then we should reuse the same load for all
1696 // duplicated entries.
1697 if (Value *V = InjectedLoads.lookup(Val: Pred)) {
1698 NewPN->addIncoming(V, BB: Pred);
1699 continue;
1700 }
1701
1702 Instruction *TI = Pred->getTerminator();
1703 IRB.SetInsertPoint(TI);
1704
1705 LoadInst *Load = IRB.CreateAlignedLoad(
1706 Ty: LoadTy, Ptr: InVal, Align: Alignment,
1707 Name: (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1708 ++NumLoadsSpeculated;
1709 if (AATags)
1710 Load->setAAMetadata(AATags);
1711 NewPN->addIncoming(V: Load, BB: Pred);
1712 InjectedLoads[Pred] = Load;
1713 }
1714
1715 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1716 PN.eraseFromParent();
1717}
1718
1719SelectHandSpeculativity &
1720SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1721 if (isTrueVal)
1722 Bitfield::set<SelectHandSpeculativity::TrueVal>(Packed&: Storage, Value: true);
1723 else
1724 Bitfield::set<SelectHandSpeculativity::FalseVal>(Packed&: Storage, Value: true);
1725 return *this;
1726}
1727
1728bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1729 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Packed: Storage)
1730 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Packed: Storage);
1731}
1732
1733bool SelectHandSpeculativity::areAllSpeculatable() const {
1734 return isSpeculatable(/*isTrueVal=*/true) &&
1735 isSpeculatable(/*isTrueVal=*/false);
1736}
1737
1738bool SelectHandSpeculativity::areAnySpeculatable() const {
1739 return isSpeculatable(/*isTrueVal=*/true) ||
1740 isSpeculatable(/*isTrueVal=*/false);
1741}
1742bool SelectHandSpeculativity::areNoneSpeculatable() const {
1743 return !areAnySpeculatable();
1744}
1745
1746static SelectHandSpeculativity
1747isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1748 assert(LI.isSimple() && "Only for simple loads");
1749 SelectHandSpeculativity Spec;
1750
1751 const DataLayout &DL = SI.getDataLayout();
1752 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1753 if (isSafeToLoadUnconditionally(V: Value, Ty: LI.getType(), Alignment: LI.getAlign(), DL,
1754 ScanFrom: &LI))
1755 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1756 else if (PreserveCFG)
1757 return Spec;
1758
1759 return Spec;
1760}
1761
1762std::optional<RewriteableMemOps>
1763SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1764 RewriteableMemOps Ops;
1765
1766 for (User *U : SI.users()) {
1767 if (auto *BC = dyn_cast<BitCastInst>(Val: U); BC && BC->hasOneUse())
1768 U = *BC->user_begin();
1769
1770 if (auto *Store = dyn_cast<StoreInst>(Val: U)) {
1771 // Note that atomic stores can be transformed; atomic semantics do not
1772 // have any meaning for a local alloca. Stores are not speculatable,
1773 // however, so if we can't turn it into a predicated store, we are done.
1774 if (Store->isVolatile() || PreserveCFG)
1775 return {}; // Give up on this `select`.
1776 Ops.emplace_back(Args&: Store);
1777 continue;
1778 }
1779
1780 auto *LI = dyn_cast<LoadInst>(Val: U);
1781
1782 // Note that atomic loads can be transformed;
1783 // atomic semantics do not have any meaning for a local alloca.
1784 if (!LI || LI->isVolatile())
1785 return {}; // Give up on this `select`.
1786
1787 PossiblySpeculatableLoad Load(LI);
1788 if (!LI->isSimple()) {
1789      // If the `load` is not simple, we can't speculatively execute it, but we
1790      // can still handle it via a CFG modification when PreserveCFG is not set.
1791 if (PreserveCFG)
1792 return {}; // Give up on this `select`.
1793 Ops.emplace_back(Args&: Load);
1794 continue;
1795 }
1796
1797 SelectHandSpeculativity Spec =
1798 isSafeLoadOfSelectToSpeculate(LI&: *LI, SI, PreserveCFG);
1799 if (PreserveCFG && !Spec.areAllSpeculatable())
1800 return {}; // Give up on this `select`.
1801
1802 Load.setInt(Spec);
1803 Ops.emplace_back(Args&: Load);
1804 }
1805
1806 return Ops;
1807}
1808
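/// Speculate a load of a select by loading from both hands and selecting the
/// loaded values. Roughly (illustrative IR; the real value names carry
/// ".sroa.speculate" suffixes), this rewrites
///   %p = select i1 %c, ptr %a, ptr %b
///   %v = load i32, ptr %p
/// into
///   %v.t = load i32, ptr %a
///   %v.f = load i32, ptr %b
///   %v = select i1 %c, i32 %v.t, i32 %v.f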
1809static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1810 IRBuilderTy &IRB) {
1811 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1812
1813 Value *TV = SI.getTrueValue();
1814 Value *FV = SI.getFalseValue();
1815 // Replace the given load of the select with a select of two loads.
1816
1817 assert(LI.isSimple() && "We only speculate simple loads");
1818
1819 IRB.SetInsertPoint(&LI);
1820
1821 LoadInst *TL =
1822 IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: TV, Align: LI.getAlign(),
1823 Name: LI.getName() + ".sroa.speculate.load.true");
1824 LoadInst *FL =
1825 IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: FV, Align: LI.getAlign(),
1826 Name: LI.getName() + ".sroa.speculate.load.false");
1827 NumLoadsSpeculated += 2;
1828
1829 // Transfer alignment and AA info if present.
1830 TL->setAlignment(LI.getAlign());
1831 FL->setAlignment(LI.getAlign());
1832
1833 AAMDNodes Tags = LI.getAAMetadata();
1834 if (Tags) {
1835 TL->setAAMetadata(Tags);
1836 FL->setAAMetadata(Tags);
1837 }
1838
1839 Value *V = IRB.CreateSelect(C: SI.getCondition(), True: TL, False: FL,
1840 Name: LI.getName() + ".sroa.speculated",
1841 MDFrom: ProfcheckDisableMetadataFixes ? nullptr : &SI);
1842
1843 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1844 LI.replaceAllUsesWith(V);
1845}
1846
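/// Rewrite a load or store of a select that cannot (fully) be speculated by
/// predicating the memory access with explicit control flow. As a rough
/// sketch, a store of the form
///   %p = select i1 %c, ptr %a, ptr %b
///   store i32 %v, ptr %p
/// becomes
///   br i1 %c, label %then, label %else
/// then:
///   store i32 %v, ptr %a
///   br label %cont
/// else:
///   store i32 %v, ptr %b
///   br label %cont
/// (block names are illustrative). Hands that are known speculatable stay in
/// the head block, and for loads a PHI in the continuation block merges the
/// two loaded values.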
1847template <typename T>
1848static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1849 SelectHandSpeculativity Spec,
1850 DomTreeUpdater &DTU) {
1851 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1852 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1853 BasicBlock *Head = I.getParent();
1854 Instruction *ThenTerm = nullptr;
1855 Instruction *ElseTerm = nullptr;
1856 if (Spec.areNoneSpeculatable())
1857 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1858 SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU);
1859 else {
1860 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1861 SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU,
1862 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1863 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1864 cast<BranchInst>(Val: Head->getTerminator())->swapSuccessors();
1865 }
1866 auto *HeadBI = cast<BranchInst>(Val: Head->getTerminator());
1867 Spec = {}; // Do not use `Spec` beyond this point.
1868 BasicBlock *Tail = I.getParent();
1869 Tail->setName(Head->getName() + ".cont");
1870 PHINode *PN;
1871 if (isa<LoadInst>(I))
1872 PN = PHINode::Create(Ty: I.getType(), NumReservedValues: 2, NameStr: "", InsertBefore: I.getIterator());
1873 for (BasicBlock *SuccBB : successors(BB: Head)) {
1874 bool IsThen = SuccBB == HeadBI->getSuccessor(i: 0);
1875 int SuccIdx = IsThen ? 0 : 1;
1876 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1877 auto &CondMemOp = cast<T>(*I.clone());
1878 if (NewMemOpBB != Head) {
1879 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1880 if (isa<LoadInst>(I))
1881 ++NumLoadsPredicated;
1882 else
1883 ++NumStoresPredicated;
1884 } else {
1885 CondMemOp.dropUBImplyingAttrsAndMetadata();
1886 ++NumLoadsSpeculated;
1887 }
1888 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1889 Value *Ptr = SI.getOperand(i_nocapture: 1 + SuccIdx);
1890 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1891 if (isa<LoadInst>(I)) {
1892 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1893 PN->addIncoming(V: &CondMemOp, BB: NewMemOpBB);
1894 } else
1895 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1896 }
1897 if (isa<LoadInst>(I)) {
1898 PN->takeName(V: &I);
1899 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1900 I.replaceAllUsesWith(PN);
1901 }
1902}
1903
1904static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1905 SelectHandSpeculativity Spec,
1906 DomTreeUpdater &DTU) {
1907 if (auto *LI = dyn_cast<LoadInst>(Val: &I))
1908 rewriteMemOpOfSelect(SI&: SelInst, I&: *LI, Spec, DTU);
1909 else if (auto *SI = dyn_cast<StoreInst>(Val: &I))
1910 rewriteMemOpOfSelect(SI&: SelInst, I&: *SI, Spec, DTU);
1911 else
1912 llvm_unreachable_internal(msg: "Only for load and store.");
1913}
1914
1915static bool rewriteSelectInstMemOps(SelectInst &SI,
1916 const RewriteableMemOps &Ops,
1917 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1918 bool CFGChanged = false;
1919 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1920
1921 for (const RewriteableMemOp &Op : Ops) {
1922 SelectHandSpeculativity Spec;
1923 Instruction *I;
1924 if (auto *const *US = std::get_if<UnspeculatableStore>(ptr: &Op)) {
1925 I = *US;
1926 } else {
1927 auto PSL = std::get<PossiblySpeculatableLoad>(v: Op);
1928 I = PSL.getPointer();
1929 Spec = PSL.getInt();
1930 }
1931 if (Spec.areAllSpeculatable()) {
1932 speculateSelectInstLoads(SI, LI&: cast<LoadInst>(Val&: *I), IRB);
1933 } else {
1934 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1935 rewriteMemOpOfSelect(SelInst&: SI, I&: *I, Spec, DTU&: *DTU);
1936 CFGChanged = true;
1937 }
1938 I->eraseFromParent();
1939 }
1940
1941 for (User *U : make_early_inc_range(Range: SI.users()))
1942 cast<BitCastInst>(Val: U)->eraseFromParent();
1943 SI.eraseFromParent();
1944 return CFGChanged;
1945}
1946
1947/// Compute an adjusted pointer from Ptr by Offset bytes where the
1948/// resulting pointer has PointerTy.
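///
/// For a non-zero Offset this emits, roughly, an inbounds byte GEP such as
///   %x.sroa_idx = getelementptr inbounds i8, ptr %Ptr, i64 <Offset>
/// followed by a pointer bitcast or addrspacecast when PointerTy requires one
/// (names are illustrative; the actual prefix comes from NamePrefix).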
1949static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1950 APInt Offset, Type *PointerTy,
1951 const Twine &NamePrefix) {
1952 if (Offset != 0)
1953 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, Offset: IRB.getInt(AI: Offset),
1954 Name: NamePrefix + "sroa_idx");
1955 return IRB.CreatePointerBitCastOrAddrSpaceCast(V: Ptr, DestTy: PointerTy,
1956 Name: NamePrefix + "sroa_cast");
1957}
1958
1959/// Compute the adjusted alignment for a load or store from an offset.
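///
/// For example, an access with alignment 8 that is rewritten at byte offset 4
/// can only be assumed to be 4-byte aligned, since commonAlignment(8, 4) == 4.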
1960static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1961 return commonAlignment(A: getLoadStoreAlignment(I), Offset);
1962}
1963
1964/// Test whether we can convert a value from the old to the new type.
1965///
1966/// This predicate should be used to guard calls to convertValue in order to
1967/// ensure that we only try to convert viable values. The strategy is that we
1968/// will peel off single element struct and array wrappings to get to an
1969/// underlying value, and convert that value.
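///
/// A few illustrative (not exhaustive) cases, assuming 64-bit pointers in an
/// integral address space: i64 <-> ptr and i64 <-> <2 x i32> are convertible
/// because the sizes match and both sides are single-value types, while
/// i32 <-> i64 is not (mismatched integer widths), and casts between integers
/// and non-integral pointers, or involving target extension types, are
/// rejected.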
1970static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1971 unsigned VScale = 0) {
1972 if (OldTy == NewTy)
1973 return true;
1974
1975 // For integer types, we can't handle any bit-width differences. This would
1976 // break both vector conversions with extension and introduce endianness
1977 // issues when in conjunction with loads and stores.
1978 if (isa<IntegerType>(Val: OldTy) && isa<IntegerType>(Val: NewTy)) {
1979 assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1980 cast<IntegerType>(NewTy)->getBitWidth() &&
1981 "We can't have the same bitwidth for different int types");
1982 return false;
1983 }
1984
1985 TypeSize NewSize = DL.getTypeSizeInBits(Ty: NewTy);
1986 TypeSize OldSize = DL.getTypeSizeInBits(Ty: OldTy);
1987
1988 if ((isa<ScalableVectorType>(Val: NewTy) && isa<FixedVectorType>(Val: OldTy)) ||
1989 (isa<ScalableVectorType>(Val: OldTy) && isa<FixedVectorType>(Val: NewTy))) {
1990 // Conversion is only possible when the size of scalable vectors is known.
1991 if (!VScale)
1992 return false;
1993
1994 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1995 // a single domain (either fixed or scalable). Any additional conversion
1996 // between fixed and scalable types is handled through integer types.
1997 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1998 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1999
2000 if (isa<ScalableVectorType>(Val: NewTy)) {
2001 if (!VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: NewVTy), EltTy: OldVTy))
2002 return false;
2003
2004 NewSize = TypeSize::getFixed(ExactSize: NewSize.getKnownMinValue() * VScale);
2005 } else {
2006 if (!VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: OldVTy), EltTy: NewVTy))
2007 return false;
2008
2009 OldSize = TypeSize::getFixed(ExactSize: OldSize.getKnownMinValue() * VScale);
2010 }
2011 }
2012
2013 if (NewSize != OldSize)
2014 return false;
2015 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
2016 return false;
2017
2018 // We can convert pointers to integers and vice-versa. Same for vectors
2019 // of pointers and integers.
2020 OldTy = OldTy->getScalarType();
2021 NewTy = NewTy->getScalarType();
2022 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
2023 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
2024 unsigned OldAS = OldTy->getPointerAddressSpace();
2025 unsigned NewAS = NewTy->getPointerAddressSpace();
2026 // Convert pointers if they are pointers from the same address space or
2027 // different integral (not non-integral) address spaces with the same
2028 // pointer size.
2029 return OldAS == NewAS ||
2030 (!DL.isNonIntegralAddressSpace(AddrSpace: OldAS) &&
2031 !DL.isNonIntegralAddressSpace(AddrSpace: NewAS) &&
2032 DL.getPointerSize(AS: OldAS) == DL.getPointerSize(AS: NewAS));
2033 }
2034
2035 // We can convert integers to integral pointers, but not to non-integral
2036 // pointers.
2037 if (OldTy->isIntegerTy())
2038 return !DL.isNonIntegralPointerType(Ty: NewTy);
2039
2040 // We can convert integral pointers to integers, but non-integral pointers
2041 // need to remain pointers.
2042 if (!DL.isNonIntegralPointerType(Ty: OldTy))
2043 return NewTy->isIntegerTy();
2044
2045 return false;
2046 }
2047
2048 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
2049 return false;
2050
2051 return true;
2052}
2053
2054/// Test whether the given slice use can be promoted to a vector.
2055///
2056/// This function is called to test each slice of a partition that is a
2057/// candidate for promotion to a vector alloca.
2058static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2059 VectorType *Ty,
2060 uint64_t ElementSize,
2061 const DataLayout &DL,
2062 unsigned VScale) {
2063 // First validate the slice offsets.
2064 uint64_t BeginOffset =
2065 std::max(a: S.beginOffset(), b: P.beginOffset()) - P.beginOffset();
2066 uint64_t BeginIndex = BeginOffset / ElementSize;
2067 if (BeginIndex * ElementSize != BeginOffset ||
2068 BeginIndex >= cast<FixedVectorType>(Val: Ty)->getNumElements())
2069 return false;
2070 uint64_t EndOffset = std::min(a: S.endOffset(), b: P.endOffset()) - P.beginOffset();
2071 uint64_t EndIndex = EndOffset / ElementSize;
2072 if (EndIndex * ElementSize != EndOffset ||
2073 EndIndex > cast<FixedVectorType>(Val: Ty)->getNumElements())
2074 return false;
2075
2076 assert(EndIndex > BeginIndex && "Empty vector!");
2077 uint64_t NumElements = EndIndex - BeginIndex;
2078 Type *SliceTy = (NumElements == 1)
2079 ? Ty->getElementType()
2080 : FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElements);
2081
2082 Type *SplitIntTy =
2083 Type::getIntNTy(C&: Ty->getContext(), N: NumElements * ElementSize * 8);
2084
2085 Use *U = S.getUse();
2086
2087 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
2088 if (MI->isVolatile())
2089 return false;
2090 if (!S.isSplittable())
2091 return false; // Skip any unsplittable intrinsics.
2092 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
2093 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2094 return false;
2095 } else if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
2096 if (LI->isVolatile())
2097 return false;
2098 Type *LTy = LI->getType();
2099 // Disable vector promotion when there are loads or stores of an FCA.
2100 if (LTy->isStructTy())
2101 return false;
2102 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2103 assert(LTy->isIntegerTy());
2104 LTy = SplitIntTy;
2105 }
2106 if (!canConvertValue(DL, OldTy: SliceTy, NewTy: LTy, VScale))
2107 return false;
2108 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
2109 if (SI->isVolatile())
2110 return false;
2111 Type *STy = SI->getValueOperand()->getType();
2112 // Disable vector promotion when there are loads or stores of an FCA.
2113 if (STy->isStructTy())
2114 return false;
2115 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2116 assert(STy->isIntegerTy());
2117 STy = SplitIntTy;
2118 }
2119 if (!canConvertValue(DL, OldTy: STy, NewTy: SliceTy, VScale))
2120 return false;
2121 } else {
2122 return false;
2123 }
2124
2125 return true;
2126}
2127
2128/// Test whether any vector type in \p CandidateTys is viable for promotion.
2129///
2130/// This implements the necessary checking for \c isVectorPromotionViable over
2131/// all slices of the alloca for the given VectorType.
2132static VectorType *
2133checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2134 SmallVectorImpl<VectorType *> &CandidateTys,
2135 bool HaveCommonEltTy, Type *CommonEltTy,
2136 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2137 VectorType *CommonVecPtrTy, unsigned VScale) {
2138 // If we didn't find a vector type, nothing to do here.
2139 if (CandidateTys.empty())
2140 return nullptr;
2141
2142  // Pointer-ness is sticky: if we had a vector-of-pointers candidate type,
2143  // then we should choose it, not some other alternative. But we can't
2144  // perform a no-op pointer address space change via bitcast, so if we didn't
2145  // have a common pointer element type, bail.
2146 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2147 return nullptr;
2148
2149 // Try to pick the "best" element type out of the choices.
2150 if (!HaveCommonEltTy && HaveVecPtrTy) {
2151 // If there was a pointer element type, there's really only one choice.
2152 CandidateTys.clear();
2153 CandidateTys.push_back(Elt: CommonVecPtrTy);
2154 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2155 // Integer-ify vector types.
2156 for (VectorType *&VTy : CandidateTys) {
2157 if (!VTy->getElementType()->isIntegerTy())
2158 VTy = cast<VectorType>(Val: VTy->getWithNewType(EltTy: IntegerType::getIntNTy(
2159 C&: VTy->getContext(), N: VTy->getScalarSizeInBits())));
2160 }
2161
2162 // Rank the remaining candidate vector types. This is easy because we know
2163 // they're all integer vectors. We sort by ascending number of elements.
2164 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2165 (void)DL;
2166 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2167 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2168 "Cannot have vector types of different sizes!");
2169 assert(RHSTy->getElementType()->isIntegerTy() &&
2170 "All non-integer types eliminated!");
2171 assert(LHSTy->getElementType()->isIntegerTy() &&
2172 "All non-integer types eliminated!");
2173 return cast<FixedVectorType>(Val: RHSTy)->getNumElements() <
2174 cast<FixedVectorType>(Val: LHSTy)->getNumElements();
2175 };
2176 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2177 (void)DL;
2178 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2179 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2180 "Cannot have vector types of different sizes!");
2181 assert(RHSTy->getElementType()->isIntegerTy() &&
2182 "All non-integer types eliminated!");
2183 assert(LHSTy->getElementType()->isIntegerTy() &&
2184 "All non-integer types eliminated!");
2185 return cast<FixedVectorType>(Val: RHSTy)->getNumElements() ==
2186 cast<FixedVectorType>(Val: LHSTy)->getNumElements();
2187 };
2188 llvm::sort(C&: CandidateTys, Comp: RankVectorTypesComp);
2189 CandidateTys.erase(CS: llvm::unique(R&: CandidateTys, P: RankVectorTypesEq),
2190 CE: CandidateTys.end());
2191 } else {
2192// The only way to have the same element type in every vector type is to
2193// have the same vector type. Check that and remove all but one.
2194#ifndef NDEBUG
2195 for (VectorType *VTy : CandidateTys) {
2196 assert(VTy->getElementType() == CommonEltTy &&
2197 "Unaccounted for element type!");
2198 assert(VTy == CandidateTys[0] &&
2199 "Different vector types with the same element type!");
2200 }
2201#endif
2202 CandidateTys.resize(N: 1);
2203 }
2204
2205 // FIXME: hack. Do we have a named constant for this?
2206 // SDAG SDNode can't have more than 65535 operands.
2207 llvm::erase_if(C&: CandidateTys, P: [](VectorType *VTy) {
2208 return cast<FixedVectorType>(Val: VTy)->getNumElements() >
2209 std::numeric_limits<unsigned short>::max();
2210 });
2211
2212 // Find a vector type viable for promotion by iterating over all slices.
2213 auto *VTy = llvm::find_if(Range&: CandidateTys, P: [&](VectorType *VTy) -> bool {
2214 uint64_t ElementSize =
2215 DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
2216
2217    // While LLVM vectors are bit-packed by definition, we don't support
2218    // element sizes that aren't a whole number of bytes.
2219 if (ElementSize % 8)
2220 return false;
2221 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2222 "vector size not a multiple of element size?");
2223 ElementSize /= 8;
2224
2225 for (const Slice &S : P)
2226 if (!isVectorPromotionViableForSlice(P, S, Ty: VTy, ElementSize, DL, VScale))
2227 return false;
2228
2229 for (const Slice *S : P.splitSliceTails())
2230 if (!isVectorPromotionViableForSlice(P, S: *S, Ty: VTy, ElementSize, DL, VScale))
2231 return false;
2232
2233 return true;
2234 });
2235 return VTy != CandidateTys.end() ? *VTy : nullptr;
2236}
2237
2238static VectorType *createAndCheckVectorTypesForPromotion(
2239 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2240 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2241 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2242 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2243 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2244 [[maybe_unused]] VectorType *OriginalElt =
2245 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2246 // Consider additional vector types where the element type size is a
2247 // multiple of load/store element size.
2248 for (Type *Ty : OtherTys) {
2249 if (!VectorType::isValidElementType(ElemTy: Ty))
2250 continue;
2251 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2252 // Make a copy of CandidateTys and iterate through it, because we
2253 // might append to CandidateTys in the loop.
2254 for (VectorType *const VTy : CandidateTysCopy) {
2255 // The elements in the copy should remain invariant throughout the loop
2256 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2257 unsigned VectorSize = DL.getTypeSizeInBits(Ty: VTy).getFixedValue();
2258 unsigned ElementSize =
2259 DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
2260 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2261 VectorSize % TypeSize == 0) {
2262 VectorType *NewVTy = VectorType::get(ElementType: Ty, NumElements: VectorSize / TypeSize, Scalable: false);
2263 CheckCandidateType(NewVTy);
2264 }
2265 }
2266 }
2267
2268 return checkVectorTypesForPromotion(
2269 P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2270 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2271}
2272
2273/// Test whether the given alloca partitioning and range of slices can be
2274/// promoted to a vector.
2275///
2276/// This is a quick test to check whether we can rewrite a particular alloca
2277/// partition (and its newly formed alloca) into a vector alloca with only
2278/// whole-vector loads and stores such that it could be promoted to a vector
2279/// SSA value. We only can ensure this for a limited set of operations, and we
2280/// don't want to do the rewrites unless we are confident that the result will
2281/// be promotable, so we have an early test here.
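///
/// For illustration, a 16-byte partition accessed only through whole-partition
/// loads/stores of <4 x float> plus element-aligned float loads/stores can
/// typically be promoted with <4 x float> as the vector type, whereas an
/// access that straddles element boundaries defeats vector promotion.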
2282static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2283 unsigned VScale) {
2284 // Collect the candidate types for vector-based promotion. Also track whether
2285 // we have different element types.
2286 SmallVector<VectorType *, 4> CandidateTys;
2287 SetVector<Type *> LoadStoreTys;
2288 SetVector<Type *> DeferredTys;
2289 Type *CommonEltTy = nullptr;
2290 VectorType *CommonVecPtrTy = nullptr;
2291 bool HaveVecPtrTy = false;
2292 bool HaveCommonEltTy = true;
2293 bool HaveCommonVecPtrTy = true;
2294 auto CheckCandidateType = [&](Type *Ty) {
2295 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
2296      // Give up if this candidate's total size in bits differs from the rest.
2297 if (!CandidateTys.empty()) {
2298 VectorType *V = CandidateTys[0];
2299 if (DL.getTypeSizeInBits(Ty: VTy).getFixedValue() !=
2300 DL.getTypeSizeInBits(Ty: V).getFixedValue()) {
2301 CandidateTys.clear();
2302 return;
2303 }
2304 }
2305 CandidateTys.push_back(Elt: VTy);
2306 Type *EltTy = VTy->getElementType();
2307
2308 if (!CommonEltTy)
2309 CommonEltTy = EltTy;
2310 else if (CommonEltTy != EltTy)
2311 HaveCommonEltTy = false;
2312
2313 if (EltTy->isPointerTy()) {
2314 HaveVecPtrTy = true;
2315 if (!CommonVecPtrTy)
2316 CommonVecPtrTy = VTy;
2317 else if (CommonVecPtrTy != VTy)
2318 HaveCommonVecPtrTy = false;
2319 }
2320 }
2321 };
2322
2323 // Put load and store types into a set for de-duplication.
2324 for (const Slice &S : P) {
2325 Type *Ty;
2326 if (auto *LI = dyn_cast<LoadInst>(Val: S.getUse()->getUser()))
2327 Ty = LI->getType();
2328 else if (auto *SI = dyn_cast<StoreInst>(Val: S.getUse()->getUser()))
2329 Ty = SI->getValueOperand()->getType();
2330 else
2331 continue;
2332
2333 auto CandTy = Ty->getScalarType();
2334 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2335 S.endOffset() != P.endOffset())) {
2336 DeferredTys.insert(X: Ty);
2337 continue;
2338 }
2339
2340 LoadStoreTys.insert(X: Ty);
2341    // Only loads or stores whose slice covers the whole partition are candidates.
2342 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2343 CheckCandidateType(Ty);
2344 }
2345
2346 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2347 if (auto *VTy = createAndCheckVectorTypesForPromotion(
2348 OtherTys&: LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2349 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2350 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2351 return VTy;
2352
2353 CandidateTys.clear();
2354 return createAndCheckVectorTypesForPromotion(
2355 OtherTys&: DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2356 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2357 CommonVecPtrTy, VScale);
2358}
2359
2360/// Test whether a slice of an alloca is valid for integer widening.
2361///
2362/// This implements the necessary checking for the \c isIntegerWideningViable
2363/// test below on a single slice of the alloca.
2364static bool isIntegerWideningViableForSlice(const Slice &S,
2365 uint64_t AllocBeginOffset,
2366 Type *AllocaTy,
2367 const DataLayout &DL,
2368 bool &WholeAllocaOp) {
2369 uint64_t Size = DL.getTypeStoreSize(Ty: AllocaTy).getFixedValue();
2370
2371 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2372 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2373
2374 Use *U = S.getUse();
2375
2376  // Lifetime intrinsics operate over the whole alloca, so their slices are
2377  // usually larger than other load/store slices (RelEnd > Size). But lifetime
2378  // intrinsics are always promotable and should not impact the promotability
2379  // of the partition's other slices.
2380 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
2381 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2382 return true;
2383 }
2384
2385 // We can't reasonably handle cases where the load or store extends past
2386 // the end of the alloca's type and into its padding.
2387 if (RelEnd > Size)
2388 return false;
2389
2390 if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
2391 if (LI->isVolatile())
2392 return false;
2393 // We can't handle loads that extend past the allocated memory.
2394 TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
2395 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2396 return false;
2397 // So far, AllocaSliceRewriter does not support widening split slice tails
2398 // in rewriteIntegerLoad.
2399 if (S.beginOffset() < AllocBeginOffset)
2400 return false;
2401 // Note that we don't count vector loads or stores as whole-alloca
2402 // operations which enable integer widening because we would prefer to use
2403 // vector widening instead.
2404 if (!isa<VectorType>(Val: LI->getType()) && RelBegin == 0 && RelEnd == Size)
2405 WholeAllocaOp = true;
2406 if (IntegerType *ITy = dyn_cast<IntegerType>(Val: LI->getType())) {
2407 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
2408 return false;
2409 } else if (RelBegin != 0 || RelEnd != Size ||
2410 !canConvertValue(DL, OldTy: AllocaTy, NewTy: LI->getType())) {
2411 // Non-integer loads need to be convertible from the alloca type so that
2412 // they are promotable.
2413 return false;
2414 }
2415 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
2416 Type *ValueTy = SI->getValueOperand()->getType();
2417 if (SI->isVolatile())
2418 return false;
2419 // We can't handle stores that extend past the allocated memory.
2420 TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValueTy);
2421 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2422 return false;
2423 // So far, AllocaSliceRewriter does not support widening split slice tails
2424 // in rewriteIntegerStore.
2425 if (S.beginOffset() < AllocBeginOffset)
2426 return false;
2427 // Note that we don't count vector loads or stores as whole-alloca
2428 // operations which enable integer widening because we would prefer to use
2429 // vector widening instead.
2430 if (!isa<VectorType>(Val: ValueTy) && RelBegin == 0 && RelEnd == Size)
2431 WholeAllocaOp = true;
2432 if (IntegerType *ITy = dyn_cast<IntegerType>(Val: ValueTy)) {
2433 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
2434 return false;
2435 } else if (RelBegin != 0 || RelEnd != Size ||
2436 !canConvertValue(DL, OldTy: ValueTy, NewTy: AllocaTy)) {
2437 // Non-integer stores need to be convertible to the alloca type so that
2438 // they are promotable.
2439 return false;
2440 }
2441 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
2442 if (MI->isVolatile() || !isa<Constant>(Val: MI->getLength()))
2443 return false;
2444 if (!S.isSplittable())
2445 return false; // Skip any unsplittable intrinsics.
2446 } else {
2447 return false;
2448 }
2449
2450 return true;
2451}
2452
2453/// Test whether the given alloca partition's integer operations can be
2454/// widened to promotable ones.
2455///
2456/// This is a quick test to check whether we can rewrite the integer loads and
2457/// stores to a particular alloca into wider loads and stores and be able to
2458/// promote the resulting alloca.
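///
/// For illustration, an i64 (or double) alloca accessed through i32 loads and
/// stores of its two halves, together with at least one covering full-width
/// access, can typically be widened to i64 operations; any access that runs
/// past the alloca's store size rules widening out.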
2459static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2460 const DataLayout &DL) {
2461 uint64_t SizeInBits = DL.getTypeSizeInBits(Ty: AllocaTy).getFixedValue();
2462 // Don't create integer types larger than the maximum bitwidth.
2463 if (SizeInBits > IntegerType::MAX_INT_BITS)
2464 return false;
2465
2466 // Don't try to handle allocas with bit-padding.
2467 if (SizeInBits != DL.getTypeStoreSizeInBits(Ty: AllocaTy).getFixedValue())
2468 return false;
2469
2470 // We need to ensure that an integer type with the appropriate bitwidth can
2471 // be converted to the alloca type, whatever that is. We don't want to force
2472 // the alloca itself to have an integer type if there is a more suitable one.
2473 Type *IntTy = Type::getIntNTy(C&: AllocaTy->getContext(), N: SizeInBits);
2474 if (!canConvertValue(DL, OldTy: AllocaTy, NewTy: IntTy) ||
2475 !canConvertValue(DL, OldTy: IntTy, NewTy: AllocaTy))
2476 return false;
2477
2478 // While examining uses, we ensure that the alloca has a covering load or
2479 // store. We don't want to widen the integer operations only to fail to
2480 // promote due to some other unsplittable entry (which we may make splittable
2481 // later). However, if there are only splittable uses, go ahead and assume
2482 // that we cover the alloca.
2483 // FIXME: We shouldn't consider split slices that happen to start in the
2484 // partition here...
2485 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(Width: SizeInBits);
2486
2487 for (const Slice &S : P)
2488 if (!isIntegerWideningViableForSlice(S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2489 WholeAllocaOp))
2490 return false;
2491
2492 for (const Slice *S : P.splitSliceTails())
2493 if (!isIntegerWideningViableForSlice(S: *S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2494 WholeAllocaOp))
2495 return false;
2496
2497 return WholeAllocaOp;
2498}
2499
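/// Extract an integer of type Ty from the wider integer V at byte Offset,
/// using a logical shift right followed by a truncate. As a little-endian
/// sketch, extracting an i16 at Offset 2 from an i64 emits roughly
///   %x.shift = lshr i64 %V, 16
///   %x.trunc = trunc i64 %x.shift to i16
/// (names illustrative). On big-endian targets the shift amount is computed
/// from the other end of the value.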
2500static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2501 IntegerType *Ty, uint64_t Offset,
2502 const Twine &Name) {
2503 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2504 IntegerType *IntTy = cast<IntegerType>(Val: V->getType());
2505 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2506 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2507 "Element extends past full value");
2508 uint64_t ShAmt = 8 * Offset;
2509 if (DL.isBigEndian())
2510 ShAmt = 8 * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2511 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2512 if (ShAmt) {
2513 V = IRB.CreateLShr(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2514 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2515 }
2516 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2517 "Cannot extract to a larger integer!");
2518 if (Ty != IntTy) {
2519 V = IRB.CreateTrunc(V, DestTy: Ty, Name: Name + ".trunc");
2520 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2521 }
2522 return V;
2523}
2524
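/// Insert the narrower integer V into Old at byte Offset: zero-extend V, shift
/// it into position, mask the corresponding bits out of Old, and OR the two
/// together. As a little-endian sketch, inserting an i16 at Offset 2 of an i64
/// shifts the extended value left by 16 bits and clears bits 16..31 of Old
/// before the OR.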
2525static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2526 Value *V, uint64_t Offset, const Twine &Name) {
2527 IntegerType *IntTy = cast<IntegerType>(Val: Old->getType());
2528 IntegerType *Ty = cast<IntegerType>(Val: V->getType());
2529 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2530 "Cannot insert a larger integer!");
2531 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2532 if (Ty != IntTy) {
2533 V = IRB.CreateZExt(V, DestTy: IntTy, Name: Name + ".ext");
2534 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2535 }
2536 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2537 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2538 "Element store outside of alloca store");
2539 uint64_t ShAmt = 8 * Offset;
2540 if (DL.isBigEndian())
2541 ShAmt = 8 * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2542 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2543 if (ShAmt) {
2544 V = IRB.CreateShl(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2545 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2546 }
2547
2548 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2549 APInt Mask = ~Ty->getMask().zext(width: IntTy->getBitWidth()).shl(shiftAmt: ShAmt);
2550 Old = IRB.CreateAnd(LHS: Old, RHS: Mask, Name: Name + ".mask");
2551 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2552 V = IRB.CreateOr(LHS: Old, RHS: V, Name: Name + ".insert");
2553 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2554 }
2555 return V;
2556}
2557
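/// Extract the element range [BeginIndex, EndIndex) from the vector V. A
/// single element becomes an extractelement; a proper sub-range becomes a
/// shufflevector, e.g. pulling elements 2..3 out of a <4 x i32> uses the
/// mask <i32 2, i32 3> to produce a <2 x i32>.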
2558static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2559 unsigned EndIndex, const Twine &Name) {
2560 auto *VecTy = cast<FixedVectorType>(Val: V->getType());
2561 unsigned NumElements = EndIndex - BeginIndex;
2562 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2563
2564 if (NumElements == VecTy->getNumElements())
2565 return V;
2566
2567 if (NumElements == 1) {
2568 V = IRB.CreateExtractElement(Vec: V, Idx: IRB.getInt32(C: BeginIndex),
2569 Name: Name + ".extract");
2570 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2571 return V;
2572 }
2573
2574 auto Mask = llvm::to_vector<8>(Range: llvm::seq<int>(Begin: BeginIndex, End: EndIndex));
2575 V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".extract");
2576 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2577 return V;
2578}
2579
2580static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2581 unsigned BeginIndex, const Twine &Name) {
2582 VectorType *VecTy = cast<VectorType>(Val: Old->getType());
2583 assert(VecTy && "Can only insert a vector into a vector");
2584
2585 VectorType *Ty = dyn_cast<VectorType>(Val: V->getType());
2586 if (!Ty) {
2587 // Single element to insert.
2588 V = IRB.CreateInsertElement(Vec: Old, NewElt: V, Idx: IRB.getInt32(C: BeginIndex),
2589 Name: Name + ".insert");
2590 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2591 return V;
2592 }
2593
2594 unsigned NumSubElements = cast<FixedVectorType>(Val: Ty)->getNumElements();
2595 unsigned NumElements = cast<FixedVectorType>(Val: VecTy)->getNumElements();
2596
2597 assert(NumSubElements <= NumElements && "Too many elements!");
2598 if (NumSubElements == NumElements) {
2599 assert(V->getType() == VecTy && "Vector type mismatch");
2600 return V;
2601 }
2602 unsigned EndIndex = BeginIndex + NumSubElements;
2603
2604 // When inserting a smaller vector into the larger to store, we first
2605 // use a shuffle vector to widen it with undef elements, and then
2606 // a second shuffle vector to select between the loaded vector and the
2607 // incoming vector.
2608 SmallVector<int, 8> Mask;
2609 Mask.reserve(N: NumElements);
2610 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2611 if (Idx >= BeginIndex && Idx < EndIndex)
2612 Mask.push_back(Elt: Idx - BeginIndex);
2613 else
2614 Mask.push_back(Elt: -1);
2615 V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".expand");
2616 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2617
2618 Mask.clear();
2619 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2620 if (Idx >= BeginIndex && Idx < EndIndex)
2621 Mask.push_back(Elt: Idx);
2622 else
2623 Mask.push_back(Elt: Idx + NumElements);
2624  V = IRB.CreateShuffleVector(V1: V, V2: Old, Mask, Name: Name + ".blend");
2625 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2626 return V;
2627}
2628
2629/// This function takes two vector values and combines them into a single vector
2630/// by concatenating their elements. The function handles:
2631///
2632/// 1. Element type mismatch: If either vector's element type differs from
2633/// NewAIEltTy, the function bitcasts the vector to use NewAIEltTy while
2634/// preserving the total bit width (adjusting the number of elements
2635/// accordingly).
2636///
2637/// 2. Size mismatch: After transforming the vectors to have the desired element
2638/// type, if the two vectors have different numbers of elements, the smaller
2639/// vector is extended with poison values to match the size of the larger
2640/// vector before concatenation.
2641///
2642/// 3. Concatenation: The vectors are merged using a shuffle operation that
2643/// places all elements of V0 first, followed by all elements of V1.
2644///
2645/// \param V0 The first vector to merge (must be a vector type)
2646/// \param V1 The second vector to merge (must be a vector type)
2647/// \param DL The data layout for size calculations
2648/// \param NewAIEltTy The desired element type for the result vector
2649/// \param Builder IRBuilder for creating new instructions
2650/// \return A new vector containing all elements from V0 followed by all
2651/// elements from V1
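///
/// For example (illustrative), merging a <2 x float> V0 with a <4 x i32> V1
/// when NewAIEltTy is float first bitcasts V1 to <4 x float>, then widens V0
/// with poison elements to <4 x float>, and finally concatenates the two with
/// a shufflevector that selects elements 0-1 of V0 and all four elements of
/// V1, producing a <6 x float> result.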
2652static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL,
2653 Type *NewAIEltTy, IRBuilder<> &Builder) {
2654 // V0 and V1 are vectors
2655 // Create a new vector type with combined elements
2656 // Use ShuffleVector to concatenate the vectors
2657 auto *VecType0 = cast<FixedVectorType>(Val: V0->getType());
2658 auto *VecType1 = cast<FixedVectorType>(Val: V1->getType());
2659
2660  // If V0/V1 element types differ from NewAIEltTy, we need to introduce
2661  // bitcasts before merging them.
2662 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2663 const char *DebugName) {
2664 Type *EltType = VecType->getElementType();
2665 if (EltType != NewAIEltTy) {
2666 // Calculate new number of elements to maintain same bit width
2667 unsigned TotalBits =
2668 VecType->getNumElements() * DL.getTypeSizeInBits(Ty: EltType);
2669 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(Ty: NewAIEltTy);
2670
2671 auto *NewVecType = FixedVectorType::get(ElementType: NewAIEltTy, NumElts: NewNumElts);
2672 V = Builder.CreateBitCast(V, DestTy: NewVecType);
2673 VecType = NewVecType;
2674 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2675 }
2676 };
2677
2678 BitcastIfNeeded(V0, VecType0, "V0");
2679 BitcastIfNeeded(V1, VecType1, "V1");
2680
2681 unsigned NumElts0 = VecType0->getNumElements();
2682 unsigned NumElts1 = VecType1->getNumElements();
2683
2684 SmallVector<int, 16> ShuffleMask;
2685
2686 if (NumElts0 == NumElts1) {
2687 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2688 ShuffleMask.push_back(Elt: i);
2689 } else {
2690 // If two vectors have different sizes, we need to extend
2691 // the smaller vector to the size of the larger vector.
2692 unsigned SmallSize = std::min(a: NumElts0, b: NumElts1);
2693 unsigned LargeSize = std::max(a: NumElts0, b: NumElts1);
2694 bool IsV0Smaller = NumElts0 < NumElts1;
2695 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2696 SmallVector<int, 16> ExtendMask;
2697 for (unsigned i = 0; i < SmallSize; ++i)
2698 ExtendMask.push_back(Elt: i);
2699 for (unsigned i = SmallSize; i < LargeSize; ++i)
2700 ExtendMask.push_back(Elt: PoisonMaskElem);
2701 ExtendedVec = Builder.CreateShuffleVector(
2702 V1: ExtendedVec, V2: PoisonValue::get(T: ExtendedVec->getType()), Mask: ExtendMask);
2703 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2704 for (unsigned i = 0; i < NumElts0; ++i)
2705 ShuffleMask.push_back(Elt: i);
2706 for (unsigned i = 0; i < NumElts1; ++i)
2707 ShuffleMask.push_back(Elt: LargeSize + i);
2708 }
2709
2710 return Builder.CreateShuffleVector(V1: V0, V2: V1, Mask: ShuffleMask);
2711}
2712
2713namespace {
2714
2715/// Visitor to rewrite instructions using a particular slice of an alloca
2716/// to use a new alloca.
2717///
2718/// Also implements the rewriting to vector-based accesses when the partition
2719/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2720/// lives here.
2721class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2722 // Befriend the base class so it can delegate to private visit methods.
2723 friend class InstVisitor<AllocaSliceRewriter, bool>;
2724
2725 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2726
2727 const DataLayout &DL;
2728 AllocaSlices &AS;
2729 SROA &Pass;
2730 AllocaInst &OldAI, &NewAI;
2731 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2732 Type *NewAllocaTy;
2733
2734 // This is a convenience and flag variable that will be null unless the new
2735 // alloca's integer operations should be widened to this integer type due to
2736 // passing isIntegerWideningViable above. If it is non-null, the desired
2737 // integer type will be stored here for easy access during rewriting.
2738 IntegerType *IntTy;
2739
2740 // If we are rewriting an alloca partition which can be written as pure
2741 // vector operations, we stash extra information here. When VecTy is
2742 // non-null, we have some strict guarantees about the rewritten alloca:
2743 // - The new alloca is exactly the size of the vector type here.
2744 // - The accesses all either map to the entire vector or to a single
2745 // element.
2746 // - The set of accessing instructions is only one of those handled above
2747 // in isVectorPromotionViable. Generally these are the same access kinds
2748 // which are promotable via mem2reg.
2749 VectorType *VecTy;
2750 Type *ElementTy;
2751 uint64_t ElementSize;
2752
2753 // The original offset of the slice currently being rewritten relative to
2754 // the original alloca.
2755 uint64_t BeginOffset = 0;
2756 uint64_t EndOffset = 0;
2757
2758 // The new offsets of the slice currently being rewritten relative to the
2759 // original alloca.
2760 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2761
2762 uint64_t SliceSize = 0;
2763 bool IsSplittable = false;
2764 bool IsSplit = false;
2765 Use *OldUse = nullptr;
2766 Instruction *OldPtr = nullptr;
2767
2768 // Track post-rewrite users which are PHI nodes and Selects.
2769 SmallSetVector<PHINode *, 8> &PHIUsers;
2770 SmallSetVector<SelectInst *, 8> &SelectUsers;
2771
2772  // Utility IR builder whose name prefix is set up for each visited use and
2773  // whose insertion point is set to the user.
2774 IRBuilderTy IRB;
2775
2776 // Return the new alloca, addrspacecasted if required to avoid changing the
2777 // addrspace of a volatile access.
2778 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2779 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2780 return &NewAI;
2781
2782 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2783 return IRB.CreateAddrSpaceCast(V: &NewAI, DestTy: AccessTy);
2784 }
2785
2786public:
2787 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2788 AllocaInst &OldAI, AllocaInst &NewAI, Type *NewAllocaTy,
2789 uint64_t NewAllocaBeginOffset,
2790 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2791 VectorType *PromotableVecTy,
2792 SmallSetVector<PHINode *, 8> &PHIUsers,
2793 SmallSetVector<SelectInst *, 8> &SelectUsers)
2794 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2795 NewAllocaBeginOffset(NewAllocaBeginOffset),
2796 NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAllocaTy),
2797 IntTy(IsIntegerPromotable
2798 ? Type::getIntNTy(
2799 C&: NewAI.getContext(),
2800 N: DL.getTypeSizeInBits(Ty: NewAllocaTy).getFixedValue())
2801 : nullptr),
2802 VecTy(PromotableVecTy),
2803 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2804 ElementSize(VecTy ? DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / 8
2805 : 0),
2806 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2807 IRB(NewAI.getContext(), ConstantFolder()) {
2808 if (VecTy) {
2809 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2810 "Only multiple-of-8 sized vector elements are viable");
2811 ++NumVectorized;
2812 }
2813 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2814 }
2815
2816 bool visit(AllocaSlices::const_iterator I) {
2817 bool CanSROA = true;
2818 BeginOffset = I->beginOffset();
2819 EndOffset = I->endOffset();
2820 IsSplittable = I->isSplittable();
2821 IsSplit =
2822 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2823 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2824 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2825 LLVM_DEBUG(dbgs() << "\n");
2826
2827 // Compute the intersecting offset range.
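    // E.g., a slice covering [4, 24) rewritten against a new alloca covering
    // [8, 16) intersects it in [8, 16); such a slice is "split" and only the
    // overlapping bytes are rewritten here.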
2828 assert(BeginOffset < NewAllocaEndOffset);
2829 assert(EndOffset > NewAllocaBeginOffset);
2830 NewBeginOffset = std::max(a: BeginOffset, b: NewAllocaBeginOffset);
2831 NewEndOffset = std::min(a: EndOffset, b: NewAllocaEndOffset);
2832
2833 SliceSize = NewEndOffset - NewBeginOffset;
2834 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2835 << ") NewBegin:(" << NewBeginOffset << ", "
2836 << NewEndOffset << ") NewAllocaBegin:("
2837 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2838 << ")\n");
2839 assert(IsSplit || NewBeginOffset == BeginOffset);
2840 OldUse = I->getUse();
2841 OldPtr = cast<Instruction>(Val: OldUse->get());
2842
2843 Instruction *OldUserI = cast<Instruction>(Val: OldUse->getUser());
2844 IRB.SetInsertPoint(OldUserI);
2845 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2846 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2847 Twine(BeginOffset) + ".");
2848
2849 CanSROA &= visit(I: cast<Instruction>(Val: OldUse->getUser()));
2850 if (VecTy || IntTy)
2851 assert(CanSROA);
2852 return CanSROA;
2853 }
2854
2855 /// Attempts to rewrite a partition using tree-structured merge optimization.
2856 ///
2857 /// This function analyzes a partition to determine if it can be optimized
  /// using a tree-structured merge pattern, where multiple non-overlapping
  /// stores completely fill the alloca and no load from the alloca occurs
  /// between those stores. Such patterns can be optimized by eliminating the
  /// intermediate stores and directly constructing the final vector with
  /// shufflevectors.
2863 ///
2864 /// Example transformation:
2865 /// Before: (stores do not have to be in order)
2866 /// %alloca = alloca <8 x float>
2867 /// store <2 x float> %val0, ptr %alloca ; offset 0-1
2868 /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2869 /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2870 /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2871 ///
2872 /// After:
2873 /// %alloca = alloca <8 x float>
2874 /// %shuffle0 = shufflevector %val0, %val1, <4 x i32> <i32 0, i32 1, i32 2,
2875 /// i32 3>
2876 /// %shuffle1 = shufflevector %val2, %val3, <4 x i32> <i32 0, i32 1, i32 2,
2877 /// i32 3>
2878 /// %shuffle2 = shufflevector %shuffle0, %shuffle1, <8 x i32> <i32 0, i32 1,
2879 /// i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2880 /// store %shuffle2, ptr %alloca
2881 ///
2882 /// The optimization looks for partitions that:
2883 /// 1. Have no overlapping split slice tails
2884 /// 2. Contain non-overlapping stores that cover the entire alloca
  /// 3. Have exactly one load, which reads the complete alloca structure and
  ///    does not appear in the middle of the stores (TODO: maybe we can relax
  ///    the constraint about reading the entire alloca structure)
2888 ///
2889 /// \param P The partition to analyze and potentially rewrite
2890 /// \return An optional vector of values that were deleted during the rewrite
2891 /// process, or std::nullopt if the partition cannot be optimized
2892 /// using tree-structured merge
2893 std::optional<SmallVector<Value *, 4>>
2894 rewriteTreeStructuredMerge(Partition &P) {
    // Bail out if any split slice tails overlap with the partition.
    if (!P.splitSliceTails().empty())
2897 return std::nullopt;
2898
2899 SmallVector<Value *, 4> DeletedValues;
2900 LoadInst *TheLoad = nullptr;
2901
2902 // Structure to hold store information
2903 struct StoreInfo {
2904 StoreInst *Store;
2905 uint64_t BeginOffset;
2906 uint64_t EndOffset;
2907 Value *StoredValue;
2908 StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
2909 : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
2910 };
2911
2912 SmallVector<StoreInfo, 4> StoreInfos;
2913
    // If the new alloca is a fixed vector type, we use its element type as the
    // allocated element type; otherwise we use i8 as the allocated element
    // type.
2916 Type *AllocatedEltTy =
2917 isa<FixedVectorType>(Val: NewAllocaTy)
2918 ? cast<FixedVectorType>(Val: NewAllocaTy)->getElementType()
2919 : Type::getInt8Ty(C&: NewAI.getContext());
2920 unsigned AllocatedEltTySize = DL.getTypeSizeInBits(Ty: AllocatedEltTy);
2921
    // Helper to check that a type is:
    // 1. a fixed vector type,
    // 2. whose element type is not a pointer, and
    // 3. whose element type size is a whole number of bytes.
    // We only handle loads and stores whose types meet these conditions.
2927 auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
2928 auto *FixedVecTy = dyn_cast<FixedVectorType>(Val: Ty);
2929 return FixedVecTy &&
2930 DL.getTypeSizeInBits(Ty: FixedVecTy->getElementType()) % 8 == 0 &&
2931 !FixedVecTy->getElementType()->isPointerTy();
2932 };
2933
2934 for (Slice &S : P) {
2935 auto *User = cast<Instruction>(Val: S.getUse()->getUser());
2936 if (auto *LI = dyn_cast<LoadInst>(Val: User)) {
2937 // Do not handle the case if
2938 // 1. There is more than one load
2939 // 2. The load is volatile
2940 // 3. The load does not read the entire alloca structure
2941 // 4. The load does not meet the conditions in the helper function
2942 if (TheLoad || !IsTypeValidForTreeStructuredMerge(LI->getType()) ||
2943 S.beginOffset() != NewAllocaBeginOffset ||
2944 S.endOffset() != NewAllocaEndOffset || LI->isVolatile())
2945 return std::nullopt;
2946 TheLoad = LI;
2947 } else if (auto *SI = dyn_cast<StoreInst>(Val: User)) {
2948 // Do not handle the case if
2949 // 1. The store does not meet the conditions in the helper function
2950 // 2. The store is volatile
2951 // 3. The total store size is not a multiple of the allocated element
2952 // type size
2953 if (!IsTypeValidForTreeStructuredMerge(
2954 SI->getValueOperand()->getType()) ||
2955 SI->isVolatile())
2956 return std::nullopt;
2957 auto *VecTy = cast<FixedVectorType>(Val: SI->getValueOperand()->getType());
2958 unsigned NumElts = VecTy->getNumElements();
2959 unsigned EltSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
2960 if (NumElts * EltSize % AllocatedEltTySize != 0)
2961 return std::nullopt;
2962 StoreInfos.emplace_back(Args&: SI, Args: S.beginOffset(), Args: S.endOffset(),
2963 Args: SI->getValueOperand());
2964 } else {
2965 // If we have instructions other than load and store, we cannot do the
2966 // tree structured merge
2967 return std::nullopt;
2968 }
2969 }
2970 // If we do not have any load, we cannot do the tree structured merge
2971 if (!TheLoad)
2972 return std::nullopt;
2973
2974 // If we do not have multiple stores, we cannot do the tree structured merge
2975 if (StoreInfos.size() < 2)
2976 return std::nullopt;
2977
2978 // Stores should not overlap and should cover the whole alloca
2979 // Sort by begin offset
2980 llvm::sort(C&: StoreInfos, Comp: [](const StoreInfo &A, const StoreInfo &B) {
2981 return A.BeginOffset < B.BeginOffset;
2982 });
2983
2984 // Check for overlaps and coverage
2985 uint64_t ExpectedStart = NewAllocaBeginOffset;
2986 for (auto &StoreInfo : StoreInfos) {
2987 uint64_t BeginOff = StoreInfo.BeginOffset;
2988 uint64_t EndOff = StoreInfo.EndOffset;
2989
2990 // Check for gap or overlap
2991 if (BeginOff != ExpectedStart)
2992 return std::nullopt;
2993
2994 ExpectedStart = EndOff;
2995 }
2996 // Check that stores cover the entire alloca
2997 if (ExpectedStart != NewAllocaEndOffset)
2998 return std::nullopt;
2999
    // The stores must all be in the same basic block, and the load must not
    // appear in the middle of the stores.
    // Note:
    // If the load is in a different basic block than the stores, we can still
3004 // do the tree structured merge. This is because we do not have the
3005 // store->load forwarding here. The merged vector will be stored back to
3006 // NewAI and the new load will load from NewAI. The forwarding will be
3007 // handled later when we try to promote NewAI.
3008 BasicBlock *LoadBB = TheLoad->getParent();
3009 BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
3010
3011 for (auto &StoreInfo : StoreInfos) {
3012 if (StoreInfo.Store->getParent() != StoreBB)
3013 return std::nullopt;
3014 if (LoadBB == StoreBB && !StoreInfo.Store->comesBefore(Other: TheLoad))
3015 return std::nullopt;
3016 }
3017
3018 // If we reach here, the partition can be merged with a tree structured
3019 // merge
3020 LLVM_DEBUG({
3021 dbgs() << "Tree structured merge rewrite:\n Load: " << *TheLoad
3022 << "\n Ordered stores:\n";
3023 for (auto [i, Info] : enumerate(StoreInfos))
3024 dbgs() << " [" << i << "] Range[" << Info.BeginOffset << ", "
3025 << Info.EndOffset << ") \tStore: " << *Info.Store
3026 << "\tValue: " << *Info.StoredValue << "\n";
3027 });
3028
3029 // Instead of having these stores, we merge all the stored values into a
3030 // vector and store the merged value into the alloca
3031 std::queue<Value *> VecElements;
3032 IRBuilder<> Builder(StoreInfos.back().Store);
3033 for (const auto &Info : StoreInfos) {
3034 DeletedValues.push_back(Elt: Info.Store);
3035 VecElements.push(x: Info.StoredValue);
3036 }
3037
3038 LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
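    // Pairwise-merge adjacent values until a single vector remains. When a
    // round has an odd count, the unpaired front value is rotated to the back
    // so the offset order is preserved. E.g., for ordered values [A, B, C] the
    // queue evolves [A, B, C] -> [AB, C] -> [ABC].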
3039 while (VecElements.size() > 1) {
3040 const auto NumElts = VecElements.size();
3041 for ([[maybe_unused]] const auto _ : llvm::seq(Size: NumElts / 2)) {
3042 Value *V0 = VecElements.front();
3043 VecElements.pop();
3044 Value *V1 = VecElements.front();
3045 VecElements.pop();
3046 Value *Merged = mergeTwoVectors(V0, V1, DL, NewAIEltTy: AllocatedEltTy, Builder);
3047 LLVM_DEBUG(dbgs() << " shufflevector: " << *Merged << "\n");
3048 VecElements.push(x: Merged);
3049 }
3050 if (NumElts % 2 == 1) {
3051 Value *V = VecElements.front();
3052 VecElements.pop();
3053 VecElements.push(x: V);
3054 }
3055 }
3056
3057 // Store the merged value into the alloca
3058 Value *MergedValue = VecElements.front();
3059 Builder.CreateAlignedStore(Val: MergedValue, Ptr: &NewAI, Align: getSliceAlign());
3060
3061 IRBuilder<> LoadBuilder(TheLoad);
3062 TheLoad->replaceAllUsesWith(V: LoadBuilder.CreateAlignedLoad(
3063 Ty: TheLoad->getType(), Ptr: &NewAI, Align: getSliceAlign(), isVolatile: TheLoad->isVolatile(),
3064 Name: TheLoad->getName() + ".sroa.new.load"));
3065 DeletedValues.push_back(Elt: TheLoad);
3066
3067 return DeletedValues;
3068 }
3069
3070private:
3071 // Make sure the other visit overloads are visible.
3072 using Base::visit;
3073
3074 // Every instruction which can end up as a user must have a rewrite rule.
3075 bool visitInstruction(Instruction &I) {
3076 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
3077 llvm_unreachable("No rewrite rule for this instruction!");
3078 }
3079
3080 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3081 // Note that the offset computation can use BeginOffset or NewBeginOffset
3082 // interchangeably for unsplit slices.
3083 assert(IsSplit || BeginOffset == NewBeginOffset);
3084 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3085
3086 StringRef OldName = OldPtr->getName();
3087 // Skip through the last '.sroa.' component of the name.
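    // E.g., a hypothetical pointer named "x.sroa.0.16.copyload" is reduced to
    // "copyload" below: the ".sroa." component, the slice index "0", and the
    // offset "16" are all stripped so rewritten names stay readable.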
3088 size_t LastSROAPrefix = OldName.rfind(Str: ".sroa.");
3089 if (LastSROAPrefix != StringRef::npos) {
3090 OldName = OldName.substr(Start: LastSROAPrefix + strlen(s: ".sroa."));
3091 // Look for an SROA slice index.
3092 size_t IndexEnd = OldName.find_first_not_of(Chars: "0123456789");
3093 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3094 // Strip the index and look for the offset.
3095 OldName = OldName.substr(Start: IndexEnd + 1);
3096 size_t OffsetEnd = OldName.find_first_not_of(Chars: "0123456789");
3097 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3098 // Strip the offset.
3099 OldName = OldName.substr(Start: OffsetEnd + 1);
3100 }
3101 }
3102 // Strip any SROA suffixes as well.
3103 OldName = OldName.substr(Start: 0, N: OldName.find(Str: ".sroa_"));
3104
3105 return getAdjustedPtr(IRB, DL, Ptr: &NewAI,
3106 Offset: APInt(DL.getIndexTypeSizeInBits(Ty: PointerTy), Offset),
3107 PointerTy, NamePrefix: Twine(OldName) + ".");
3108 }
3109
3110 /// Compute suitable alignment to access this slice of the *new*
3111 /// alloca.
  ///
  /// The result is derived from the new alloca's alignment and the slice's
  /// offset within it, so it is always a valid (possibly conservative)
  /// alignment for the access.
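  /// For example, a 16-byte-aligned new alloca accessed at slice offset 4
  /// can only be assumed 4-byte aligned (the largest power of two dividing
  /// both).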
3115 Align getSliceAlign() {
3116 return commonAlignment(A: NewAI.getAlign(),
3117 Offset: NewBeginOffset - NewAllocaBeginOffset);
3118 }
3119
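  /// Map an offset within the original alloca to a vector element index of
  /// the new alloca. E.g., with 4-byte elements and a new alloca beginning at
  /// offset 0, an offset of 12 maps to index 3.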
3120 unsigned getIndex(uint64_t Offset) {
3121 assert(VecTy && "Can only call getIndex when rewriting a vector");
3122 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3123 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3124 uint32_t Index = RelOffset / ElementSize;
3125 assert(Index * ElementSize == RelOffset);
3126 return Index;
3127 }
3128
3129 void deleteIfTriviallyDead(Value *V) {
3130 Instruction *I = cast<Instruction>(Val: V);
3131 if (isInstructionTriviallyDead(I))
3132 Pass.DeadInsts.push_back(Elt: I);
3133 }
3134
3135 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3136 unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3137 unsigned EndIndex = getIndex(Offset: NewEndOffset);
3138 assert(EndIndex > BeginIndex && "Empty vector!");
3139
3140 LoadInst *Load =
3141 IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3142
3143 Load->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3144 LLVMContext::MD_access_group});
3145 return extractVector(IRB, V: Load, BeginIndex, EndIndex, Name: "vec");
3146 }
3147
3148 Value *rewriteIntegerLoad(LoadInst &LI) {
3149 assert(IntTy && "We cannot insert an integer to the alloca");
3150 assert(!LI.isVolatile());
3151 Value *V =
3152 IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3153 V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: IntTy);
3154 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3155 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3156 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
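      // The slice covers only part of the widened integer, so extract the
      // SliceSize-byte sub-integer at this offset. E.g., reading 2 bytes at
      // byte offset 2 of an i64-widened alloca extracts bits [16, 32) on a
      // little-endian target (extractInteger handles the endian adjustment).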
3157 IntegerType *ExtractTy = Type::getIntNTy(C&: LI.getContext(), N: SliceSize * 8);
3158 V = extractInteger(DL, IRB, V, Ty: ExtractTy, Offset, Name: "extract");
3159 }
3160 // It is possible that the extracted type is not the load type. This
3161 // happens if there is a load past the end of the alloca, and as
3162 // a consequence the slice is narrower but still a candidate for integer
3163 // lowering. To handle this case, we just zero extend the extracted
3164 // integer.
3165 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
3166 "Can only handle an extract for an overly wide load");
3167 if (cast<IntegerType>(Val: LI.getType())->getBitWidth() > SliceSize * 8)
3168 V = IRB.CreateZExt(V, DestTy: LI.getType());
3169 return V;
3170 }
3171
3172 bool visitLoadInst(LoadInst &LI) {
3173 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3174 Value *OldOp = LI.getOperand(i_nocapture: 0);
3175 assert(OldOp == OldPtr);
3176
3177 AAMDNodes AATags = LI.getAAMetadata();
3178
3179 unsigned AS = LI.getPointerAddressSpace();
3180
3181 Type *TargetTy = IsSplit ? Type::getIntNTy(C&: LI.getContext(), N: SliceSize * 8)
3182 : LI.getType();
3183 bool IsPtrAdjusted = false;
3184 Value *V;
3185 if (VecTy) {
3186 V = rewriteVectorizedLoadInst(LI);
3187 } else if (IntTy && LI.getType()->isIntegerTy()) {
3188 V = rewriteIntegerLoad(LI);
3189 } else if (NewBeginOffset == NewAllocaBeginOffset &&
3190 NewEndOffset == NewAllocaEndOffset &&
3191 (canConvertValue(DL, OldTy: NewAllocaTy, NewTy: TargetTy) ||
3192 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
3193 DL.getTypeStoreSize(Ty: TargetTy).getFixedValue() > SliceSize &&
3194 !LI.isVolatile()))) {
3195 Value *NewPtr =
3196 getPtrToNewAI(AddrSpace: LI.getPointerAddressSpace(), IsVolatile: LI.isVolatile());
3197 LoadInst *NewLI = IRB.CreateAlignedLoad(
3198 Ty: NewAllocaTy, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: LI.isVolatile(), Name: LI.getName());
3199 if (LI.isVolatile())
3200 NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
3201 if (NewLI->isAtomic())
3202 NewLI->setAlignment(LI.getAlign());
3203
3204 // Copy any metadata that is valid for the new load. This may require
3205 // conversion to a different kind of metadata, e.g. !nonnull might change
3206 // to !range or vice versa.
3207 copyMetadataForLoad(Dest&: *NewLI, Source: LI);
3208
3209 // Do this after copyMetadataForLoad() to preserve the TBAA shift.
3210 if (AATags)
3211 NewLI->setAAMetadata(AATags.adjustForAccess(
3212 Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));
3213
3214 // Try to preserve nonnull metadata
3215 V = NewLI;
3216
3217 // If this is an integer load past the end of the slice (which means the
3218 // bytes outside the slice are undef or this load is dead) just forcibly
3219 // fix the integer size with correct handling of endianness.
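      // E.g., widening an i16 alloca value to an i32 load on a big-endian
      // target must place the defined bytes in the most significant half of
      // the result, hence the extra shift below.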
3220 if (auto *AITy = dyn_cast<IntegerType>(Val: NewAllocaTy))
3221 if (auto *TITy = dyn_cast<IntegerType>(Val: TargetTy))
3222 if (AITy->getBitWidth() < TITy->getBitWidth()) {
3223 V = IRB.CreateZExt(V, DestTy: TITy, Name: "load.ext");
3224 if (DL.isBigEndian())
3225 V = IRB.CreateShl(LHS: V, RHS: TITy->getBitWidth() - AITy->getBitWidth(),
3226 Name: "endian_shift");
3227 }
3228 } else {
3229 Type *LTy = IRB.getPtrTy(AddrSpace: AS);
3230 LoadInst *NewLI =
3231 IRB.CreateAlignedLoad(Ty: TargetTy, Ptr: getNewAllocaSlicePtr(IRB, PointerTy: LTy),
3232 Align: getSliceAlign(), isVolatile: LI.isVolatile(), Name: LI.getName());
3233
3234 if (AATags)
3235 NewLI->setAAMetadata(AATags.adjustForAccess(
3236 Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));
3237
3238 if (LI.isVolatile())
3239 NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
3240 NewLI->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3241 LLVMContext::MD_access_group});
3242
3243 V = NewLI;
3244 IsPtrAdjusted = true;
3245 }
3246 V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: TargetTy);
3247
3248 if (IsSplit) {
3249 assert(!LI.isVolatile());
3250 assert(LI.getType()->isIntegerTy() &&
3251 "Only integer type loads and stores are split");
3252 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3253 "Split load isn't smaller than original load");
3254 assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3255 "Non-byte-multiple bit width");
3256 // Move the insertion point just past the load so that we can refer to it.
3257 BasicBlock::iterator LIIt = std::next(x: LI.getIterator());
3258 // Ensure the insertion point comes before any debug-info immediately
3259 // after the load, so that variable values referring to the load are
3260 // dominated by it.
3261 LIIt.setHeadBit(true);
3262 IRB.SetInsertPoint(TheBB: LI.getParent(), IP: LIIt);
3263 // Create a placeholder value with the same type as LI to use as the
3264 // basis for the new value. This allows us to replace the uses of LI with
3265 // the computed value, and then replace the placeholder with LI, leaving
3266 // LI only used for this computation.
3267 Value *Placeholder =
3268 new LoadInst(LI.getType(), PoisonValue::get(T: IRB.getPtrTy(AddrSpace: AS)), "",
3269 false, Align(1));
3270 V = insertInteger(DL, IRB, Old: Placeholder, V, Offset: NewBeginOffset - BeginOffset,
3271 Name: "insert");
3272 LI.replaceAllUsesWith(V);
3273 Placeholder->replaceAllUsesWith(V: &LI);
3274 Placeholder->deleteValue();
3275 } else {
3276 LI.replaceAllUsesWith(V);
3277 }
3278
3279 Pass.DeadInsts.push_back(Elt: &LI);
3280 deleteIfTriviallyDead(V: OldOp);
3281 LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3282 return !LI.isVolatile() && !IsPtrAdjusted;
3283 }
3284
3285 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3286 AAMDNodes AATags) {
3287 // Capture V for the purpose of debug-info accounting once it's converted
3288 // to a vector store.
3289 Value *OrigV = V;
3290 if (V->getType() != VecTy) {
3291 unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3292 unsigned EndIndex = getIndex(Offset: NewEndOffset);
3293 assert(EndIndex > BeginIndex && "Empty vector!");
3294 unsigned NumElements = EndIndex - BeginIndex;
3295 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3296 "Too many elements!");
3297 Type *SliceTy = (NumElements == 1)
3298 ? ElementTy
3299 : FixedVectorType::get(ElementType: ElementTy, NumElts: NumElements);
3300 if (V->getType() != SliceTy)
3301 V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: SliceTy);
3302
3303 // Mix in the existing elements.
3304 Value *Old =
3305 IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3306 V = insertVector(IRB, Old, V, BeginIndex, Name: "vec");
3307 }
3308 StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
3309 Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3310 LLVMContext::MD_access_group});
3311 if (AATags)
3312 Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3313 AccessTy: V->getType(), DL));
3314 Pass.DeadInsts.push_back(Elt: &SI);
3315
3316 // NOTE: Careful to use OrigV rather than V.
3317 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
3318 Inst: Store, Dest: Store->getPointerOperand(), Value: OrigV, DL);
3319 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3320 return true;
3321 }
3322
3323 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3324 assert(IntTy && "We cannot extract an integer from the alloca");
3325 assert(!SI.isVolatile());
3326 if (DL.getTypeSizeInBits(Ty: V->getType()).getFixedValue() !=
3327 IntTy->getBitWidth()) {
3328 Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
3329 Name: "oldload");
3330 Old = IRB.CreateBitPreservingCastChain(DL, V: Old, NewTy: IntTy);
3331 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3332 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3333 V = insertInteger(DL, IRB, Old, V: SI.getValueOperand(), Offset, Name: "insert");
3334 }
3335 V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
3336 StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
3337 Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3338 LLVMContext::MD_access_group});
3339 if (AATags)
3340 Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3341 AccessTy: V->getType(), DL));
3342
3343 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
3344 Inst: Store, Dest: Store->getPointerOperand(),
3345 Value: Store->getValueOperand(), DL);
3346
3347 Pass.DeadInsts.push_back(Elt: &SI);
3348 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3349 return true;
3350 }
3351
3352 bool visitStoreInst(StoreInst &SI) {
3353 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3354 Value *OldOp = SI.getOperand(i_nocapture: 1);
3355 assert(OldOp == OldPtr);
3356
3357 AAMDNodes AATags = SI.getAAMetadata();
3358 Value *V = SI.getValueOperand();
3359
3360 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3361 // alloca that should be re-examined after promoting this alloca.
3362 if (V->getType()->isPointerTy())
3363 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: V->stripInBoundsOffsets()))
3364 Pass.PostPromotionWorklist.insert(X: AI);
3365
3366 TypeSize StoreSize = DL.getTypeStoreSize(Ty: V->getType());
3367 if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3368 assert(!SI.isVolatile());
3369 assert(V->getType()->isIntegerTy() &&
3370 "Only integer type loads and stores are split");
3371 assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3372 "Non-byte-multiple bit width");
3373 IntegerType *NarrowTy = Type::getIntNTy(C&: SI.getContext(), N: SliceSize * 8);
3374 V = extractInteger(DL, IRB, V, Ty: NarrowTy, Offset: NewBeginOffset - BeginOffset,
3375 Name: "extract");
3376 }
3377
3378 if (VecTy)
3379 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3380 if (IntTy && V->getType()->isIntegerTy())
3381 return rewriteIntegerStore(V, SI, AATags);
3382
3383 StoreInst *NewSI;
3384 if (NewBeginOffset == NewAllocaBeginOffset &&
3385 NewEndOffset == NewAllocaEndOffset &&
3386 canConvertValue(DL, OldTy: V->getType(), NewTy: NewAllocaTy)) {
3387 V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
3388 Value *NewPtr =
3389 getPtrToNewAI(AddrSpace: SI.getPointerAddressSpace(), IsVolatile: SI.isVolatile());
3390
3391 NewSI =
3392 IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: SI.isVolatile());
3393 } else {
3394 unsigned AS = SI.getPointerAddressSpace();
3395 Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: IRB.getPtrTy(AddrSpace: AS));
3396 NewSI =
3397 IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: getSliceAlign(), isVolatile: SI.isVolatile());
3398 }
3399 NewSI->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3400 LLVMContext::MD_access_group});
3401 if (AATags)
3402 NewSI->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3403 AccessTy: V->getType(), DL));
3404 if (SI.isVolatile())
3405 NewSI->setAtomic(Ordering: SI.getOrdering(), SSID: SI.getSyncScopeID());
3406 if (NewSI->isAtomic())
3407 NewSI->setAlignment(SI.getAlign());
3408
3409 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
3410 Inst: NewSI, Dest: NewSI->getPointerOperand(),
3411 Value: NewSI->getValueOperand(), DL);
3412
3413 Pass.DeadInsts.push_back(Elt: &SI);
3414 deleteIfTriviallyDead(V: OldOp);
3415
3416 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3417 return NewSI->getPointerOperand() == &NewAI &&
3418 NewSI->getValueOperand()->getType() == NewAllocaTy &&
3419 !SI.isVolatile();
3420 }
3421
3422 /// Compute an integer value from splatting an i8 across the given
3423 /// number of bytes.
3424 ///
3425 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3426 /// call this routine.
3427 /// FIXME: Heed the advice above.
3428 ///
3429 /// \param V The i8 value to splat.
3430 /// \param Size The number of bytes in the output (assuming i8 is one byte)
3431 Value *getIntegerSplat(Value *V, unsigned Size) {
3432 assert(Size > 0 && "Expected a positive number of bytes.");
3433 IntegerType *VTy = cast<IntegerType>(Val: V->getType());
3434 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3435 if (Size == 1)
3436 return V;
3437
3438 Type *SplatIntTy = Type::getIntNTy(C&: VTy->getContext(), N: Size * 8);
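    // Splat by multiplying the zero-extended byte with the all-bytes-one
    // constant 0x0101...01, computed below as (2^(8*Size) - 1) / 0xFF.
    // E.g., splatting 0xAB across 4 bytes yields 0xAB * 0x01010101 ==
    // 0xABABABAB.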
3439 V = IRB.CreateMul(
3440 LHS: IRB.CreateZExt(V, DestTy: SplatIntTy, Name: "zext"),
3441 RHS: IRB.CreateUDiv(LHS: Constant::getAllOnesValue(Ty: SplatIntTy),
3442 RHS: IRB.CreateZExt(V: Constant::getAllOnesValue(Ty: V->getType()),
3443 DestTy: SplatIntTy)),
3444 Name: "isplat");
3445 return V;
3446 }
3447
3448 /// Compute a vector splat for a given element value.
3449 Value *getVectorSplat(Value *V, unsigned NumElements) {
3450 V = IRB.CreateVectorSplat(NumElts: NumElements, V, Name: "vsplat");
3451 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3452 return V;
3453 }
3454
3455 bool visitMemSetInst(MemSetInst &II) {
3456 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3457 assert(II.getRawDest() == OldPtr);
3458
3459 AAMDNodes AATags = II.getAAMetadata();
3460
3461 // If the memset has a variable size, it cannot be split, just adjust the
3462 // pointer to the new alloca.
3463 if (!isa<ConstantInt>(Val: II.getLength())) {
3464 assert(!IsSplit);
3465 assert(NewBeginOffset == BeginOffset);
3466 II.setDest(getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()));
3467 II.setDestAlignment(getSliceAlign());
3468 // In theory we should call migrateDebugInfo here. However, we do not
3469 // emit dbg.assign intrinsics for mem intrinsics storing through non-
3470 // constant geps, or storing a variable number of bytes.
3471 assert(at::getDVRAssignmentMarkers(&II).empty() &&
3472 "AT: Unexpected link to non-const GEP");
3473 deleteIfTriviallyDead(V: OldPtr);
3474 return false;
3475 }
3476
3477 // Record this instruction for deletion.
3478 Pass.DeadInsts.push_back(Elt: &II);
3479
3480 Type *ScalarTy = NewAllocaTy->getScalarType();
3481
3482 const bool CanContinue = [&]() {
3483 if (VecTy || IntTy)
3484 return true;
3485 if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3486 return false;
3487 // Length must be in range for FixedVectorType.
3488 auto *C = cast<ConstantInt>(Val: II.getLength());
3489 const uint64_t Len = C->getLimitedValue();
3490 if (Len > std::numeric_limits<unsigned>::max())
3491 return false;
3492 auto *Int8Ty = IntegerType::getInt8Ty(C&: NewAI.getContext());
3493 auto *SrcTy = FixedVectorType::get(ElementType: Int8Ty, NumElts: Len);
3494 return canConvertValue(DL, OldTy: SrcTy, NewTy: NewAllocaTy) &&
3495 DL.isLegalInteger(Width: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue());
3496 }();
3497
3498 // If this doesn't map cleanly onto the alloca type, and that type isn't
3499 // a single value type, just emit a memset.
3500 if (!CanContinue) {
3501 Type *SizeTy = II.getLength()->getType();
3502 unsigned Sz = NewEndOffset - NewBeginOffset;
3503 Constant *Size = ConstantInt::get(Ty: SizeTy, V: Sz);
3504 MemIntrinsic *New = cast<MemIntrinsic>(Val: IRB.CreateMemSet(
3505 Ptr: getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()), Val: II.getValue(), Size,
3506 Align: MaybeAlign(getSliceAlign()), isVolatile: II.isVolatile()));
3507 if (AATags)
3508 New->setAAMetadata(
3509 AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset, AccessSize: Sz));
3510
3511 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
3512 Inst: New, Dest: New->getRawDest(), Value: nullptr, DL);
3513
3514 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3515 return false;
3516 }
3517
3518 // If we can represent this as a simple value, we have to build the actual
3519 // value to store, which requires expanding the byte present in memset to
3520 // a sensible representation for the alloca type. This is essentially
3521 // splatting the byte to a sufficiently wide integer, splatting it across
3522 // any desired vector width, and bitcasting to the final type.
3523 Value *V;
3524
3525 if (VecTy) {
3526 // If this is a memset of a vectorized alloca, insert it.
3527 assert(ElementTy == ScalarTy);
3528
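      // E.g., a memset of byte 0x2A over two i16 elements of the vector is
      // rewritten as a splat of i16 0x2A2A, widened to <2 x i16> and inserted
      // into the vector value loaded from the new alloca.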
3529 unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3530 unsigned EndIndex = getIndex(Offset: NewEndOffset);
3531 assert(EndIndex > BeginIndex && "Empty vector!");
3532 unsigned NumElements = EndIndex - BeginIndex;
3533 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3534 "Too many elements!");
3535
3536 Value *Splat = getIntegerSplat(
3537 V: II.getValue(), Size: DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / 8);
3538 Splat = IRB.CreateBitPreservingCastChain(DL, V: Splat, NewTy: ElementTy);
3539 if (NumElements > 1)
3540 Splat = getVectorSplat(V: Splat, NumElements);
3541
3542 Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
3543 Name: "oldload");
3544 V = insertVector(IRB, Old, V: Splat, BeginIndex, Name: "vec");
3545 } else if (IntTy) {
3546 // If this is a memset on an alloca where we can widen stores, insert the
3547 // set integer.
3548 assert(!II.isVolatile());
3549
3550 uint64_t Size = NewEndOffset - NewBeginOffset;
3551 V = getIntegerSplat(V: II.getValue(), Size);
3552
      if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
                    EndOffset != NewAllocaEndOffset)) {
3555 Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI,
3556 Align: NewAI.getAlign(), Name: "oldload");
3557 Old = IRB.CreateBitPreservingCastChain(DL, V: Old, NewTy: IntTy);
3558 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3559 V = insertInteger(DL, IRB, Old, V, Offset, Name: "insert");
3560 } else {
3561 assert(V->getType() == IntTy &&
3562 "Wrong type for an alloca wide integer!");
3563 }
3564 V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
3565 } else {
3566 // Established these invariants above.
3567 assert(NewBeginOffset == NewAllocaBeginOffset);
3568 assert(NewEndOffset == NewAllocaEndOffset);
3569
3570 V = getIntegerSplat(V: II.getValue(),
3571 Size: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue() / 8);
3572 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(Val: NewAllocaTy))
3573 V = getVectorSplat(
3574 V, NumElements: cast<FixedVectorType>(Val: AllocaVecTy)->getNumElements());
3575
3576 V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
3577 }
3578
3579 Value *NewPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
3580 StoreInst *New =
3581 IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: II.isVolatile());
3582 New->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3583 LLVMContext::MD_access_group});
3584 if (AATags)
3585 New->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3586 AccessTy: V->getType(), DL));
3587
3588 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
3589 Inst: New, Dest: New->getPointerOperand(), Value: V, DL);
3590
3591 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3592 return !II.isVolatile();
3593 }
3594
3595 bool visitMemTransferInst(MemTransferInst &II) {
3596 // Rewriting of memory transfer instructions can be a bit tricky. We break
3597 // them into two categories: split intrinsics and unsplit intrinsics.
3598
3599 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3600
3601 AAMDNodes AATags = II.getAAMetadata();
3602
3603 bool IsDest = &II.getRawDestUse() == OldUse;
3604 assert((IsDest && II.getRawDest() == OldPtr) ||
3605 (!IsDest && II.getRawSource() == OldPtr));
3606
3607 Align SliceAlign = getSliceAlign();
3608 // For unsplit intrinsics, we simply modify the source and destination
3609 // pointers in place. This isn't just an optimization, it is a matter of
3610 // correctness. With unsplit intrinsics we may be dealing with transfers
3611 // within a single alloca before SROA ran, or with transfers that have
3612 // a variable length. We may also be dealing with memmove instead of
    // memcpy, and so simply updating the pointers is necessary for us to
3614 // update both source and dest of a single call.
3615 if (!IsSplittable) {
3616 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3617 if (IsDest) {
3618 // Update the address component of linked dbg.assigns.
3619 for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(Inst: &II)) {
3620 if (llvm::is_contained(Range: DbgAssign->location_ops(), Element: II.getDest()) ||
3621 DbgAssign->getAddress() == II.getDest())
3622 DbgAssign->replaceVariableLocationOp(OldValue: II.getDest(), NewValue: AdjustedPtr);
3623 }
3624 II.setDest(AdjustedPtr);
3625 II.setDestAlignment(SliceAlign);
3626 } else {
3627 II.setSource(AdjustedPtr);
3628 II.setSourceAlignment(SliceAlign);
3629 }
3630
3631 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3632 deleteIfTriviallyDead(V: OldPtr);
3633 return false;
3634 }
3635 // For split transfer intrinsics we have an incredibly useful assurance:
3636 // the source and destination do not reside within the same alloca, and at
3637 // least one of them does not escape. This means that we can replace
3638 // memmove with memcpy, and we don't need to worry about all manner of
3639 // downsides to splitting and transforming the operations.
3640
3641 // If this doesn't map cleanly onto the alloca type, and that type isn't
3642 // a single value type, just emit a memcpy.
3643 bool EmitMemCpy =
3644 !VecTy && !IntTy &&
3645 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3646 SliceSize != DL.getTypeStoreSize(Ty: NewAllocaTy).getFixedValue() ||
3647 !DL.typeSizeEqualsStoreSize(Ty: NewAllocaTy) ||
3648 !NewAllocaTy->isSingleValueType());
3649
3650 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3651 // size hasn't been shrunk based on analysis of the viable range, this is
3652 // a no-op.
3653 if (EmitMemCpy && &OldAI == &NewAI) {
3654 // Ensure the start lines up.
3655 assert(NewBeginOffset == BeginOffset);
3656
3657 // Rewrite the size as needed.
3658 if (NewEndOffset != EndOffset)
3659 II.setLength(NewEndOffset - NewBeginOffset);
3660 return false;
3661 }
3662 // Record this instruction for deletion.
3663 Pass.DeadInsts.push_back(Elt: &II);
3664
3665 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3666 // alloca that should be re-examined after rewriting this instruction.
3667 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3668 if (AllocaInst *AI =
3669 dyn_cast<AllocaInst>(Val: OtherPtr->stripInBoundsOffsets())) {
3670 assert(AI != &OldAI && AI != &NewAI &&
3671 "Splittable transfers cannot reach the same alloca on both ends.");
3672 Pass.Worklist.insert(X: AI);
3673 }
3674
3675 Type *OtherPtrTy = OtherPtr->getType();
3676 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3677
3678 // Compute the relative offset for the other pointer within the transfer.
3679 unsigned OffsetWidth = DL.getIndexSizeInBits(AS: OtherAS);
3680 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3681 Align OtherAlign =
3682 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3683 OtherAlign =
3684 commonAlignment(A: OtherAlign, Offset: OtherOffset.zextOrTrunc(width: 64).getZExtValue());
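    // E.g., an other pointer that is 16-byte aligned but accessed at a
    // relative offset of 4 within the transfer can only be assumed to be
    // 4-byte aligned here.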
3685
3686 if (EmitMemCpy) {
3687 // Compute the other pointer, folding as much as possible to produce
3688 // a single, simple GEP in most cases.
3689 OtherPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
3690 NamePrefix: OtherPtr->getName() + ".");
3691
3692 Value *OurPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3693 Type *SizeTy = II.getLength()->getType();
3694 Constant *Size = ConstantInt::get(Ty: SizeTy, V: NewEndOffset - NewBeginOffset);
3695
3696 Value *DestPtr, *SrcPtr;
3697 MaybeAlign DestAlign, SrcAlign;
3698 // Note: IsDest is true iff we're copying into the new alloca slice
3699 if (IsDest) {
3700 DestPtr = OurPtr;
3701 DestAlign = SliceAlign;
3702 SrcPtr = OtherPtr;
3703 SrcAlign = OtherAlign;
3704 } else {
3705 DestPtr = OtherPtr;
3706 DestAlign = OtherAlign;
3707 SrcPtr = OurPtr;
3708 SrcAlign = SliceAlign;
3709 }
3710 CallInst *New = IRB.CreateMemCpy(Dst: DestPtr, DstAlign: DestAlign, Src: SrcPtr, SrcAlign,
3711 Size, isVolatile: II.isVolatile());
3712 if (AATags)
3713 New->setAAMetadata(AATags.shift(Offset: NewBeginOffset - BeginOffset));
3714
3715 APInt Offset(DL.getIndexTypeSizeInBits(Ty: DestPtr->getType()), 0);
3716 if (IsDest) {
3717 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8,
3718 OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
3719 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3720 Val: DestPtr->stripAndAccumulateConstantOffsets(
3721 DL, Offset, /*AllowNonInbounds*/ true))) {
3722 migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * 8,
3723 SliceSizeInBits: SliceSize * 8, OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
3724 }
3725 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3726 return false;
3727 }
3728
3729 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3730 NewEndOffset == NewAllocaEndOffset;
3731 uint64_t Size = NewEndOffset - NewBeginOffset;
3732 unsigned BeginIndex = VecTy ? getIndex(Offset: NewBeginOffset) : 0;
3733 unsigned EndIndex = VecTy ? getIndex(Offset: NewEndOffset) : 0;
3734 unsigned NumElements = EndIndex - BeginIndex;
3735 IntegerType *SubIntTy =
3736 IntTy ? Type::getIntNTy(C&: IntTy->getContext(), N: Size * 8) : nullptr;
3737
3738 // Reset the other pointer type to match the register type we're going to
3739 // use, but using the address space of the original other pointer.
3740 Type *OtherTy;
3741 if (VecTy && !IsWholeAlloca) {
3742 if (NumElements == 1)
3743 OtherTy = VecTy->getElementType();
3744 else
3745 OtherTy = FixedVectorType::get(ElementType: VecTy->getElementType(), NumElts: NumElements);
3746 } else if (IntTy && !IsWholeAlloca) {
3747 OtherTy = SubIntTy;
3748 } else {
3749 OtherTy = NewAllocaTy;
3750 }
3751
3752 Value *AdjPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
3753 NamePrefix: OtherPtr->getName() + ".");
3754 MaybeAlign SrcAlign = OtherAlign;
3755 MaybeAlign DstAlign = SliceAlign;
3756 if (!IsDest)
3757 std::swap(a&: SrcAlign, b&: DstAlign);
3758
3759 Value *SrcPtr;
3760 Value *DstPtr;
3761
3762 if (IsDest) {
3763 DstPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
3764 SrcPtr = AdjPtr;
3765 } else {
3766 DstPtr = AdjPtr;
3767 SrcPtr = getPtrToNewAI(AddrSpace: II.getSourceAddressSpace(), IsVolatile: II.isVolatile());
3768 }
3769
3770 Value *Src;
3771 if (VecTy && !IsWholeAlloca && !IsDest) {
3772 Src =
3773 IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3774 Src = extractVector(IRB, V: Src, BeginIndex, EndIndex, Name: "vec");
3775 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3776 Src =
3777 IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3778 Src = IRB.CreateBitPreservingCastChain(DL, V: Src, NewTy: IntTy);
3779 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3780 Src = extractInteger(DL, IRB, V: Src, Ty: SubIntTy, Offset, Name: "extract");
3781 } else {
3782 LoadInst *Load = IRB.CreateAlignedLoad(Ty: OtherTy, Ptr: SrcPtr, Align: SrcAlign,
3783 isVolatile: II.isVolatile(), Name: "copyload");
3784 Load->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3785 LLVMContext::MD_access_group});
3786 if (AATags)
3787 Load->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3788 AccessTy: Load->getType(), DL));
3789 Src = Load;
3790 }
3791
3792 if (VecTy && !IsWholeAlloca && IsDest) {
3793 Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
3794 Name: "oldload");
3795 Src = insertVector(IRB, Old, V: Src, BeginIndex, Name: "vec");
3796 } else if (IntTy && !IsWholeAlloca && IsDest) {
3797 Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
3798 Name: "oldload");
3799 Old = IRB.CreateBitPreservingCastChain(DL, V: Old, NewTy: IntTy);
3800 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3801 Src = insertInteger(DL, IRB, Old, V: Src, Offset, Name: "insert");
3802 Src = IRB.CreateBitPreservingCastChain(DL, V: Src, NewTy: NewAllocaTy);
3803 }
3804
3805 StoreInst *Store = cast<StoreInst>(
3806 Val: IRB.CreateAlignedStore(Val: Src, Ptr: DstPtr, Align: DstAlign, isVolatile: II.isVolatile()));
3807 Store->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3808 LLVMContext::MD_access_group});
3809 if (AATags)
3810 Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3811 AccessTy: Src->getType(), DL));
3812
3813 APInt Offset(DL.getIndexTypeSizeInBits(Ty: DstPtr->getType()), 0);
3814 if (IsDest) {
3816 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
3817 Inst: Store, Dest: DstPtr, Value: Src, DL);
3818 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3819 Val: DstPtr->stripAndAccumulateConstantOffsets(
3820 DL, Offset, /*AllowNonInbounds*/ true))) {
3821 migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * 8, SliceSizeInBits: SliceSize * 8,
3822 OldInst: &II, Inst: Store, Dest: DstPtr, Value: Src, DL);
3823 }
3824
3825 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3826 return !II.isVolatile();
3827 }
3828
3829 bool visitIntrinsicInst(IntrinsicInst &II) {
3830 assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3831 "Unexpected intrinsic!");
3832 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3833
3834 // Record this instruction for deletion.
3835 Pass.DeadInsts.push_back(Elt: &II);
3836
3837 if (II.isDroppable()) {
3838 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3839 // TODO For now we forget assumed information, this can be improved.
3840 OldPtr->dropDroppableUsesIn(Usr&: II);
3841 return true;
3842 }
3843
3844 assert(II.getArgOperand(0) == OldPtr);
3845 Type *PointerTy = IRB.getPtrTy(AddrSpace: OldPtr->getType()->getPointerAddressSpace());
3846 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3847 Value *New;
3848 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3849 New = IRB.CreateLifetimeStart(Ptr);
3850 else
3851 New = IRB.CreateLifetimeEnd(Ptr);
3852
3853 (void)New;
3854 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3855
3856 return true;
3857 }
3858
3859 void fixLoadStoreAlign(Instruction &Root) {
3860 // This algorithm implements the same visitor loop as
3861 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3862 // or store found.
3863 SmallPtrSet<Instruction *, 4> Visited;
3864 SmallVector<Instruction *, 4> Uses;
3865 Visited.insert(Ptr: &Root);
3866 Uses.push_back(Elt: &Root);
3867 do {
3868 Instruction *I = Uses.pop_back_val();
3869
3870 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
3871 LI->setAlignment(std::min(a: LI->getAlign(), b: getSliceAlign()));
3872 continue;
3873 }
3874 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
3875 SI->setAlignment(std::min(a: SI->getAlign(), b: getSliceAlign()));
3876 continue;
3877 }
3878
3879 assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3880 isa<PHINode>(I) || isa<SelectInst>(I) ||
3881 isa<GetElementPtrInst>(I));
3882 for (User *U : I->users())
3883 if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
3884 Uses.push_back(Elt: cast<Instruction>(Val: U));
3885 } while (!Uses.empty());
3886 }
3887
3888 bool visitPHINode(PHINode &PN) {
3889 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3890 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3891 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3892
3893 // We would like to compute a new pointer in only one place, but have it be
3894 // as local as possible to the PHI. To do that, we re-use the location of
3895 // the old pointer, which necessarily must be in the right position to
3896 // dominate the PHI.
3897 IRBuilderBase::InsertPointGuard Guard(IRB);
3898 if (isa<PHINode>(Val: OldPtr))
3899 IRB.SetInsertPoint(TheBB: OldPtr->getParent(),
3900 IP: OldPtr->getParent()->getFirstInsertionPt());
3901 else
3902 IRB.SetInsertPoint(OldPtr);
3903 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3904
3905 Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3906 // Replace the operands which were using the old pointer.
3907 std::replace(first: PN.op_begin(), last: PN.op_end(), old_value: cast<Value>(Val: OldPtr), new_value: NewPtr);
3908
3909 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3910 deleteIfTriviallyDead(V: OldPtr);
3911
3912 // Fix the alignment of any loads or stores using this PHI node.
3913 fixLoadStoreAlign(Root&: PN);
3914
3915 // PHIs can't be promoted on their own, but often can be speculated. We
3916 // check the speculation outside of the rewriter so that we see the
3917 // fully-rewritten alloca.
3918 PHIUsers.insert(X: &PN);
3919 return true;
3920 }
3921
3922 bool visitSelectInst(SelectInst &SI) {
3923 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3924 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3925 "Pointer isn't an operand!");
3926 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3927 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3928
3929 Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3930 // Replace the operands which were using the old pointer.
3931 if (SI.getOperand(i_nocapture: 1) == OldPtr)
3932 SI.setOperand(i_nocapture: 1, Val_nocapture: NewPtr);
3933 if (SI.getOperand(i_nocapture: 2) == OldPtr)
3934 SI.setOperand(i_nocapture: 2, Val_nocapture: NewPtr);
3935
3936 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3937 deleteIfTriviallyDead(V: OldPtr);
3938
3939 // Fix the alignment of any loads or stores using this select.
3940 fixLoadStoreAlign(Root&: SI);
3941
3942 // Selects can't be promoted on their own, but often can be speculated. We
3943 // check the speculation outside of the rewriter so that we see the
3944 // fully-rewritten alloca.
3945 SelectUsers.insert(X: &SI);
3946 return true;
3947 }
3948};
3949
3950/// Visitor to rewrite aggregate loads and stores as scalar.
3951///
3952/// This pass aggressively rewrites all aggregate loads and stores on
3953/// a particular pointer (or any pointer derived from it which we can identify)
3954/// with scalar loads and stores.
3955class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3956 // Befriend the base class so it can delegate to private visit methods.
3957 friend class InstVisitor<AggLoadStoreRewriter, bool>;
3958
3959 /// Queue of pointer uses to analyze and potentially rewrite.
3960 SmallVector<Use *, 8> Queue;
3961
3962 /// Set to prevent us from cycling with phi nodes and loops.
3963 SmallPtrSet<User *, 8> Visited;
3964
3965 /// The current pointer use being rewritten. This is used to dig up the used
3966 /// value (as opposed to the user).
3967 Use *U = nullptr;
3968
3969 /// Used to calculate offsets, and hence alignment, of subobjects.
3970 const DataLayout &DL;
3971
3972 IRBuilderTy &IRB;
3973
3974public:
3975 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
3976 : DL(DL), IRB(IRB) {}
3977
3978 /// Rewrite loads and stores through a pointer and all pointers derived from
3979 /// it.
3980 bool rewrite(Instruction &I) {
3981 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3982 enqueueUsers(I);
3983 bool Changed = false;
3984 while (!Queue.empty()) {
3985 U = Queue.pop_back_val();
3986 Changed |= visit(I: cast<Instruction>(Val: U->getUser()));
3987 }
3988 return Changed;
3989 }
3990
3991private:
3992 /// Enqueue all the users of the given instruction for further processing.
3993 /// This uses a set to de-duplicate users.
3994 void enqueueUsers(Instruction &I) {
3995 for (Use &U : I.uses())
3996 if (Visited.insert(Ptr: U.getUser()).second)
3997 Queue.push_back(Elt: &U);
3998 }
3999
4000 // Conservative default is to not rewrite anything.
4001 bool visitInstruction(Instruction &I) { return false; }
4002
4003 /// Generic recursive split emission class.
4004 template <typename Derived> class OpSplitter {
4005 protected:
4006 /// The builder used to form new instructions.
4007 IRBuilderTy &IRB;
4008
    /// The indices to be used with insertvalue or extractvalue to select the
    /// appropriate value within the aggregate.
4011 SmallVector<unsigned, 4> Indices;
4012
4013 /// The indices to a GEP instruction which will move Ptr to the correct slot
4014 /// within the aggregate.
4015 SmallVector<Value *, 4> GEPIndices;
4016
4017 /// The base pointer of the original op, used as a base for GEPing the
4018 /// split operations.
4019 Value *Ptr;
4020
4021 /// The base pointee type being GEPed into.
4022 Type *BaseTy;
4023
4024 /// Known alignment of the base pointer.
4025 Align BaseAlign;
4026
4027 /// To calculate offset of each component so we can correctly deduce
4028 /// alignments.
4029 const DataLayout &DL;
4030
4031 /// Initialize the splitter with an insertion point, Ptr and start with a
4032 /// single zero GEP index.
4033 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4034 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
4035 : IRB(IRB), GEPIndices(1, IRB.getInt32(C: 0)), Ptr(Ptr), BaseTy(BaseTy),
4036 BaseAlign(BaseAlign), DL(DL) {
4037 IRB.SetInsertPoint(InsertionPoint);
4038 }
4039
4040 public:
4041 /// Generic recursive split emission routine.
4042 ///
4043 /// This method recursively splits an aggregate op (load or store) into
4044 /// scalar or vector ops. It splits recursively until it hits a single value
4045 /// and emits that single value operation via the template argument.
4046 ///
4047 /// The logic of this routine relies on GEPs and insertvalue and
4048 /// extractvalue all operating with the same fundamental index list, merely
4049 /// formatted differently (GEPs need actual values).
4050 ///
4051 /// \param Ty The type being split recursively into smaller ops.
4052 /// \param Agg The aggregate value being built up or stored, depending on
4053 /// whether this is splitting a load or a store respectively.
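    ///
    /// For example (illustrative), splitting an op on { i32, [2 x float] }
    /// visits the leaves at index paths [0], [1,0], and [1,1], emitting one
    /// scalar GEP plus load/store for each via emitFunc.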
4054 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
4055 if (Ty->isSingleValueType()) {
4056 unsigned Offset = DL.getIndexedOffsetInType(ElemTy: BaseTy, Indices: GEPIndices);
4057 return static_cast<Derived *>(this)->emitFunc(
4058 Ty, Agg, commonAlignment(A: BaseAlign, Offset), Name);
4059 }
4060
4061 if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) {
4062 unsigned OldSize = Indices.size();
4063 (void)OldSize;
4064 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
4065 ++Idx) {
4066 assert(Indices.size() == OldSize && "Did not return to the old size");
4067 Indices.push_back(Elt: Idx);
4068 GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
4069 emitSplitOps(Ty: ATy->getElementType(), Agg, Name: Name + "." + Twine(Idx));
4070 GEPIndices.pop_back();
4071 Indices.pop_back();
4072 }
4073 return;
4074 }
4075
4076 if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
4077 unsigned OldSize = Indices.size();
4078 (void)OldSize;
4079 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
4080 ++Idx) {
4081 assert(Indices.size() == OldSize && "Did not return to the old size");
4082 Indices.push_back(Elt: Idx);
4083 GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
4084 emitSplitOps(Ty: STy->getElementType(N: Idx), Agg, Name: Name + "." + Twine(Idx));
4085 GEPIndices.pop_back();
4086 Indices.pop_back();
4087 }
4088 return;
4089 }
4090
4091 llvm_unreachable("Only arrays and structs are aggregate loadable types");
4092 }
4093 };
4094
4095 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
4096 AAMDNodes AATags;
4097 // A vector to hold the split components that we want to emit
4098 // separate fake uses for.
4099 SmallVector<Value *, 4> Components;
4100 // A vector to hold all the fake uses of the struct that we are splitting.
4101 // Usually there should only be one, but we are handling the general case.
4102 SmallVector<Instruction *, 1> FakeUses;
4103
4104 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4105 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
4106 IRBuilderTy &IRB)
4107 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
4108 IRB),
4109 AATags(AATags) {}
4110
4111 /// Emit a leaf load of a single value. This is called at the leaves of the
4112 /// recursive emission to actually load values.
4113 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4114 assert(Ty->isSingleValueType());
4115 // Load the single value and insert it using the indices.
4116 Value *GEP =
4117 IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
4118 LoadInst *Load =
4119 IRB.CreateAlignedLoad(Ty, Ptr: GEP, Align: Alignment, Name: Name + ".load");
4120
4121 APInt Offset(
4122 DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), 0);
4123 if (AATags &&
4124 GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset))
4125 Load->setAAMetadata(
4126 AATags.adjustForAccess(Offset: Offset.getZExtValue(), AccessTy: Load->getType(), DL));
4127 // Record the load so we can generate a fake use for this aggregate
4128 // component.
4129 Components.push_back(Elt: Load);
4130
4131 Agg = IRB.CreateInsertValue(Agg, Val: Load, Idxs: Indices, Name: Name + ".insert");
4132 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
4133 }
4134
4135 // Stash the fake uses that use the value generated by this instruction.
4136 void recordFakeUses(LoadInst &LI) {
4137 for (Use &U : LI.uses())
4138 if (auto *II = dyn_cast<IntrinsicInst>(Val: U.getUser()))
4139 if (II->getIntrinsicID() == Intrinsic::fake_use)
4140 FakeUses.push_back(Elt: II);
4141 }
4142
4143 // Replace all fake uses of the aggregate with a series of fake uses, one
4144 // for each split component.
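  // For instance (a sketch, not taken from a test), a single
  //   call void (...) @llvm.fake.use({ i32, float } %agg)
  // is replaced by one @llvm.fake.use call per component load.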
4145 void emitFakeUses() {
4146 for (Instruction *I : FakeUses) {
4147 IRB.SetInsertPoint(I);
4148 for (auto *V : Components)
4149 IRB.CreateIntrinsic(ID: Intrinsic::fake_use, Args: {V});
4150 I->eraseFromParent();
4151 }
4152 }
4153 };
4154
4155 bool visitLoadInst(LoadInst &LI) {
4156 assert(LI.getPointerOperand() == *U);
4157 if (!LI.isSimple() || LI.getType()->isSingleValueType())
4158 return false;
4159
4160 // We have an aggregate being loaded, split it apart.
4161 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4162 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4163 getAdjustedAlignment(I: &LI, Offset: 0), DL, IRB);
4164 Splitter.recordFakeUses(LI);
4165 Value *V = PoisonValue::get(T: LI.getType());
4166 Splitter.emitSplitOps(Ty: LI.getType(), Agg&: V, Name: LI.getName() + ".fca");
4167 Splitter.emitFakeUses();
4168 Visited.erase(Ptr: &LI);
4169 LI.replaceAllUsesWith(V);
4170 LI.eraseFromParent();
4171 return true;
4172 }
4173
4174 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4175 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4176 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4177 const DataLayout &DL, IRBuilderTy &IRB)
4178 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4179 DL, IRB),
4180 AATags(AATags), AggStore(AggStore) {}
4181 AAMDNodes AATags;
4182 StoreInst *AggStore;
4183 /// Emit a leaf store of a single value. This is called at the leaves of the
4184 /// recursive emission to actually produce stores.
4185 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4186 assert(Ty->isSingleValueType());
4187 // Extract the single value and store it using the indices.
4188 //
4189 // The gep and extractvalue values are factored out of the CreateStore
4190 // call to make the output independent of the argument evaluation order.
4191 Value *ExtractValue =
4192 IRB.CreateExtractValue(Agg, Idxs: Indices, Name: Name + ".extract");
4193 Value *InBoundsGEP =
4194 IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
4195 StoreInst *Store =
4196 IRB.CreateAlignedStore(Val: ExtractValue, Ptr: InBoundsGEP, Align: Alignment);
4197
4198 APInt Offset(
4199 DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), 0);
4200 GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset);
4201 if (AATags) {
4202 Store->setAAMetadata(AATags.adjustForAccess(
4203 Offset: Offset.getZExtValue(), AccessTy: ExtractValue->getType(), DL));
4204 }
4205
4206 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4207 // If we cannot (because there's an intervening non-const or unbounded
4208 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4209 // this instruction.
4210 Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4211 if (auto *OldAI = dyn_cast<AllocaInst>(Val: Base)) {
4212 uint64_t SizeInBits =
4213 DL.getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
4214 migrateDebugInfo(OldAlloca: OldAI, /*IsSplit*/ true, OldAllocaOffsetInBits: Offset.getZExtValue() * 8,
4215 SliceSizeInBits: SizeInBits, OldInst: AggStore, Inst: Store,
4216 Dest: Store->getPointerOperand(), Value: Store->getValueOperand(),
4217 DL);
4218 } else {
4219 assert(at::getDVRAssignmentMarkers(Store).empty() &&
4220 "AT: unexpected debug.assign linked to store through "
4221 "unbounded GEP");
4222 }
4223 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4224 }
4225 };
4226
4227 bool visitStoreInst(StoreInst &SI) {
4228 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4229 return false;
4230 Value *V = SI.getValueOperand();
4231 if (V->getType()->isSingleValueType())
4232 return false;
4233
4234 // We have an aggregate being stored, split it apart.
4235 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4236 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4237 getAdjustedAlignment(I: &SI, Offset: 0), DL, IRB);
4238 Splitter.emitSplitOps(Ty: V->getType(), Agg&: V, Name: V->getName() + ".fca");
4239 Visited.erase(Ptr: &SI);
4240 // The stores replacing SI each have markers describing fragments of the
4241 // assignment so delete the assignment markers linked to SI.
4242 at::deleteAssignmentMarkers(Inst: &SI);
4243 SI.eraseFromParent();
4244 return true;
4245 }
4246
4247 bool visitBitCastInst(BitCastInst &BC) {
4248 enqueueUsers(I&: BC);
4249 return false;
4250 }
4251
4252 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4253 enqueueUsers(I&: ASC);
4254 return false;
4255 }
4256
4257 // Unfold gep (select cond, ptr1, ptr2), idx
4258 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4259 // and gep ptr, (select cond, idx1, idx2)
4260 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
4261 // We also allow for i1 zext indices, which are equivalent to selects.
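  //
  // Illustrative sketch of the i1 zext case (hypothetical IR, not from a
  // test):
  //   %idx = zext i1 %c to i64
  //   %gep = getelementptr inbounds [2 x i32], ptr %p, i64 0, i64 %idx
  // is treated as a select between the constant indices 1 and 0, giving
  //   %gep.t = getelementptr inbounds [2 x i32], ptr %p, i64 0, i64 1
  //   %gep.f = getelementptr inbounds [2 x i32], ptr %p, i64 0, i64 0
  //   %sel   = select i1 %c, ptr %gep.t, ptr %gep.f
  // after which the new select replaces the original GEP.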
4262 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4263 // Check whether the GEP has exactly one select operand and all indices
4264 // will become constant after the transform.
4265 Instruction *Sel = dyn_cast<SelectInst>(Val: GEPI.getPointerOperand());
4266 for (Value *Op : GEPI.indices()) {
4267 if (auto *SI = dyn_cast<SelectInst>(Val: Op)) {
4268 if (Sel)
4269 return false;
4270
4271 Sel = SI;
4272 if (!isa<ConstantInt>(Val: SI->getTrueValue()) ||
4273 !isa<ConstantInt>(Val: SI->getFalseValue()))
4274 return false;
4275 continue;
4276 }
4277 if (auto *ZI = dyn_cast<ZExtInst>(Val: Op)) {
4278 if (Sel)
4279 return false;
4280 Sel = ZI;
4281 if (!ZI->getSrcTy()->isIntegerTy(Bitwidth: 1))
4282 return false;
4283 continue;
4284 }
4285
4286 if (!isa<ConstantInt>(Val: Op))
4287 return false;
4288 }
4289
4290 if (!Sel)
4291 return false;
4292
4293 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4294 dbgs() << " original: " << *Sel << "\n";
4295 dbgs() << " " << GEPI << "\n";);
4296
4297 auto GetNewOps = [&](Value *SelOp) {
4298 SmallVector<Value *> NewOps;
4299 for (Value *Op : GEPI.operands())
4300 if (Op == Sel)
4301 NewOps.push_back(Elt: SelOp);
4302 else
4303 NewOps.push_back(Elt: Op);
4304 return NewOps;
4305 };
4306
4307 Value *Cond, *True, *False;
4308 Instruction *MDFrom = nullptr;
4309 if (auto *SI = dyn_cast<SelectInst>(Val: Sel)) {
4310 Cond = SI->getCondition();
4311 True = SI->getTrueValue();
4312 False = SI->getFalseValue();
4313 if (!ProfcheckDisableMetadataFixes)
4314 MDFrom = SI;
4315 } else {
4316 Cond = Sel->getOperand(i: 0);
4317 True = ConstantInt::get(Ty: Sel->getType(), V: 1);
4318 False = ConstantInt::get(Ty: Sel->getType(), V: 0);
4319 }
4320 SmallVector<Value *> TrueOps = GetNewOps(True);
4321 SmallVector<Value *> FalseOps = GetNewOps(False);
4322
4323 IRB.SetInsertPoint(&GEPI);
4324 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4325
4326 Type *Ty = GEPI.getSourceElementType();
4327 Value *NTrue = IRB.CreateGEP(Ty, Ptr: TrueOps[0], IdxList: ArrayRef(TrueOps).drop_front(),
4328 Name: True->getName() + ".sroa.gep", NW);
4329
4330 Value *NFalse =
4331 IRB.CreateGEP(Ty, Ptr: FalseOps[0], IdxList: ArrayRef(FalseOps).drop_front(),
4332 Name: False->getName() + ".sroa.gep", NW);
4333
4334 Value *NSel = MDFrom
4335 ? IRB.CreateSelect(C: Cond, True: NTrue, False: NFalse,
4336 Name: Sel->getName() + ".sroa.sel", MDFrom)
4337 : IRB.CreateSelectWithUnknownProfile(
4338 C: Cond, True: NTrue, False: NFalse, DEBUG_TYPE,
4339 Name: Sel->getName() + ".sroa.sel");
4340 Visited.erase(Ptr: &GEPI);
4341 GEPI.replaceAllUsesWith(V: NSel);
4342 GEPI.eraseFromParent();
4343 Instruction *NSelI = cast<Instruction>(Val: NSel);
4344 Visited.insert(Ptr: NSelI);
4345 enqueueUsers(I&: *NSelI);
4346
4347 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4348 dbgs() << " " << *NFalse << "\n";
4349 dbgs() << " " << *NSel << "\n";);
4350
4351 return true;
4352 }
4353
4354 // Unfold gep (phi ptr1, ptr2), idx
4355 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4356 // and gep ptr, (phi idx1, idx2)
4357 // => phi ((gep ptr, idx1), (gep ptr, idx2))
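  //
  // Rough sketch (hypothetical IR): given
  //   %p   = phi ptr [ %a, %bb0 ], [ %b, %bb1 ]
  //   %gep = getelementptr i8, ptr %p, i64 8
  // the per-incoming GEPs are materialized at the end of the entry block
  // (operands are restricted to arguments, constants, and static allocas)
  // and %gep is replaced by
  //   %phi = phi ptr [ %gep.a, %bb0 ], [ %gep.b, %bb1 ]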
4358 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4359 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4360 // operand (looking through the phi if it is the phi we want to unfold) is
4361 // an instruction besides a static alloca.
4362 PHINode *Phi = dyn_cast<PHINode>(Val: GEPI.getPointerOperand());
4363 auto IsInvalidPointerOperand = [](Value *V) {
4364 if (!isa<Instruction>(Val: V))
4365 return false;
4366 if (auto *AI = dyn_cast<AllocaInst>(Val: V))
4367 return !AI->isStaticAlloca();
4368 return true;
4369 };
4370 if (Phi) {
4371 if (any_of(Range: Phi->operands(), P: IsInvalidPointerOperand))
4372 return false;
4373 } else {
4374 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4375 return false;
4376 }
4377 // Check whether the GEP has exactly one phi operand (including the pointer
4378 // operand) and all indices will become constant after the transform.
4379 for (Value *Op : GEPI.indices()) {
4380 if (auto *SI = dyn_cast<PHINode>(Val: Op)) {
4381 if (Phi)
4382 return false;
4383
4384 Phi = SI;
4385 if (!all_of(Range: Phi->incoming_values(),
4386 P: [](Value *V) { return isa<ConstantInt>(Val: V); }))
4387 return false;
4388 continue;
4389 }
4390
4391 if (!isa<ConstantInt>(Val: Op))
4392 return false;
4393 }
4394
4395 if (!Phi)
4396 return false;
4397
4398 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4399 dbgs() << " original: " << *Phi << "\n";
4400 dbgs() << " " << GEPI << "\n";);
4401
4402 auto GetNewOps = [&](Value *PhiOp) {
4403 SmallVector<Value *> NewOps;
4404 for (Value *Op : GEPI.operands())
4405 if (Op == Phi)
4406 NewOps.push_back(Elt: PhiOp);
4407 else
4408 NewOps.push_back(Elt: Op);
4409 return NewOps;
4410 };
4411
4412 IRB.SetInsertPoint(Phi);
4413 PHINode *NewPhi = IRB.CreatePHI(Ty: GEPI.getType(), NumReservedValues: Phi->getNumIncomingValues(),
4414 Name: Phi->getName() + ".sroa.phi");
4415
4416 Type *SourceTy = GEPI.getSourceElementType();
4417 // We only handle arguments, constants, and static allocas here, so we can
4418 // insert GEPs at the end of the entry block.
4419 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4420 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4421 Value *Op = Phi->getIncomingValue(i: I);
4422 BasicBlock *BB = Phi->getIncomingBlock(i: I);
4423 Value *NewGEP;
4424 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4425 NewGEP = NewPhi->getIncomingValue(i: NI);
4426 } else {
4427 SmallVector<Value *> NewOps = GetNewOps(Op);
4428 NewGEP =
4429 IRB.CreateGEP(Ty: SourceTy, Ptr: NewOps[0], IdxList: ArrayRef(NewOps).drop_front(),
4430 Name: Phi->getName() + ".sroa.gep", NW: GEPI.getNoWrapFlags());
4431 }
4432 NewPhi->addIncoming(V: NewGEP, BB);
4433 }
4434
4435 Visited.erase(Ptr: &GEPI);
4436 GEPI.replaceAllUsesWith(V: NewPhi);
4437 GEPI.eraseFromParent();
4438 Visited.insert(Ptr: NewPhi);
4439 enqueueUsers(I&: *NewPhi);
4440
4441 LLVM_DEBUG(dbgs() << " to: ";
4442 for (Value *In
4443 : NewPhi->incoming_values()) dbgs()
4444 << "\n " << *In;
4445 dbgs() << "\n " << *NewPhi << '\n');
4446
4447 return true;
4448 }
4449
4450 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4451 if (unfoldGEPSelect(GEPI))
4452 return true;
4453
4454 if (unfoldGEPPhi(GEPI))
4455 return true;
4456
4457 enqueueUsers(I&: GEPI);
4458 return false;
4459 }
4460
4461 bool visitPHINode(PHINode &PN) {
4462 enqueueUsers(I&: PN);
4463 return false;
4464 }
4465
4466 bool visitSelectInst(SelectInst &SI) {
4467 enqueueUsers(I&: SI);
4468 return false;
4469 }
4470};
4471
4472} // end anonymous namespace
4473
4474/// Strip aggregate type wrapping.
4475///
4476/// This removes no-op aggregate types wrapping an underlying type. It will
4477/// strip as many layers of types as it can without changing either the type
4478/// size or the allocated size.
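///
/// For example (illustrative only), both { { float } } and [1 x float] strip
/// down to float, while { i32, i32 } is returned unchanged because its first
/// element does not cover the full allocated size.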
4479static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4480 if (Ty->isSingleValueType())
4481 return Ty;
4482
4483 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4484 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4485
4486 Type *InnerTy;
4487 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Val: Ty)) {
4488 InnerTy = ArrTy->getElementType();
4489 } else if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
4490 const StructLayout *SL = DL.getStructLayout(Ty: STy);
4491 unsigned Index = SL->getElementContainingOffset(FixedOffset: 0);
4492 InnerTy = STy->getElementType(N: Index);
4493 } else {
4494 return Ty;
4495 }
4496
4497 if (AllocSize > DL.getTypeAllocSize(Ty: InnerTy).getFixedValue() ||
4498 TypeSize > DL.getTypeSizeInBits(Ty: InnerTy).getFixedValue())
4499 return Ty;
4500
4501 return stripAggregateTypeWrapping(DL, Ty: InnerTy);
4502}
4503
4504/// Try to find a partition of the aggregate type passed in for a given
4505/// offset and size.
4506///
4507/// This recurses through the aggregate type and tries to compute a subtype
4508/// based on the offset and size. When the offset and size span a sub-section
4509/// of an array, it will even compute a new array type for that sub-section,
4510/// and the same for structs.
4511///
4512/// Note that this routine is very strict and tries to find a partition of the
4513/// type which produces the *exact* right offset and size. It is not forgiving
4514 /// when the size or offset causes either end of the type-based partition to be off.
4515/// Also, this is a best-effort routine. It is reasonable to give up and not
4516/// return a type if necessary.
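///
/// A rough example, assuming a typical data layout and the hypothetical type
/// %T = type { i32, [8 x i8], i64 }: offset 4 with size 8 yields [8 x i8],
/// offset 4 with size 4 yields a freshly built [4 x i8], and offset 2 with
/// size 4 yields nullptr because that range straddles the i32 and the array.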
4517static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4518 uint64_t Size) {
4519 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4520 return stripAggregateTypeWrapping(DL, Ty);
4521 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4522 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4523 return nullptr;
4524
4525 if (isa<ArrayType>(Val: Ty) || isa<VectorType>(Val: Ty)) {
4526 Type *ElementTy;
4527 uint64_t TyNumElements;
4528 if (auto *AT = dyn_cast<ArrayType>(Val: Ty)) {
4529 ElementTy = AT->getElementType();
4530 TyNumElements = AT->getNumElements();
4531 } else {
4532 // FIXME: This isn't right for vectors with non-byte-sized or
4533 // non-power-of-two sized elements.
4534 auto *VT = cast<FixedVectorType>(Val: Ty);
4535 ElementTy = VT->getElementType();
4536 TyNumElements = VT->getNumElements();
4537 }
4538 uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
4539 uint64_t NumSkippedElements = Offset / ElementSize;
4540 if (NumSkippedElements >= TyNumElements)
4541 return nullptr;
4542 Offset -= NumSkippedElements * ElementSize;
4543
4544 // First check if we need to recurse.
4545 if (Offset > 0 || Size < ElementSize) {
4546 // Bail if the partition ends in a different array element.
4547 if ((Offset + Size) > ElementSize)
4548 return nullptr;
4549 // Recurse through the element type trying to peel off offset bytes.
4550 return getTypePartition(DL, Ty: ElementTy, Offset, Size);
4551 }
4552 assert(Offset == 0);
4553
4554 if (Size == ElementSize)
4555 return stripAggregateTypeWrapping(DL, Ty: ElementTy);
4556 assert(Size > ElementSize);
4557 uint64_t NumElements = Size / ElementSize;
4558 if (NumElements * ElementSize != Size)
4559 return nullptr;
4560 return ArrayType::get(ElementType: ElementTy, NumElements);
4561 }
4562
4563 StructType *STy = dyn_cast<StructType>(Val: Ty);
4564 if (!STy)
4565 return nullptr;
4566
4567 const StructLayout *SL = DL.getStructLayout(Ty: STy);
4568
4569 if (SL->getSizeInBits().isScalable())
4570 return nullptr;
4571
4572 if (Offset >= SL->getSizeInBytes())
4573 return nullptr;
4574 uint64_t EndOffset = Offset + Size;
4575 if (EndOffset > SL->getSizeInBytes())
4576 return nullptr;
4577
4578 unsigned Index = SL->getElementContainingOffset(FixedOffset: Offset);
4579 Offset -= SL->getElementOffset(Idx: Index);
4580
4581 Type *ElementTy = STy->getElementType(N: Index);
4582 uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
4583 if (Offset >= ElementSize)
4584 return nullptr; // The offset points into alignment padding.
4585
4586 // See if any partition must be contained by the element.
4587 if (Offset > 0 || Size < ElementSize) {
4588 if ((Offset + Size) > ElementSize)
4589 return nullptr;
4590 return getTypePartition(DL, Ty: ElementTy, Offset, Size);
4591 }
4592 assert(Offset == 0);
4593
4594 if (Size == ElementSize)
4595 return stripAggregateTypeWrapping(DL, Ty: ElementTy);
4596
4597 StructType::element_iterator EI = STy->element_begin() + Index,
4598 EE = STy->element_end();
4599 if (EndOffset < SL->getSizeInBytes()) {
4600 unsigned EndIndex = SL->getElementContainingOffset(FixedOffset: EndOffset);
4601 if (Index == EndIndex)
4602 return nullptr; // Within a single element and its padding.
4603
4604 // Don't try to form "natural" types if the elements don't line up with the
4605 // expected size.
4606 // FIXME: We could potentially recurse down through the last element in the
4607 // sub-struct to find a natural end point.
4608 if (SL->getElementOffset(Idx: EndIndex) != EndOffset)
4609 return nullptr;
4610
4611 assert(Index < EndIndex);
4612 EE = STy->element_begin() + EndIndex;
4613 }
4614
4615 // Try to build up a sub-structure.
4616 StructType *SubTy =
4617 StructType::get(Context&: STy->getContext(), Elements: ArrayRef(EI, EE), isPacked: STy->isPacked());
4618 const StructLayout *SubSL = DL.getStructLayout(Ty: SubTy);
4619 if (Size != SubSL->getSizeInBytes())
4620 return nullptr; // The sub-struct doesn't have quite the size needed.
4621
4622 return SubTy;
4623}
4624
4625/// Pre-split loads and stores to simplify rewriting.
4626///
4627/// We want to break up the splittable load+store pairs as much as
4628/// possible. This is important to do as a preprocessing step, as once we
4629/// start rewriting the accesses to partitions of the alloca we lose the
4630/// necessary information to correctly split apart paired loads and stores
4631/// which both point into this alloca. The case to consider is something like
4632/// the following:
4633///
4634/// %a = alloca [12 x i8]
4635/// %gep1 = getelementptr i8, ptr %a, i32 0
4636/// %gep2 = getelementptr i8, ptr %a, i32 4
4637/// %gep3 = getelementptr i8, ptr %a, i32 8
4638/// store float 0.0, ptr %gep1
4639/// store float 1.0, ptr %gep2
4640/// %v = load i64, ptr %gep1
4641/// store i64 %v, ptr %gep2
4642/// %f1 = load float, ptr %gep2
4643/// %f2 = load float, ptr %gep3
4644///
4645/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4646/// promote everything so we recover the 2 SSA values that should have been
4647/// there all along.
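///
/// As a sketch of the intended outcome (not verbatim pass output), the i64
/// load/store pair above is pre-split into two 4-byte integer pieces: the
/// load halves cover alloca offsets [0,4) and [4,8), the store halves cover
/// [4,8) and [8,12), and afterwards each 4-byte partition only sees accesses
/// it fully contains.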
4648///
4649/// \returns true if any changes are made.
4650bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4651 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4652
4653 // Track the loads and stores which are candidates for pre-splitting here, in
4654 // the order they first appear during the partition scan. These give stable
4655 // iteration order and a basis for tracking which loads and stores we
4656 // actually split.
4657 SmallVector<LoadInst *, 4> Loads;
4658 SmallVector<StoreInst *, 4> Stores;
4659
4660 // We need to accumulate the splits required of each load or store where we
4661 // can find them via a direct lookup. This is important to cross-check loads
4662 // and stores against each other. We also track the slice so that we can kill
4663 // all the slices that end up split.
4664 struct SplitOffsets {
4665 Slice *S;
4666 std::vector<uint64_t> Splits;
4667 };
4668 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4669
4670 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4671 // This is important as we also cannot pre-split stores of those loads!
4672 // FIXME: This is all pretty gross. It means that we can be more aggressive
4673 // in pre-splitting when the load feeding the store happens to come from
4674 // a separate alloca. Put another way, the effectiveness of SROA would be
4675 // decreased by a frontend which just concatenated all of its local allocas
4676 // into one big flat alloca. But defeating such patterns is exactly the job
4677 // SROA is tasked with! Sadly, to not have this discrepancy we would have
4678 // SROA is tasked with! Sadly, to not have this discrepancy we would have to
4679 // that feeds it *and all stores*. That makes pre-splitting much harder, but
4680 // maybe it would make it more principled?
4681 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4682
4683 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4684 for (auto &P : AS.partitions()) {
4685 for (Slice &S : P) {
4686 Instruction *I = cast<Instruction>(Val: S.getUse()->getUser());
4687 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4688 // If this is a load we have to track that it can't participate in any
4689 // pre-splitting. If this is a store of a load we have to track that
4690 // that load also can't participate in any pre-splitting.
4691 if (auto *LI = dyn_cast<LoadInst>(Val: I))
4692 UnsplittableLoads.insert(Ptr: LI);
4693 else if (auto *SI = dyn_cast<StoreInst>(Val: I))
4694 if (auto *LI = dyn_cast<LoadInst>(Val: SI->getValueOperand()))
4695 UnsplittableLoads.insert(Ptr: LI);
4696 continue;
4697 }
4698 assert(P.endOffset() > S.beginOffset() &&
4699 "Empty or backwards partition!");
4700
4701 // Determine if this is a pre-splittable slice.
4702 if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
4703 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4704
4705 // The load must be used exclusively to store into other pointers for
4706 // us to be able to arbitrarily pre-split it. The stores must also be
4707 // simple to avoid changing semantics.
4708 auto IsLoadSimplyStored = [](LoadInst *LI) {
4709 for (User *LU : LI->users()) {
4710 auto *SI = dyn_cast<StoreInst>(Val: LU);
4711 if (!SI || !SI->isSimple())
4712 return false;
4713 }
4714 return true;
4715 };
4716 if (!IsLoadSimplyStored(LI)) {
4717 UnsplittableLoads.insert(Ptr: LI);
4718 continue;
4719 }
4720
4721 Loads.push_back(Elt: LI);
4722 } else if (auto *SI = dyn_cast<StoreInst>(Val: I)) {
4723 if (S.getUse() != &SI->getOperandUse(i: SI->getPointerOperandIndex()))
4724 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4725 continue;
4726 auto *StoredLoad = dyn_cast<LoadInst>(Val: SI->getValueOperand());
4727 if (!StoredLoad || !StoredLoad->isSimple())
4728 continue;
4729 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4730
4731 Stores.push_back(Elt: SI);
4732 } else {
4733 // Other uses cannot be pre-split.
4734 continue;
4735 }
4736
4737 // Record the initial split.
4738 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4739 auto &Offsets = SplitOffsetsMap[I];
4740 assert(Offsets.Splits.empty() &&
4741 "Should not have splits the first time we see an instruction!");
4742 Offsets.S = &S;
4743 Offsets.Splits.push_back(x: P.endOffset() - S.beginOffset());
4744 }
4745
4746 // Now scan the already split slices, and add a split for any of them which
4747 // we're going to pre-split.
4748 for (Slice *S : P.splitSliceTails()) {
4749 auto SplitOffsetsMapI =
4750 SplitOffsetsMap.find(Val: cast<Instruction>(Val: S->getUse()->getUser()));
4751 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4752 continue;
4753 auto &Offsets = SplitOffsetsMapI->second;
4754
4755 assert(Offsets.S == S && "Found a mismatched slice!");
4756 assert(!Offsets.Splits.empty() &&
4757 "Cannot have an empty set of splits on the second partition!");
4758 assert(Offsets.Splits.back() ==
4759 P.beginOffset() - Offsets.S->beginOffset() &&
4760 "Previous split does not end where this one begins!");
4761
4762 // Record each split. The last partition's end isn't needed as the size
4763 // of the slice dictates that.
4764 if (S->endOffset() > P.endOffset())
4765 Offsets.Splits.push_back(x: P.endOffset() - Offsets.S->beginOffset());
4766 }
4767 }
4768
4769 // We may have split loads where some of their stores are split stores. For
4770 // such loads and stores, we can only pre-split them if their splits exactly
4771 // match relative to their starting offset. We have to verify this prior to
4772 // any rewriting.
4773 llvm::erase_if(C&: Stores, P: [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4774 // Lookup the load we are storing in our map of split
4775 // offsets.
4776 auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4777 // If it was completely unsplittable, then we're done,
4778 // and this store can't be pre-split.
4779 if (UnsplittableLoads.count(Ptr: LI))
4780 return true;
4781
4782 auto LoadOffsetsI = SplitOffsetsMap.find(Val: LI);
4783 if (LoadOffsetsI == SplitOffsetsMap.end())
4784 return false; // Unrelated loads are definitely safe.
4785 auto &LoadOffsets = LoadOffsetsI->second;
4786
4787 // Now lookup the store's offsets.
4788 auto &StoreOffsets = SplitOffsetsMap[SI];
4789
4790 // If the relative offsets of each split in the load and
4791 // store match exactly, then we can split them and we
4792 // don't need to remove them here.
4793 if (LoadOffsets.Splits == StoreOffsets.Splits)
4794 return false;
4795
4796 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4797 << " " << *LI << "\n"
4798 << " " << *SI << "\n");
4799
4800 // We've found a store and load that we need to split
4801 // with mismatched relative splits. Just give up on them
4802 // and remove both instructions from our list of
4803 // candidates.
4804 UnsplittableLoads.insert(Ptr: LI);
4805 return true;
4806 });
4807 // Now we have to go *back* through all the stores, because a later store may
4808 // have caused an earlier store's load to become unsplittable and if it is
4809 // unsplittable for the later store, then we can't rely on it being split in
4810 // the earlier store either.
4811 llvm::erase_if(C&: Stores, P: [&UnsplittableLoads](StoreInst *SI) {
4812 auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4813 return UnsplittableLoads.count(Ptr: LI);
4814 });
4815 // Once we've established all the loads that can't be split for some reason,
4816 // filter any that made it into our list out.
4817 llvm::erase_if(C&: Loads, P: [&UnsplittableLoads](LoadInst *LI) {
4818 return UnsplittableLoads.count(Ptr: LI);
4819 });
4820
4821 // If no loads or stores are left, there is no pre-splitting to be done for
4822 // this alloca.
4823 if (Loads.empty() && Stores.empty())
4824 return false;
4825
4826 // From here on, we can't fail and will be building new accesses, so rig up
4827 // an IR builder.
4828 IRBuilderTy IRB(&AI);
4829
4830 // Collect the new slices which we will merge into the alloca slices.
4831 SmallVector<Slice, 4> NewSlices;
4832
4833 // Track any allocas we end up splitting loads and stores for so we iterate
4834 // on them.
4835 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4836
4837 // At this point, we have collected all of the loads and stores we can
4838 // pre-split, and the specific splits needed for them. We do the splitting
4839 // in a specific order to correctly handle the case where one of the loads
4840 // is the value operand of one of the stores.
4841 //
4842 // First, we rewrite all of the split loads, and just accumulate each split
4843 // load in a parallel structure. We also build the slices for them and append
4844 // them to the alloca slices.
4845 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4846 std::vector<LoadInst *> SplitLoads;
4847 const DataLayout &DL = AI.getDataLayout();
4848 for (LoadInst *LI : Loads) {
4849 SplitLoads.clear();
4850
4851 auto &Offsets = SplitOffsetsMap[LI];
4852 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4853 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4854 "Load must have type size equal to store size");
4855 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4856 "Load must be >= slice size");
4857
4858 uint64_t BaseOffset = Offsets.S->beginOffset();
4859 assert(BaseOffset + SliceSize > BaseOffset &&
4860 "Cannot represent alloca access size using 64-bit integers!");
4861
4862 Instruction *BasePtr = cast<Instruction>(Val: LI->getPointerOperand());
4863 IRB.SetInsertPoint(LI);
4864
4865 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4866
4867 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4868 int Idx = 0, Size = Offsets.Splits.size();
4869 for (;;) {
4870 auto *PartTy = Type::getIntNTy(C&: LI->getContext(), N: PartSize * 8);
4871 auto AS = LI->getPointerAddressSpace();
4872 auto *PartPtrTy = LI->getPointerOperandType();
4873 LoadInst *PLoad = IRB.CreateAlignedLoad(
4874 Ty: PartTy,
4875 Ptr: getAdjustedPtr(IRB, DL, Ptr: BasePtr,
4876 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
4877 PointerTy: PartPtrTy, NamePrefix: BasePtr->getName() + "."),
4878 Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
4879 /*IsVolatile*/ isVolatile: false, Name: LI->getName());
4880 PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4881 LLVMContext::MD_access_group});
4882
4883 // Append this load onto the list of split loads so we can find it later
4884 // to rewrite the stores.
4885 SplitLoads.push_back(x: PLoad);
4886
4887 // Now build a new slice for the alloca.
4888 NewSlices.push_back(
4889 Elt: Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4890 &PLoad->getOperandUse(i: PLoad->getPointerOperandIndex()),
4891 /*IsSplittable*/ false, nullptr));
4892 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4893 << ", " << NewSlices.back().endOffset()
4894 << "): " << *PLoad << "\n");
4895
4896 // See if we've handled all the splits.
4897 if (Idx >= Size)
4898 break;
4899
4900 // Set up the next partition.
4901 PartOffset = Offsets.Splits[Idx];
4902 ++Idx;
4903 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4904 }
4905
4906 // Now that we have the split loads, do the slow walk over all uses of the
4907 // load and rewrite them as split stores, or save the split loads to use
4908 // below if the store is going to be split there anyways.
4909 bool DeferredStores = false;
4910 for (User *LU : LI->users()) {
4911 StoreInst *SI = cast<StoreInst>(Val: LU);
4912 if (!Stores.empty() && SplitOffsetsMap.count(Val: SI)) {
4913 DeferredStores = true;
4914 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4915 << "\n");
4916 continue;
4917 }
4918
4919 Value *StoreBasePtr = SI->getPointerOperand();
4920 IRB.SetInsertPoint(SI);
4921 AAMDNodes AATags = SI->getAAMetadata();
4922
4923 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4924
4925 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4926 LoadInst *PLoad = SplitLoads[Idx];
4927 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4928 auto *PartPtrTy = SI->getPointerOperandType();
4929
4930 auto AS = SI->getPointerAddressSpace();
4931 StoreInst *PStore = IRB.CreateAlignedStore(
4932 Val: PLoad,
4933 Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
4934 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
4935 PointerTy: PartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
4936 Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
4937 /*IsVolatile*/ isVolatile: false);
4938 PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4939 LLVMContext::MD_access_group,
4940 LLVMContext::MD_DIAssignID});
4941
4942 if (AATags)
4943 PStore->setAAMetadata(
4944 AATags.adjustForAccess(Offset: PartOffset, AccessTy: PLoad->getType(), DL));
4945 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4946 }
4947
4948 // We want to immediately iterate on any allocas impacted by splitting
4949 // this store, and we have to track any promotable alloca (indicated by
4950 // a direct store) as needing to be resplit because it is no longer
4951 // promotable.
4952 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: StoreBasePtr)) {
4953 ResplitPromotableAllocas.insert(Ptr: OtherAI);
4954 Worklist.insert(X: OtherAI);
4955 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4956 Val: StoreBasePtr->stripInBoundsOffsets())) {
4957 Worklist.insert(X: OtherAI);
4958 }
4959
4960 // Mark the original store as dead.
4961 DeadInsts.push_back(Elt: SI);
4962 }
4963
4964 // Save the split loads if there are deferred stores among the users.
4965 if (DeferredStores)
4966 SplitLoadsMap.insert(KV: std::make_pair(x&: LI, y: std::move(SplitLoads)));
4967
4968 // Mark the original load as dead and kill the original slice.
4969 DeadInsts.push_back(Elt: LI);
4970 Offsets.S->kill();
4971 }
4972
4973 // Second, we rewrite all of the split stores. At this point, we know that
4974 // all loads from this alloca have been split already. For stores of such
4975 // loads, we can simply look up the pre-existing split loads. For stores of
4976 // other loads, we split those loads first and then write split stores of
4977 // them.
4978 for (StoreInst *SI : Stores) {
4979 auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4980 IntegerType *Ty = cast<IntegerType>(Val: LI->getType());
4981 assert(Ty->getBitWidth() % 8 == 0);
4982 uint64_t StoreSize = Ty->getBitWidth() / 8;
4983 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4984
4985 auto &Offsets = SplitOffsetsMap[SI];
4986 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4987 "Slice size should always match load size exactly!");
4988 uint64_t BaseOffset = Offsets.S->beginOffset();
4989 assert(BaseOffset + StoreSize > BaseOffset &&
4990 "Cannot represent alloca access size using 64-bit integers!");
4991
4992 Value *LoadBasePtr = LI->getPointerOperand();
4993 Instruction *StoreBasePtr = cast<Instruction>(Val: SI->getPointerOperand());
4994
4995 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4996
4997 // Check whether we have an already split load.
4998 auto SplitLoadsMapI = SplitLoadsMap.find(Val: LI);
4999 std::vector<LoadInst *> *SplitLoads = nullptr;
5000 if (SplitLoadsMapI != SplitLoadsMap.end()) {
5001 SplitLoads = &SplitLoadsMapI->second;
5002 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
5003 "Too few split loads for the number of splits in the store!");
5004 } else {
5005 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
5006 }
5007
5008 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
5009 int Idx = 0, Size = Offsets.Splits.size();
5010 for (;;) {
5011 auto *PartTy = Type::getIntNTy(C&: Ty->getContext(), N: PartSize * 8);
5012 auto *LoadPartPtrTy = LI->getPointerOperandType();
5013 auto *StorePartPtrTy = SI->getPointerOperandType();
5014
5015 // Either lookup a split load or create one.
5016 LoadInst *PLoad;
5017 if (SplitLoads) {
5018 PLoad = (*SplitLoads)[Idx];
5019 } else {
5020 IRB.SetInsertPoint(LI);
5021 auto AS = LI->getPointerAddressSpace();
5022 PLoad = IRB.CreateAlignedLoad(
5023 Ty: PartTy,
5024 Ptr: getAdjustedPtr(IRB, DL, Ptr: LoadBasePtr,
5025 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
5026 PointerTy: LoadPartPtrTy, NamePrefix: LoadBasePtr->getName() + "."),
5027 Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
5028 /*IsVolatile*/ isVolatile: false, Name: LI->getName());
5029 PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
5030 LLVMContext::MD_access_group});
5031 }
5032
5033 // And store this partition.
5034 IRB.SetInsertPoint(SI);
5035 auto AS = SI->getPointerAddressSpace();
5036 StoreInst *PStore = IRB.CreateAlignedStore(
5037 Val: PLoad,
5038 Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
5039 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
5040 PointerTy: StorePartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
5041 Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
5042 /*IsVolatile*/ isVolatile: false);
5043 PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
5044 LLVMContext::MD_access_group});
5045
5046 // Now build a new slice for the alloca.
5047 // ProtectedFieldDisc==nullptr is a lie, but it doesn't matter because we
5048 // already determined that all accesses are consistent.
5049 NewSlices.push_back(
5050 Elt: Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5051 &PStore->getOperandUse(i: PStore->getPointerOperandIndex()),
5052 /*IsSplittable*/ false, nullptr));
5053 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5054 << ", " << NewSlices.back().endOffset()
5055 << "): " << *PStore << "\n");
5056 if (!SplitLoads) {
5057 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5058 }
5059
5060 // See if we've finished all the splits.
5061 if (Idx >= Size)
5062 break;
5063
5064 // Set up the next partition.
5065 PartOffset = Offsets.Splits[Idx];
5066 ++Idx;
5067 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
5068 }
5069
5070 // We want to immediately iterate on any allocas impacted by splitting
5071 // this load, which is only relevant if it isn't a load of this alloca and
5072 // thus we didn't already split the loads above. We also have to keep track
5073 // of any promotable allocas we split loads on as they can no longer be
5074 // promoted.
5075 if (!SplitLoads) {
5076 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: LoadBasePtr)) {
5077 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5078 ResplitPromotableAllocas.insert(Ptr: OtherAI);
5079 Worklist.insert(X: OtherAI);
5080 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5081 Val: LoadBasePtr->stripInBoundsOffsets())) {
5082 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5083 Worklist.insert(X: OtherAI);
5084 }
5085 }
5086
5087 // Mark the original store as dead now that we've split it up and kill its
5088 // slice. Note that we leave the original load in place unless this store
5089 // was its only use. It may in turn be split up if it is a load from some
5090 // other alloca, but it may be a normal load. This may introduce
5091 // redundant loads, but where those can be merged the rest of the optimizer
5092 // should handle the merging, and this uncovers SSA splits which is more
5093 // important. In practice, the original loads will almost always be fully
5094 // split and removed eventually, and the splits will be merged by any
5095 // trivial CSE, including instcombine.
5096 if (LI->hasOneUse()) {
5097 assert(*LI->user_begin() == SI && "Single use isn't this store!");
5098 DeadInsts.push_back(Elt: LI);
5099 }
5100 DeadInsts.push_back(Elt: SI);
5101 Offsets.S->kill();
5102 }
5103
5104 // Remove the killed slices that have been pre-split.
5105 llvm::erase_if(C&: AS, P: [](const Slice &S) { return S.isDead(); });
5106
5107 // Insert our new slices. This will sort and merge them into the sorted
5108 // sequence.
5109 AS.insert(NewSlices);
5110
5111 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5112#ifndef NDEBUG
5113 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5114 LLVM_DEBUG(AS.print(dbgs(), I, " "));
5115#endif
5116
5117 // Finally, don't try to promote any allocas that now require re-splitting.
5118 // They have already been added to the worklist above.
5119 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5120
5121 return true;
5122}
5123
5124/// Select a partition type for an alloca partition.
5125///
5126/// Try to compute a friendly type for this partition of the alloca. This
5127/// won't always succeed, in which case we fall back to a legal integer type
5128/// or an i8 array of an appropriate size.
5129///
5130/// \returns A tuple with the following elements:
5131/// - PartitionType: The computed type for this partition.
5132/// - IsIntegerWideningViable: True if integer widening promotion is used.
5133/// - VectorType: The vector type if vector promotion is used, otherwise
5134/// nullptr.
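///
/// As a rough illustration (not tied to a specific test): a 16-byte partition
/// whose slices are all <4 x float> loads and stores is expected to come back
/// as {<4 x float>, false, <4 x float>}, while a 4-byte partition touched only
/// through plain i32 loads and stores would typically come back as
/// {i32, true, nullptr} on targets where i32 is a legal integer type.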
5135static std::tuple<Type *, bool, VectorType *>
5136selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI,
5137 LLVMContext &C) {
5138 // First check if the partition is viable for vector promotion.
5139 //
5140 // We prefer vector promotion over integer widening promotion when:
5141 // - The vector element type is a floating-point type.
5142 // - All the loads/stores to the alloca are vector loads/stores to the
5143 // entire alloca or load/store a single element of the vector.
5144 //
5145 // Otherwise when there is an integer vector with mixed type loads/stores we
5146 // prefer integer widening promotion because it's more likely the user is
5147 // doing bitwise arithmetic and we generate better code.
5148 VectorType *VecTy =
5149 isVectorPromotionViable(P, DL, VScale: AI.getFunction()->getVScaleValue());
5150 // If the vector element type is a floating-point type, we prefer vector
5151 // promotion. If the vector has one element, let the below code select
5152 // whether we promote with the vector or scalar.
5153 if (VecTy && VecTy->getElementType()->isFloatingPointTy() &&
5154 VecTy->getElementCount().getFixedValue() > 1)
5155 return {VecTy, false, VecTy};
5156
5157 // Check if there is a common type that all slices of the partition use that
5158 // spans the partition.
5159 auto [CommonUseTy, LargestIntTy] =
5160 findCommonType(B: P.begin(), E: P.end(), EndOffset: P.endOffset());
5161 if (CommonUseTy) {
5162 TypeSize CommonUseSize = DL.getTypeAllocSize(Ty: CommonUseTy);
5163 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
5164 // We prefer vector promotion here because if vector promotion is viable
5165 // and there is a common type used, then it implies the second listed
5166 // condition for preferring vector promotion is true.
5167 if (VecTy)
5168 return {VecTy, false, VecTy};
5169 return {CommonUseTy, isIntegerWideningViable(P, AllocaTy: CommonUseTy, DL),
5170 nullptr};
5171 }
5172 }
5173
5174 // Can we find an appropriate subtype in the original allocated
5175 // type?
5176 if (Type *TypePartitionTy = getTypePartition(DL, Ty: AI.getAllocatedType(),
5177 Offset: P.beginOffset(), Size: P.size())) {
5178 // If the partition is an integer array that can be spanned by a legal
5179 // integer type, prefer to represent it as a legal integer type because
5180 // it's more likely to be promotable.
5181 if (TypePartitionTy->isArrayTy() &&
5182 TypePartitionTy->getArrayElementType()->isIntegerTy() &&
5183 DL.isLegalInteger(Width: P.size() * 8))
5184 TypePartitionTy = Type::getIntNTy(C, N: P.size() * 8);
5185 // There was no common type used, so we prefer integer widening promotion.
5186 if (isIntegerWideningViable(P, AllocaTy: TypePartitionTy, DL))
5187 return {TypePartitionTy, true, nullptr};
5188 if (VecTy)
5189 return {VecTy, false, VecTy};
5190 // If we couldn't promote with TypePartitionTy, try with the largest
5191 // integer type used.
5192 if (LargestIntTy &&
5193 DL.getTypeAllocSize(Ty: LargestIntTy).getFixedValue() >= P.size() &&
5194 isIntegerWideningViable(P, AllocaTy: LargestIntTy, DL))
5195 return {LargestIntTy, true, nullptr};
5196
5197 // Fallback to TypePartitionTy and we probably won't promote.
5198 return {TypePartitionTy, false, nullptr};
5199 }
5200
5201 // Select the largest integer type used if it spans the partition.
5202 if (LargestIntTy &&
5203 DL.getTypeAllocSize(Ty: LargestIntTy).getFixedValue() >= P.size())
5204 return {LargestIntTy, false, nullptr};
5205
5206 // Select a legal integer type if it spans the partition.
5207 if (DL.isLegalInteger(Width: P.size() * 8))
5208 return {Type::getIntNTy(C, N: P.size() * 8), false, nullptr};
5209
5210 // Fallback to an i8 array.
5211 return {ArrayType::get(ElementType: Type::getInt8Ty(C), NumElements: P.size()), false, nullptr};
5212}
5213
5214/// Rewrite an alloca partition's users.
5215///
5216/// This routine drives both of the rewriting goals of the SROA pass. It tries
5217/// to rewrite uses of an alloca partition to be conducive for SSA value
5218/// promotion. If the partition needs a new, more refined alloca, this will
5219/// build that new alloca, preserving as much type information as possible, and
5220/// rewrite the uses of the old alloca to point at the new one and have the
5221/// appropriate new offsets. It also evaluates how successful the rewrite was
5222/// at enabling promotion and if it was successful queues the alloca to be
5223/// promoted.
5224std::pair<AllocaInst *, uint64_t>
5225SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) {
5226 const DataLayout &DL = AI.getDataLayout();
5227 // Select the type for the new alloca that spans the partition.
5228 auto [PartitionTy, IsIntegerWideningViable, VecTy] =
5229 selectPartitionType(P, DL, AI, C&: *C);
5230
5231 // Check for the case where we're going to rewrite to a new alloca of the
5232 // exact same type as the original, and with the same access offsets. In that
5233 // case, re-use the existing alloca, but still run through the rewriter to
5234 // perform phi and select speculation.
5235 // P.beginOffset() can be non-zero even with the same type in a case with
5236 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5237 AllocaInst *NewAI;
5238 if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == 0) {
5239 NewAI = &AI;
5240 // FIXME: We should be able to bail at this point with "nothing changed".
5241 // FIXME: We might want to defer PHI speculation until after here.
5242 // FIXME: return nullptr;
5243 } else {
5244 // Make sure the alignment is compatible with P.beginOffset().
5245 const Align Alignment = commonAlignment(A: AI.getAlign(), Offset: P.beginOffset());
5246 // If we will get at least this much alignment from the type alone, leave
5247 // the alloca's alignment unconstrained.
5248 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(Ty: PartitionTy);
5249 NewAI = new AllocaInst(
5250 PartitionTy, AI.getAddressSpace(), nullptr,
5251 IsUnconstrained ? DL.getPrefTypeAlign(Ty: PartitionTy) : Alignment,
5252 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5253 AI.getIterator());
5254 // Copy the old AI debug location over to the new one.
5255 NewAI->setDebugLoc(AI.getDebugLoc());
5256 ++NumNewAllocas;
5257 }
5258
5259 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5260 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5261
5262 // Track the high watermark on the worklist as it is only relevant for
5263 // promoted allocas. We will reset it to this point if the alloca is not in
5264 // fact scheduled for promotion.
5265 unsigned PPWOldSize = PostPromotionWorklist.size();
5266 unsigned NumUses = 0;
5267 SmallSetVector<PHINode *, 8> PHIUsers;
5268 SmallSetVector<SelectInst *, 8> SelectUsers;
5269
5270 AllocaSliceRewriter Rewriter(
5271 DL, AS, *this, AI, *NewAI, PartitionTy, P.beginOffset(), P.endOffset(),
5272 IsIntegerWideningViable, VecTy, PHIUsers, SelectUsers);
5273 bool Promotable = true;
5274 // Check whether we can have tree-structured merge.
5275 if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
5276 NumUses += DeletedValues->size() + 1;
5277 for (Value *V : *DeletedValues)
5278 DeadInsts.push_back(Elt: V);
5279 } else {
5280 for (Slice *S : P.splitSliceTails()) {
5281 Promotable &= Rewriter.visit(I: S);
5282 ++NumUses;
5283 }
5284 for (Slice &S : P) {
5285 Promotable &= Rewriter.visit(I: &S);
5286 ++NumUses;
5287 }
5288 }
5289
5290 NumAllocaPartitionUses += NumUses;
5291 MaxUsesPerAllocaPartition.updateMax(V: NumUses);
5292
5293 // Now that we've processed all the slices in the new partition, check if any
5294 // PHIs or Selects would block promotion.
5295 for (PHINode *PHI : PHIUsers)
5296 if (!isSafePHIToSpeculate(PN&: *PHI)) {
5297 Promotable = false;
5298 PHIUsers.clear();
5299 SelectUsers.clear();
5300 break;
5301 }
5302
5303 SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
5304 NewSelectsToRewrite;
5305 NewSelectsToRewrite.reserve(N: SelectUsers.size());
5306 for (SelectInst *Sel : SelectUsers) {
5307 std::optional<RewriteableMemOps> Ops =
5308 isSafeSelectToSpeculate(SI&: *Sel, PreserveCFG);
5309 if (!Ops) {
5310 Promotable = false;
5311 PHIUsers.clear();
5312 SelectUsers.clear();
5313 NewSelectsToRewrite.clear();
5314 break;
5315 }
5316 NewSelectsToRewrite.emplace_back(Args: std::make_pair(x&: Sel, y&: *Ops));
5317 }
5318
5319 if (Promotable) {
5320 for (Use *U : AS.getDeadUsesIfPromotable()) {
5321 auto *OldInst = dyn_cast<Instruction>(Val: U->get());
5322 Value::dropDroppableUse(U&: *U);
5323 if (OldInst)
5324 if (isInstructionTriviallyDead(I: OldInst))
5325 DeadInsts.push_back(Elt: OldInst);
5326 }
5327 if (PHIUsers.empty() && SelectUsers.empty()) {
5328 // Promote the alloca.
5329 PromotableAllocas.insert(X: NewAI);
5330 } else {
5331 // If we have either PHIs or Selects to speculate, add them to those
5332      // worklists and re-queue the new alloca so that we promote it on the
5333 // next iteration.
5334 SpeculatablePHIs.insert_range(R&: PHIUsers);
5335 SelectsToRewrite.reserve(NumEntries: SelectsToRewrite.size() +
5336 NewSelectsToRewrite.size());
5337 for (auto &&KV : llvm::make_range(
5338 x: std::make_move_iterator(i: NewSelectsToRewrite.begin()),
5339 y: std::make_move_iterator(i: NewSelectsToRewrite.end())))
5340 SelectsToRewrite.insert(KV: std::move(KV));
5341 Worklist.insert(X: NewAI);
5342 }
5343 } else {
5344 // Drop any post-promotion work items if promotion didn't happen.
5345 while (PostPromotionWorklist.size() > PPWOldSize)
5346 PostPromotionWorklist.pop_back();
5347
5348 // We couldn't promote and we didn't create a new partition, nothing
5349 // happened.
5350 if (NewAI == &AI)
5351 return {nullptr, 0};
5352
5353 // If we can't promote the alloca, iterate on it to check for new
5354 // refinements exposed by splitting the current alloca. Don't iterate on an
5355 // alloca which didn't actually change and didn't get promoted.
5356 Worklist.insert(X: NewAI);
5357 }
5358
5359 return {NewAI, DL.getTypeSizeInBits(Ty: PartitionTy).getFixedValue()};
5360}
5361
5362// There isn't a shared interface to get the "address" parts out of a
5363// dbg.declare and dbg.assign, so provide some wrappers.
5364bool isKillAddress(const DbgVariableRecord *DVR) {
5365 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5366 return DVR->isKillAddress();
5367 return DVR->isKillLocation();
5368}
5369
5370const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5371 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5372 return DVR->getAddressExpression();
5373 return DVR->getExpression();
5374}
5375
5376/// Create or replace an existing fragment in a DIExpression with \p Frag.
5377/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5378/// operation, add \p BitExtractOffset to the offset part.
5379///
5380/// Returns the new expression, or nullptr if this fails (see details below).
5381///
5382/// This function is similar to DIExpression::createFragmentExpression except
5383/// for 3 important distinctions:
5384/// 1. The new fragment isn't relative to an existing fragment.
5385/// 2. It assumes the computed location is a memory location. This means we
5386/// don't need to perform checks that creating the fragment preserves the
5387/// expression semantics.
5388/// 3. Existing extract_bits are modified independently of fragment changes
5389/// using \p BitExtractOffset. A change to the fragment offset or size
5390/// may affect a bit extract. But a bit extract offset can change
5391/// independently of the fragment dimensions.
5392///
5393/// Returns the new expression, or nullptr if one couldn't be created.
5394/// Ideally this is only used to signal that a bit-extract has become
5395/// zero-sized (and thus the new debug record has no size and can be
5396/// dropped), however, it fails for other reasons too - see the FIXME below.
5397///
5398/// FIXME: To keep the change that introduces this function NFC it bails
5399/// in some situations unnecessarily, e.g. when fragment and bit extract
5400/// sizes differ.
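///
/// Mechanically (the expressions are illustrative), rewriting
///   !DIExpression(DW_OP_LLVM_extract_bits_zext, 16, 8)
/// with a sufficiently large \p Frag and a \p BitExtractOffset of -16 yields
///   !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8)
/// with no DW_OP_LLVM_fragment appended, since the presence of a bit extract
/// suppresses the fragment op.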
5401static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5402 DIExpression::FragmentInfo Frag,
5403 int64_t BitExtractOffset) {
5404 SmallVector<uint64_t, 8> Ops;
5405 bool HasFragment = false;
5406 bool HasBitExtract = false;
5407
5408 for (auto &Op : Expr->expr_ops()) {
5409 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5410 HasFragment = true;
5411 continue;
5412 }
5413 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5414 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5415 HasBitExtract = true;
5416 int64_t ExtractOffsetInBits = Op.getArg(I: 0);
5417 int64_t ExtractSizeInBits = Op.getArg(I: 1);
5418
5419 // DIExpression::createFragmentExpression doesn't know how to handle
5420 // a fragment that is smaller than the extract. Copy the behaviour
5421 // (bail) to avoid non-NFC changes.
5422 // FIXME: Don't do this.
5423 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5424 return nullptr;
5425
5426 assert(BitExtractOffset <= 0);
5427 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5428
5429 // DIExpression::createFragmentExpression doesn't know what to do
5430 // if the new extract starts "outside" the existing one. Copy the
5431 // behaviour (bail) to avoid non-NFC changes.
5432 // FIXME: Don't do this.
5433 if (AdjustedOffset < 0)
5434 return nullptr;
5435
5436 Ops.push_back(Elt: Op.getOp());
5437 Ops.push_back(Elt: std::max<int64_t>(a: 0, b: AdjustedOffset));
5438 Ops.push_back(Elt: ExtractSizeInBits);
5439 continue;
5440 }
5441 Op.appendToVector(V&: Ops);
5442 }
5443
5444 // Unsupported by createFragmentExpression, so don't support it here yet to
5445 // preserve NFC-ness.
5446 if (HasFragment && HasBitExtract)
5447 return nullptr;
5448
5449 if (!HasBitExtract) {
5450 Ops.push_back(Elt: dwarf::DW_OP_LLVM_fragment);
5451 Ops.push_back(Elt: Frag.OffsetInBits);
5452 Ops.push_back(Elt: Frag.SizeInBits);
5453 }
5454 return DIExpression::get(Context&: Expr->getContext(), Elements: Ops);
5455}
5456
5457/// Insert a new DbgRecord.
5458/// \p Orig Original to copy record type, debug loc and variable from, and
5459/// additionally value and value expression for dbg_assign records.
5460/// \p NewAddr Location's new base address.
5461/// \p NewAddrExpr New expression to apply to address.
5462/// \p BeforeInst Insert position.
5463/// \p NewFragment New fragment (absolute, non-relative).
5464/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5465static void
5466insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5467 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5468 std::optional<DIExpression::FragmentInfo> NewFragment,
5469 int64_t BitExtractAdjustment) {
5470 (void)DIB;
5471
5472 // A dbg_assign puts fragment info in the value expression only. The address
5473 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5474 // new fragment info into NewAddrExpr (as it only has one expression).
5475 DIExpression *NewFragmentExpr =
5476 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5477 if (NewFragment)
5478 NewFragmentExpr = createOrReplaceFragment(Expr: NewFragmentExpr, Frag: *NewFragment,
5479 BitExtractOffset: BitExtractAdjustment);
5480 if (!NewFragmentExpr)
5481 return;
5482
5483 if (Orig->isDbgDeclare()) {
5484 DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5485 Address: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
5486 BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
5487 Here: BeforeInst->getIterator());
5488 return;
5489 }
5490
5491 if (Orig->isDbgValue()) {
5492 DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5493 Location: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
5494 // Drop debug information if the expression doesn't start with a
5495 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5496    // describes the address of the alloca rather than the value inside the alloca.
5497 if (!NewFragmentExpr->startsWithDeref())
5498 DVR->setKillAddress();
5499 BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
5500 Here: BeforeInst->getIterator());
5501 return;
5502 }
5503
5504 // Apply a DIAssignID to the store if it doesn't already have it.
5505 if (!NewAddr->hasMetadata(KindID: LLVMContext::MD_DIAssignID)) {
5506 NewAddr->setMetadata(KindID: LLVMContext::MD_DIAssignID,
5507 Node: DIAssignID::getDistinct(Context&: NewAddr->getContext()));
5508 }
5509
5510 DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5511 LinkedInstr: NewAddr, Val: Orig->getValue(), Variable: Orig->getVariable(), Expression: NewFragmentExpr, Address: NewAddr,
5512 AddressExpression: NewAddrExpr, DI: Orig->getDebugLoc());
5513 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5514 (void)NewAssign;
5515}

/// Walks the slices of an alloca and forms partitions based on them,
/// rewriting each of their uses.
bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
  if (AS.begin() == AS.end())
    return false;

  unsigned NumPartitions = 0;
  bool Changed = false;
  const DataLayout &DL = AI.getModule()->getDataLayout();

  // First try to pre-split loads and stores.
  Changed |= presplitLoadsAndStores(AI, AS);

  // Now that we have identified any pre-splitting opportunities,
  // mark loads and stores unsplittable except for the following case.
  // We leave a slice splittable if all other slices are disjoint or fully
  // included in the slice, such as whole-alloca loads and stores.
  // If we fail to split these during pre-splitting, we want to force them
  // to be rewritten into a partition.
  bool IsSorted = true;

  uint64_t AllocaSize = AI.getAllocationSize(DL)->getFixedValue();
  const uint64_t MaxBitVectorSize = 1024;
  if (AllocaSize <= MaxBitVectorSize) {
    // If a byte boundary is included in any load or store, a slice starting or
    // ending at the boundary is not splittable.
    SmallBitVector SplittableOffset(AllocaSize + 1, true);
    for (Slice &S : AS)
      for (unsigned O = S.beginOffset() + 1;
           O < S.endOffset() && O < AllocaSize; O++)
        SplittableOffset.reset(O);
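    // Worked example (hypothetical): for an 8-byte alloca with two 4-byte
    // stores covering [0, 4) and [4, 8) and a splittable whole-alloca load
    // covering [0, 8), the interior offsets 1-3 and 5-7 are reset while 0, 4
    // and 8 stay set. The load begins and ends on offsets that remain set, so
    // it stays splittable and can be rewritten across the boundary at 4.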

    for (Slice &S : AS) {
      if (!S.isSplittable())
        continue;

      if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
          (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
        continue;

      if (isa<LoadInst>(S.getUse()->getUser()) ||
          isa<StoreInst>(S.getUse()->getUser())) {
        S.makeUnsplittable();
        IsSorted = false;
      }
    }
  } else {
    // We only allow whole-alloca splittable loads and stores
    // for a large alloca to avoid creating an overly large BitVector.
    for (Slice &S : AS) {
      if (!S.isSplittable())
        continue;

      if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
        continue;

      if (isa<LoadInst>(S.getUse()->getUser()) ||
          isa<StoreInst>(S.getUse()->getUser())) {
        S.makeUnsplittable();
        IsSorted = false;
      }
    }
  }

  if (!IsSorted)
    llvm::stable_sort(AS);

  /// Describes the allocas introduced by rewritePartition in order to migrate
  /// the debug info.
  struct Fragment {
    AllocaInst *Alloca;
    uint64_t Offset;
    uint64_t Size;
    Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
        : Alloca(AI), Offset(O), Size(S) {}
  };
  SmallVector<Fragment, 4> Fragments;

  // Rewrite each partition.
  for (auto &P : AS.partitions()) {
    auto [NewAI, ActiveBits] = rewritePartition(AI, AS, P);
    if (NewAI) {
      Changed = true;
      if (NewAI != &AI) {
        uint64_t SizeOfByte = 8;
        // Don't include any padding.
        uint64_t Size = std::min(ActiveBits, P.size() * SizeOfByte);
        Fragments.push_back(
            Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
      }
    }
    ++NumPartitions;
  }
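  // Illustrative example (hypothetical names): splitting an {i32, i32, i64}
  // alloca %a into %a.sroa.0 for bytes [0, 4) and %a.sroa.8 for bytes [8, 16)
  // records Fragments = {{%a.sroa.0, 0, 32}, {%a.sroa.8, 64, 64}} (offsets and
  // sizes in bits), assuming no trailing padding bits were trimmed by the
  // std::min above.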

  NumAllocaPartitions += NumPartitions;
  MaxPartitionsPerAlloca.updateMax(NumPartitions);

  // Migrate debug information from the old alloca to the new alloca(s)
  // and the individual partitions.
  auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
    // Can't overlap with undef memory.
    if (isKillAddress(DbgVariable))
      return;

    const Value *DbgPtr = DbgVariable->getAddress();
    DIExpression::FragmentInfo VarFrag =
        DbgVariable->getFragmentOrEntireVariable();
    // Get the address expression constant offset if one exists and the ops
    // that come after it.
    int64_t CurrentExprOffsetInBytes = 0;
    SmallVector<uint64_t> PostOffsetOps;
    if (!getAddressExpression(DbgVariable)
             ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
      return; // Couldn't interpret this DIExpression - drop the var.

    // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
    int64_t ExtractOffsetInBits = 0;
    for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
      if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
          Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
        ExtractOffsetInBits = Op.getArg(0);
        break;
      }
    }
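    // E.g. (hypothetical) an address expression containing
    // DW_OP_LLVM_extract_bits_zext, 8, 16 describes bits [8, 24) of the
    // location; ExtractOffsetInBits would be 8 in that case.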

    DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
    for (auto Fragment : Fragments) {
      int64_t OffsetFromLocationInBits;
      std::optional<DIExpression::FragmentInfo> NewDbgFragment;
      // Find the variable fragment that the new alloca slice covers.
      // Drop debug info for this variable fragment if we can't compute an
      // intersection between it and the alloca slice.
      if (!DIExpression::calculateFragmentIntersect(
              DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
              CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
              NewDbgFragment, OffsetFromLocationInBits))
        continue; // Do not migrate this fragment to this slice.

      // A zero-sized fragment indicates there's no intersection between the
      // variable fragment and the alloca slice. Skip this slice for this
      // variable fragment.
      if (NewDbgFragment && !NewDbgFragment->SizeInBits)
        continue; // Do not migrate this fragment to this slice.

      // No fragment indicates DbgVariable's variable or fragment exactly
      // overlaps the slice; copy its fragment (or nullopt if there isn't one).
      if (!NewDbgFragment)
        NewDbgFragment = DbgVariable->getFragment();

      // Reduce the new expression offset by the bit-extract offset since
      // we'll be keeping that.
      int64_t OffsetFromNewAllocaInBits =
          OffsetFromLocationInBits - ExtractOffsetInBits;
      // We need to adjust an existing bit extract if the offset expression
      // can't eat the slack (i.e., if the new offset would be negative).
      int64_t BitExtractOffset =
          std::min<int64_t>(0, OffsetFromNewAllocaInBits);
      // The magnitude of a negative value indicates the number of bits into
      // the existing variable fragment at which the memory region begins. The
      // new variable fragment already excludes those bits - the new DbgPtr
      // offset only needs to be applied if it's positive.
      OffsetFromNewAllocaInBits =
          std::max(int64_t(0), OffsetFromNewAllocaInBits);

      // Rebuild the expression:
      // {Offset(OffsetFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
      // Add NewDbgFragment later, because dbg.assigns don't want it in the
      // address expression but in the value expression instead.
      DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
      if (OffsetFromNewAllocaInBits > 0) {
        int64_t OffsetInBytes = (OffsetFromNewAllocaInBits + 7) / 8;
        NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
      }
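      // Worked example (hypothetical numbers): with OffsetFromLocationInBits
      // = 40 and ExtractOffsetInBits = 8, OffsetFromNewAllocaInBits becomes
      // 32, BitExtractOffset stays 0, and DW_OP_plus_uconst 4 is prepended.
      // With OffsetFromLocationInBits = 0 and ExtractOffsetInBits = 8, the
      // difference is -8, so no byte offset is prepended and BitExtractOffset
      // = -8 shifts the extract_bits op down when the fragment is rebuilt.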

      // Remove any existing intrinsics on the new alloca describing
      // the variable fragment.
      auto RemoveOne = [DbgVariable](auto *OldDII) {
        auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
          return LHS->getVariable() == RHS->getVariable() &&
                 LHS->getDebugLoc()->getInlinedAt() ==
                     RHS->getDebugLoc()->getInlinedAt();
        };
        if (SameVariableFragment(OldDII, DbgVariable))
          OldDII->eraseFromParent();
      };
      for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
      for_each(findDVRValues(Fragment.Alloca), RemoveOne);
      insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
                       NewDbgFragment, BitExtractOffset);
    }
  };

  // Migrate debug information from the old alloca to the new alloca(s)
  // and the individual partitions.
  for_each(findDVRDeclares(&AI), MigrateOne);
  for_each(findDVRValues(&AI), MigrateOne);
  for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);

  return Changed;
}

/// Clobber a use with poison, deleting the used value if it becomes dead.
void SROA::clobberUse(Use &U) {
  Value *OldV = U;
  // Replace the use with a poison value.
  U = PoisonValue::get(OldV->getType());

  // Check for this making an instruction dead. We have to garbage collect
  // all the dead instructions to ensure the uses of any alloca end up being
  // minimal.
  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
    if (isInstructionTriviallyDead(OldI)) {
      DeadInsts.push_back(OldI);
    }
}

/// A basic LoadAndStorePromoter that does not remove store nodes.
class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
public:
  BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
                            Type *ZeroType)
      : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
  bool shouldDelete(Instruction *I) const override {
    return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
  }

  Value *getValueToUseForAlloca(Instruction *I) const override {
    return UndefValue::get(ZeroType);
  }

private:
  Type *ZeroType;
};

bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
  // Look through each "partition", looking for slices with the same start/end
  // that do not overlap with any before them. The slices are sorted by
  // increasing beginOffset. We don't use AS.partitions(), as it will use a more
  // sophisticated algorithm that takes splittable slices into account.
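  // Illustrative example (hypothetical IR): if %a escapes read-only (say its
  // address is passed to a readonly call) but is otherwise only written by
  //   store i32 %v, ptr %a
  // and read by simple "load i32, ptr %a" instructions, each such load can be
  // rewritten to use %v (or a PHI of stored values) directly, while the stores
  // and the alloca itself are left in place for the read-only user.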
  LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
  bool AllSameAndValid = true;
  Type *PartitionType = nullptr;
  SmallVector<Instruction *> Insts;
  uint64_t BeginOffset = 0;
  uint64_t EndOffset = 0;

  auto Flush = [&]() {
    if (AllSameAndValid && !Insts.empty()) {
      LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
                        << EndOffset << ")\n");
      SmallVector<PHINode *, 4> NewPHIs;
      SSAUpdater SSA(&NewPHIs);
      Insts.push_back(&AI);
      BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
      Promoter.run(Insts);
    }
    AllSameAndValid = true;
    PartitionType = nullptr;
    Insts.clear();
  };

  for (Slice &S : AS) {
    auto *User = cast<Instruction>(S.getUse()->getUser());
    if (isAssumeLikeIntrinsic(User)) {
      LLVM_DEBUG({
        dbgs() << "Ignoring slice: ";
        AS.print(dbgs(), &S);
      });
      continue;
    }
    if (S.beginOffset() >= EndOffset) {
      Flush();
      BeginOffset = S.beginOffset();
      EndOffset = S.endOffset();
    } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
      if (AllSameAndValid) {
        LLVM_DEBUG({
          dbgs() << "Slice does not match range [" << BeginOffset << ", "
                 << EndOffset << ")";
          AS.print(dbgs(), &S);
        });
        AllSameAndValid = false;
      }
      EndOffset = std::max(EndOffset, S.endOffset());
      continue;
    }

    if (auto *LI = dyn_cast<LoadInst>(User)) {
      Type *UserTy = LI->getType();
      // LoadAndStorePromoter requires all the types to be the same.
      if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
        AllSameAndValid = false;
      PartitionType = UserTy;
      Insts.push_back(User);
    } else if (auto *SI = dyn_cast<StoreInst>(User)) {
      Type *UserTy = SI->getValueOperand()->getType();
      if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
        AllSameAndValid = false;
      PartitionType = UserTy;
      Insts.push_back(User);
    } else {
      AllSameAndValid = false;
    }
  }

  Flush();
  return true;
}

/// Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
std::pair<bool /*Changed*/, bool /*CFGChanged*/>
SROA::runOnAlloca(AllocaInst &AI) {
  bool Changed = false;
  bool CFGChanged = false;

  LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
  ++NumAllocasAnalyzed;

  // Special case dead allocas, as they're trivial.
  if (AI.use_empty()) {
    AI.eraseFromParent();
    Changed = true;
    return {Changed, CFGChanged};
  }
  const DataLayout &DL = AI.getDataLayout();

  // Skip alloca forms that this analysis can't handle.
  std::optional<TypeSize> Size = AI.getAllocationSize(DL);
  if (AI.isArrayAllocation() || !Size || Size->isScalable() || Size->isZero())
    return {Changed, CFGChanged};

  // First, split any FCA loads and stores touching this alloca to promote
  // better splitting and promotion opportunities.
  IRBuilderTy IRB(&AI);
  AggLoadStoreRewriter AggRewriter(DL, IRB);
  Changed |= AggRewriter.rewrite(AI);

  // Build the slices using a recursive instruction-visiting builder.
  AllocaSlices AS(DL, AI);
  LLVM_DEBUG(AS.print(dbgs()));
  if (AS.isEscaped())
    return {Changed, CFGChanged};

  if (AS.isEscapedReadOnly()) {
    Changed |= propagateStoredValuesToLoads(AI, AS);
    return {Changed, CFGChanged};
  }

  for (auto &P : AS.partitions()) {
    // For now, we can't split if a field is accessed both as a protected
    // field and as an unprotected one, because that would require introducing
    // sign and auth operations to convert between the protected and
    // non-protected uses, and this pass doesn't know how to do that. This
    // case is also unlikely to occur in normal code.
    std::optional<Value *> ProtectedFieldDisc;
    auto SliceHasMismatch = [&](Slice &S) {
      if (auto *II = dyn_cast<IntrinsicInst>(S.getUse()->getUser()))
        if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
            II->getIntrinsicID() == Intrinsic::lifetime_end)
          return false;
      if (!ProtectedFieldDisc)
        ProtectedFieldDisc = S.ProtectedFieldDisc;
      return *ProtectedFieldDisc != S.ProtectedFieldDisc;
    };
    for (Slice &S : P)
      if (SliceHasMismatch(S))
        return {Changed, CFGChanged};
    for (Slice *S : P.splitSliceTails())
      if (SliceHasMismatch(*S))
        return {Changed, CFGChanged};
  }

  // Delete all the dead users of this alloca before splitting and rewriting it.
  for (Instruction *DeadUser : AS.getDeadUsers()) {
    // Free up everything used by this instruction.
    for (Use &DeadOp : DeadUser->operands())
      clobberUse(DeadOp);

    // Now replace the uses of this instruction.
    DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));

    // And mark it for deletion.
    DeadInsts.push_back(DeadUser);
    Changed = true;
  }
  for (Use *DeadOp : AS.getDeadOperands()) {
    clobberUse(*DeadOp);
    Changed = true;
  }
  for (IntrinsicInst *PFPUser : AS.getPFPUsers()) {
    PFPUser->replaceAllUsesWith(PFPUser->getArgOperand(0));

    DeadInsts.push_back(PFPUser);
    Changed = true;
  }

  // No slices to split. Leave the dead alloca for a later pass to clean up.
  if (AS.begin() == AS.end())
    return {Changed, CFGChanged};

  Changed |= splitAlloca(AI, AS);

  LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
  while (!SpeculatablePHIs.empty())
    speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());

  LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
  auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
  while (!RemainingSelectsToRewrite.empty()) {
    const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
    CFGChanged |=
        rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
  }

  return {Changed, CFGChanged};
}

/// Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
/// minimize the problems of invalidated instruction pointers as such pointers
/// are used heavily in the intermediate stages of the algorithm.
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
bool SROA::deleteDeadInstructions(
    SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
  bool Changed = false;
  while (!DeadInsts.empty()) {
    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
    if (!I)
      continue;
    LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");

    // If the instruction is an alloca, find the possible dbg.declare connected
    // to it, and remove it too. We must do this before calling RAUW or we will
    // not be able to find it.
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
      DeletedAllocas.insert(AI);
      for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
        OldDII->eraseFromParent();
    }

    at::deleteAssignmentMarkers(I);
    I->replaceAllUsesWith(UndefValue::get(I->getType()));

    for (Use &Operand : I->operands())
      if (Instruction *U = dyn_cast<Instruction>(Operand)) {
        // Zero out the operand and see if it becomes trivially dead.
        Operand = nullptr;
        if (isInstructionTriviallyDead(U))
          DeadInsts.push_back(U);
      }

    ++NumDeleted;
    I->eraseFromParent();
    Changed = true;
  }
  return Changed;
}

/// Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
/// This function returns whether any promotion occurred.
bool SROA::promoteAllocas() {
  if (PromotableAllocas.empty())
    return false;

  if (SROASkipMem2Reg) {
    LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
  } else {
    LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
    NumPromoted += PromotableAllocas.size();
    PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
  }

  PromotableAllocas.clear();
  return true;
}

std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
  LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");

  const DataLayout &DL = F.getDataLayout();
  BasicBlock &EntryBB = F.getEntryBlock();
  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
       I != E; ++I) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
      std::optional<TypeSize> Size = AI->getAllocationSize(DL);
      if (Size && Size->isScalable() && isAllocaPromotable(AI))
        PromotableAllocas.insert(AI);
      else
        Worklist.insert(AI);
    }
  }

  bool Changed = false;
  bool CFGChanged = false;
  // A set of deleted alloca instruction pointers which should be removed from
  // the list of promotable allocas.
  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;

  do {
    while (!Worklist.empty()) {
      auto [IterationChanged, IterationCFGChanged] =
          runOnAlloca(*Worklist.pop_back_val());
      Changed |= IterationChanged;
      CFGChanged |= IterationCFGChanged;

      Changed |= deleteDeadInstructions(DeletedAllocas);

      // Remove the deleted allocas from various lists so that we don't try to
      // continue processing them.
      if (!DeletedAllocas.empty()) {
        Worklist.set_subtract(DeletedAllocas);
        PostPromotionWorklist.set_subtract(DeletedAllocas);
        PromotableAllocas.set_subtract(DeletedAllocas);
        DeletedAllocas.clear();
      }
    }

    Changed |= promoteAllocas();

    Worklist = PostPromotionWorklist;
    PostPromotionWorklist.clear();
  } while (!Worklist.empty());

  assert((!CFGChanged || Changed) && "Cannot modify only the CFG.");
  assert((!CFGChanged || !PreserveCFG) &&
         "Should not have modified the CFG when told to preserve it.");

  if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F) {
      RemoveRedundantDbgInstrs(&BB);
    }
  }

  return {Changed, CFGChanged};
}

PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  auto [Changed, CFGChanged] =
      SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  if (!CFGChanged)
    PA.preserveSet<CFGAnalyses>();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

void SROAPass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
      OS, MapClassName2PassName);
  OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
                                                 : "<modify-cfg>");
}

SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}

namespace {

/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
class SROALegacyPass : public FunctionPass {
  SROAOptions PreserveCFG;

public:
  static char ID;

  SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
      : FunctionPass(ID), PreserveCFG(PreserveCFG) {
    initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    AssumptionCache &AC =
        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
    auto [Changed, _] =
        SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

  StringRef getPassName() const override { return "SROA"; }
};

} // end anonymous namespace

char SROALegacyPass::ID = 0;

FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
  return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
                                        : SROAOptions::ModifyCFG);
}

INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
                      "Scalar Replacement Of Aggregates", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
                    false, false)