1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
/// Because this also performs alloca promotion, it can be thought of as
/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
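///
/// For illustration only (hypothetical IR, not taken from a test), a
/// promotable aggregate such as
///   %pair = alloca { i32, i32 }
///   %a = getelementptr inbounds { i32, i32 }, ptr %pair, i32 0, i32 0
///   store i32 %x, ptr %a
///   %v = load i32, ptr %a
/// is split into per-field allocas and then promoted, so %v simply becomes %x
/// and the memory traffic disappears.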
22///
23//===----------------------------------------------------------------------===//
24
25#include "llvm/Transforms/Scalar/SROA.h"
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
30#include "llvm/ADT/PointerIntPair.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
33#include "llvm/ADT/SmallBitVector.h"
34#include "llvm/ADT/SmallPtrSet.h"
35#include "llvm/ADT/SmallVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
40#include "llvm/ADT/iterator_range.h"
41#include "llvm/Analysis/AssumptionCache.h"
42#include "llvm/Analysis/DomTreeUpdater.h"
43#include "llvm/Analysis/GlobalsModRef.h"
44#include "llvm/Analysis/Loads.h"
45#include "llvm/Analysis/PtrUseVisitor.h"
46#include "llvm/Analysis/ValueTracking.h"
47#include "llvm/Config/llvm-config.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
50#include "llvm/IR/ConstantFolder.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
55#include "llvm/IR/DebugInfoMetadata.h"
56#include "llvm/IR/DerivedTypes.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Instructions.h"
64#include "llvm/IR/IntrinsicInst.h"
65#include "llvm/IR/LLVMContext.h"
66#include "llvm/IR/Metadata.h"
67#include "llvm/IR/Module.h"
68#include "llvm/IR/Operator.h"
69#include "llvm/IR/PassManager.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
75#include "llvm/InitializePasses.h"
76#include "llvm/Pass.h"
77#include "llvm/Support/Casting.h"
78#include "llvm/Support/CommandLine.h"
79#include "llvm/Support/Compiler.h"
80#include "llvm/Support/Debug.h"
81#include "llvm/Support/ErrorHandling.h"
82#include "llvm/Support/raw_ostream.h"
83#include "llvm/Transforms/Scalar.h"
84#include "llvm/Transforms/Utils/BasicBlockUtils.h"
85#include "llvm/Transforms/Utils/Local.h"
86#include "llvm/Transforms/Utils/PromoteMemToReg.h"
87#include "llvm/Transforms/Utils/SSAUpdater.h"
88#include <algorithm>
89#include <cassert>
90#include <cstddef>
91#include <cstdint>
92#include <cstring>
93#include <iterator>
94#include <string>
95#include <tuple>
96#include <utility>
97#include <variant>
98#include <vector>
99
100using namespace llvm;
101
102#define DEBUG_TYPE "sroa"
103
104STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
105STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
106STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
107STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
108STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
109STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
110STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
111STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
112STATISTIC(NumLoadsPredicated,
113 "Number of loads rewritten into predicated loads to allow promotion");
114STATISTIC(
115 NumStoresPredicated,
    "Number of stores rewritten into predicated stores to allow promotion");
117STATISTIC(NumDeleted, "Number of instructions deleted");
118STATISTIC(NumVectorized, "Number of vectorized aggregates");
119
120/// Disable running mem2reg during SROA in order to test or debug SROA.
121static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(Val: false),
122 cl::Hidden);
123namespace {
124
125class AllocaSliceRewriter;
126class AllocaSlices;
127class Partition;
128
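/// Tracks, for a load through a select, whether each hand of the select (its
/// true and false values) is known to be safe to speculate. A sketch of the
/// intended use: speculatability is recorded per hand via setAsSpeculatable()
/// and queried via isSpeculatable()/areAllSpeculatable() when deciding how to
/// rewrite the load.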
129class SelectHandSpeculativity {
130 unsigned char Storage = 0; // None are speculatable by default.
131 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit.
132 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
133public:
134 SelectHandSpeculativity() = default;
135 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
136 bool isSpeculatable(bool isTrueVal) const;
137 bool areAllSpeculatable() const;
138 bool areAnySpeculatable() const;
139 bool areNoneSpeculatable() const;
140 // For interop as int half of PointerIntPair.
141 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
142 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
143};
144static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
145
146using PossiblySpeculatableLoad =
147 PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
148using UnspeculatableStore = StoreInst *;
149using RewriteableMemOp =
150 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
151using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
152
153/// An optimization pass providing Scalar Replacement of Aggregates.
154///
155/// This pass takes allocations which can be completely analyzed (that is, they
156/// don't escape) and tries to turn them into scalar SSA values. There are
157/// a few steps to this process.
158///
159/// 1) It takes allocations of aggregates and analyzes the ways in which they
160/// are used to try to split them into smaller allocations, ideally of
161/// a single scalar data type. It will split up memcpy and memset accesses
162/// as necessary and try to isolate individual scalar accesses.
163/// 2) It will transform accesses into forms which are suitable for SSA value
164/// promotion. This can be replacing a memset with a scalar store of an
165/// integer value, or it can involve speculating operations on a PHI or
166/// select to be a PHI or select of the results.
167/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
168/// onto insert and extract operations on a vector value, and convert them to
169/// this form. By doing so, it will enable promotion of vector aggregates to
170/// SSA vector values.
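///
/// As a hypothetical illustration of step 3, accesses of the form
///   %v = alloca <4 x float>
///   %p = getelementptr inbounds <4 x float>, ptr %v, i64 0, i64 2
///   store float %f, ptr %p
/// can be rewritten as insertelement/extractelement operations on an SSA value
/// of type <4 x float>, allowing the whole alloca to be promoted.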
171class SROA {
172 LLVMContext *const C;
173 DomTreeUpdater *const DTU;
174 AssumptionCache *const AC;
175 const bool PreserveCFG;
176
177 /// Worklist of alloca instructions to simplify.
178 ///
179 /// Each alloca in the function is added to this. Each new alloca formed gets
180 /// added to it as well to recursively simplify unless that alloca can be
181 /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if
  /// not already present to ensure it is re-visited.
184 SmallSetVector<AllocaInst *, 16> Worklist;
185
186 /// A collection of instructions to delete.
187 /// We try to batch deletions to simplify code and make things a bit more
  /// efficient. We also make sure there are no dangling pointers.
189 SmallVector<WeakVH, 8> DeadInsts;
190
191 /// Post-promotion worklist.
192 ///
193 /// Sometimes we discover an alloca which has a high probability of becoming
194 /// viable for SROA after a round of promotion takes place. In those cases,
195 /// the alloca is enqueued here for re-processing.
196 ///
197 /// Note that we have to be very careful to clear allocas out of this list in
198 /// the event they are deleted.
199 SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;
200
201 /// A collection of alloca instructions we can directly promote.
202 SetVector<AllocaInst *, SmallVector<AllocaInst *>,
203 SmallPtrSet<AllocaInst *, 16>, 16>
204 PromotableAllocas;
205
206 /// A worklist of PHIs to speculate prior to promoting allocas.
207 ///
208 /// All of these PHIs have been checked for the safety of speculation and by
209 /// being speculated will allow promoting allocas currently in the promotable
210 /// queue.
211 SmallSetVector<PHINode *, 8> SpeculatablePHIs;
212
213 /// A worklist of select instructions to rewrite prior to promoting
214 /// allocas.
215 SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;
216
217 /// Select instructions that use an alloca and are subsequently loaded can be
  /// rewritten to load both input pointers and then select between the results,
219 /// allowing the load of the alloca to be promoted.
220 /// From this:
221 /// %P2 = select i1 %cond, ptr %Alloca, ptr %Other
222 /// %V = load <type>, ptr %P2
223 /// to:
224 /// %V1 = load <type>, ptr %Alloca -> will be mem2reg'd
225 /// %V2 = load <type>, ptr %Other
226 /// %V = select i1 %cond, <type> %V1, <type> %V2
227 ///
  /// We can do this to a select if its only uses are loads, and if each
  /// operand of the select either can be loaded unconditionally or we are
  /// allowed to perform CFG modifications to predicate the load.
  /// If we find an intervening bitcast whose single use is the load, the
  /// promotion is still allowed.
233 static std::optional<RewriteableMemOps>
234 isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);
235
236public:
237 SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
238 SROAOptions PreserveCFG_)
239 : C(C), DTU(DTU), AC(AC),
240 PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}
241
242 /// Main run method used by both the SROAPass and by the legacy pass.
243 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);
244
245private:
246 friend class AllocaSliceRewriter;
247
248 bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
249 AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
250 bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
251 bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
252 std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
253 void clobberUse(Use &U);
254 bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
255 bool promoteAllocas();
256};
257
258} // end anonymous namespace
259
260/// Calculate the fragment of a variable to use when slicing a store
261/// based on the slice dimensions, existing fragment, and base storage
262/// fragment.
263/// Results:
264/// UseFrag - Use Target as the new fragment.
265/// UseNoFrag - The new slice already covers the whole variable.
266/// Skip - The new alloca slice doesn't include this variable.
267/// FIXME: Can we use calculateFragmentIntersect instead?
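/// Hypothetical example: for a 64-bit variable where neither the base storage
/// nor the existing expression carries a fragment, a new slice covering bits
/// [32, 64) produces the target fragment (offset 32, size 32) and returns
/// UseFrag, whereas a slice covering bits [0, 64) spans the whole variable and
/// returns UseNoFrag.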
268namespace {
269enum FragCalcResult { UseFrag, UseNoFrag, Skip };
270}
271static FragCalcResult
272calculateFragment(DILocalVariable *Variable,
273 uint64_t NewStorageSliceOffsetInBits,
274 uint64_t NewStorageSliceSizeInBits,
275 std::optional<DIExpression::FragmentInfo> StorageFragment,
276 std::optional<DIExpression::FragmentInfo> CurrentFragment,
277 DIExpression::FragmentInfo &Target) {
  // If the base storage describes part of the variable, apply the offset and
279 // the size constraint.
280 if (StorageFragment) {
281 Target.SizeInBits =
282 std::min(a: NewStorageSliceSizeInBits, b: StorageFragment->SizeInBits);
283 Target.OffsetInBits =
284 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
285 } else {
286 Target.SizeInBits = NewStorageSliceSizeInBits;
287 Target.OffsetInBits = NewStorageSliceOffsetInBits;
288 }
289
290 // If this slice extracts the entirety of an independent variable from a
291 // larger alloca, do not produce a fragment expression, as the variable is
292 // not fragmented.
293 if (!CurrentFragment) {
294 if (auto Size = Variable->getSizeInBits()) {
295 // Treat the current fragment as covering the whole variable.
296 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
297 if (Target == CurrentFragment)
298 return UseNoFrag;
299 }
300 }
301
302 // No additional work to do if there isn't a fragment already, or there is
303 // but it already exactly describes the new assignment.
304 if (!CurrentFragment || *CurrentFragment == Target)
305 return UseFrag;
306
307 // Reject the target fragment if it doesn't fit wholly within the current
308 // fragment. TODO: We could instead chop up the target to fit in the case of
309 // a partial overlap.
310 if (Target.startInBits() < CurrentFragment->startInBits() ||
311 Target.endInBits() > CurrentFragment->endInBits())
312 return Skip;
313
314 // Target fits within the current fragment, return it.
315 return UseFrag;
316}
317
318static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) {
319 return DebugVariable(DVI->getVariable(), std::nullopt,
320 DVI->getDebugLoc().getInlinedAt());
321}
322static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
323 return DebugVariable(DVR->getVariable(), std::nullopt,
324 DVR->getDebugLoc().getInlinedAt());
325}
326
327/// Helpers for handling new and old debug info modes in migrateDebugInfo.
328/// These overloads unwrap a DbgInstPtr {Instruction* | DbgRecord*} union based
329/// on the \p Unused parameter type.
330DbgVariableRecord *UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord *Unused) {
331 (void)Unused;
332 return static_cast<DbgVariableRecord *>(cast<DbgRecord *>(Val&: P));
333}
334DbgAssignIntrinsic *UnwrapDbgInstPtr(DbgInstPtr P, DbgAssignIntrinsic *Unused) {
335 (void)Unused;
336 return static_cast<DbgAssignIntrinsic *>(cast<Instruction *>(Val&: P));
337}
338
339/// Find linked dbg.assign and generate a new one with the correct
/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr, the
/// value component is copied from the old dbg.assign to the new one.
342/// \param OldAlloca Alloca for the variable before splitting.
343/// \param IsSplit True if the store (not necessarily alloca)
344/// is being split.
345/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
346/// \param SliceSizeInBits New number of bits being written to.
347/// \param OldInst Instruction that is being split.
348/// \param Inst New instruction performing this part of the
349/// split store.
350/// \param Dest Store destination.
351/// \param Value Stored value.
352/// \param DL Datalayout.
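/// For example, if a 128-bit store to OldAlloca is split into two 64-bit
/// stores, each new store receives its own dbg.assign (and distinct
/// DIAssignID) whose fragment, computed via calculateFragment above, covers
/// only the 64 bits that the new store writes.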
353static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
354 uint64_t OldAllocaOffsetInBits,
355 uint64_t SliceSizeInBits, Instruction *OldInst,
356 Instruction *Inst, Value *Dest, Value *Value,
357 const DataLayout &DL) {
358 auto MarkerRange = at::getAssignmentMarkers(Inst: OldInst);
359 auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(Inst: OldInst);
360 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
361 if (MarkerRange.empty() && DVRAssignMarkerRange.empty())
362 return;
363
364 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
365 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
366 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
367 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
368 << "\n");
369 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
370 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
371 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
372 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
373 if (Value)
374 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
375
376 /// Map of aggregate variables to their fragment associated with OldAlloca.
377 DenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
378 BaseFragments;
379 for (auto *DAI : at::getAssignmentMarkers(Inst: OldAlloca))
380 BaseFragments[getAggregateVariable(DVI: DAI)] =
381 DAI->getExpression()->getFragmentInfo();
382 for (auto *DVR : at::getDVRAssignmentMarkers(Inst: OldAlloca))
383 BaseFragments[getAggregateVariable(DVR)] =
384 DVR->getExpression()->getFragmentInfo();
385
386 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
387 // one). It shouldn't already have one: assert this assumption.
388 assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
389 DIAssignID *NewID = nullptr;
390 auto &Ctx = Inst->getContext();
391 DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
392 assert(OldAlloca->isStaticAlloca());
393
394 auto MigrateDbgAssign = [&](auto *DbgAssign) {
395 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
396 << "\n");
397 auto *Expr = DbgAssign->getExpression();
398 bool SetKillLocation = false;
399
400 if (IsSplit) {
401 std::optional<DIExpression::FragmentInfo> BaseFragment;
402 {
403 auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
404 if (R == BaseFragments.end())
405 return;
406 BaseFragment = R->second;
407 }
408 std::optional<DIExpression::FragmentInfo> CurrentFragment =
409 Expr->getFragmentInfo();
410 DIExpression::FragmentInfo NewFragment;
411 FragCalcResult Result = calculateFragment(
412 DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
413 BaseFragment, CurrentFragment, NewFragment);
414
415 if (Result == Skip)
416 return;
417 if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
418 if (CurrentFragment) {
419 // Rewrite NewFragment to be relative to the existing one (this is
420 // what createFragmentExpression wants). CalculateFragment has
421 // already resolved the size for us. FIXME: Should it return the
422 // relative fragment too?
423 NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
424 }
425 // Add the new fragment info to the existing expression if possible.
426 if (auto E = DIExpression::createFragmentExpression(
427 Expr, OffsetInBits: NewFragment.OffsetInBits, SizeInBits: NewFragment.SizeInBits)) {
428 Expr = *E;
429 } else {
430 // Otherwise, add the new fragment info to an empty expression and
431 // discard the value component of this dbg.assign as the value cannot
432 // be computed with the new fragment.
433 Expr = *DIExpression::createFragmentExpression(
434 Expr: DIExpression::get(Context&: Expr->getContext(), Elements: {}),
435 OffsetInBits: NewFragment.OffsetInBits, SizeInBits: NewFragment.SizeInBits);
436 SetKillLocation = true;
437 }
438 }
439 }
440
441 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
442 if (!NewID) {
443 NewID = DIAssignID::getDistinct(Context&: Ctx);
444 Inst->setMetadata(KindID: LLVMContext::MD_DIAssignID, Node: NewID);
445 }
446
447 ::Value *NewValue = Value ? Value : DbgAssign->getValue();
448 auto *NewAssign = UnwrapDbgInstPtr(
449 DIB.insertDbgAssign(LinkedInstr: Inst, Val: NewValue, SrcVar: DbgAssign->getVariable(), ValExpr: Expr,
450 Addr: Dest, AddrExpr: DIExpression::get(Context&: Expr->getContext(), Elements: {}),
451 DL: DbgAssign->getDebugLoc()),
452 DbgAssign);
453
454 // If we've updated the value but the original dbg.assign has an arglist
455 // then kill it now - we can't use the requested new value.
456 // We can't replace the DIArgList with the new value as it'd leave
457 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
458 // an arglist). And we can't keep the DIArgList in case the linked store
459 // is being split - in which case the DIArgList + expression may no longer
460 // be computing the correct value.
461 // This should be a very rare situation as it requires the value being
462 // stored to differ from the dbg.assign (i.e., the value has been
463 // represented differently in the debug intrinsic for some reason).
464 SetKillLocation |=
465 Value && (DbgAssign->hasArgList() ||
466 !DbgAssign->getExpression()->isSingleLocationExpression());
467 if (SetKillLocation)
468 NewAssign->setKillLocation();
469
470 // We could use more precision here at the cost of some additional (code)
471 // complexity - if the original dbg.assign was adjacent to its store, we
472 // could position this new dbg.assign adjacent to its store rather than the
    // old dbg.assign. That would result in interleaved dbg.assigns rather than
474 // what we get now:
475 // split store !1
476 // split store !2
477 // dbg.assign !1
478 // dbg.assign !2
    // This (current behaviour) results in debug assignments being noted as
    // slightly offset (in code) from the store. In practice this
481 // should have little effect on the debugging experience due to the fact
482 // that all the split stores should get the same line number.
483 NewAssign->moveBefore(DbgAssign->getIterator());
484
485 NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
486 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
487 };
488
489 for_each(Range&: MarkerRange, F: MigrateDbgAssign);
490 for_each(Range&: DVRAssignMarkerRange, F: MigrateDbgAssign);
491}
492
493namespace {
494
495/// A custom IRBuilder inserter which prefixes all names, but only in
496/// Assert builds.
497class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
498 std::string Prefix;
499
500 Twine getNameWithPrefix(const Twine &Name) const {
501 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
502 }
503
504public:
505 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
506
507 void InsertHelper(Instruction *I, const Twine &Name,
508 BasicBlock::iterator InsertPt) const override {
509 IRBuilderDefaultInserter::InsertHelper(I, Name: getNameWithPrefix(Name),
510 InsertPt);
511 }
512};
513
514/// Provide a type for IRBuilder that drops names in release builds.
515using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
516
517/// A used slice of an alloca.
518///
519/// This structure represents a slice of an alloca used by some instruction. It
520/// stores both the begin and end offsets of this use, a pointer to the use
521/// itself, and a flag indicating whether we can classify the use as splittable
522/// or not when forming partitions of the alloca.
523class Slice {
524 /// The beginning offset of the range.
525 uint64_t BeginOffset = 0;
526
527 /// The ending offset, not included in the range.
528 uint64_t EndOffset = 0;
529
530 /// Storage for both the use of this slice and whether it can be
531 /// split.
532 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
533
534public:
535 Slice() = default;
536
537 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
538 : BeginOffset(BeginOffset), EndOffset(EndOffset),
539 UseAndIsSplittable(U, IsSplittable) {}
540
541 uint64_t beginOffset() const { return BeginOffset; }
542 uint64_t endOffset() const { return EndOffset; }
543
544 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
545 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
546
547 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
548
549 bool isDead() const { return getUse() == nullptr; }
550 void kill() { UseAndIsSplittable.setPointer(nullptr); }
551
552 /// Support for ordering ranges.
553 ///
554 /// This provides an ordering over ranges such that start offsets are
555 /// always increasing, and within equal start offsets, the end offsets are
556 /// decreasing. Thus the spanning range comes first in a cluster with the
557 /// same start position.
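  /// For example (assuming equal splittability), slices [0,16), [0,8) and
  /// [8,12) are ordered as [0,16), [0,8), [8,12): the slice spanning a cluster
  /// of equal start offsets sorts first.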
558 bool operator<(const Slice &RHS) const {
559 if (beginOffset() < RHS.beginOffset())
560 return true;
561 if (beginOffset() > RHS.beginOffset())
562 return false;
563 if (isSplittable() != RHS.isSplittable())
564 return !isSplittable();
565 if (endOffset() > RHS.endOffset())
566 return true;
567 return false;
568 }
569
570 /// Support comparison with a single offset to allow binary searches.
571 friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
572 uint64_t RHSOffset) {
573 return LHS.beginOffset() < RHSOffset;
574 }
575 friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
576 const Slice &RHS) {
577 return LHSOffset < RHS.beginOffset();
578 }
579
580 bool operator==(const Slice &RHS) const {
581 return isSplittable() == RHS.isSplittable() &&
582 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
583 }
584 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
585};
586
587/// Representation of the alloca slices.
588///
589/// This class represents the slices of an alloca which are formed by its
590/// various uses. If a pointer escapes, we can't fully build a representation
591/// for the slices used and we reflect that in this structure. The uses are
592/// stored, sorted by increasing beginning offset and with unsplittable slices
593/// starting at a particular offset before splittable slices.
594class AllocaSlices {
595public:
596 /// Construct the slices of a particular alloca.
597 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
598
599 /// Test whether a pointer to the allocation escapes our analysis.
600 ///
601 /// If this is true, the slices are never fully built and should be
602 /// ignored.
603 bool isEscaped() const { return PointerEscapingInstr; }
604 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
605
606 /// Support for iterating over the slices.
607 /// @{
608 using iterator = SmallVectorImpl<Slice>::iterator;
609 using range = iterator_range<iterator>;
610
611 iterator begin() { return Slices.begin(); }
612 iterator end() { return Slices.end(); }
613
614 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
615 using const_range = iterator_range<const_iterator>;
616
617 const_iterator begin() const { return Slices.begin(); }
618 const_iterator end() const { return Slices.end(); }
619 /// @}
620
621 /// Erase a range of slices.
622 void erase(iterator Start, iterator Stop) { Slices.erase(CS: Start, CE: Stop); }
623
624 /// Insert new slices for this alloca.
625 ///
626 /// This moves the slices into the alloca's slices collection, and re-sorts
627 /// everything so that the usual ordering properties of the alloca's slices
628 /// hold.
629 void insert(ArrayRef<Slice> NewSlices) {
630 int OldSize = Slices.size();
631 Slices.append(in_start: NewSlices.begin(), in_end: NewSlices.end());
632 auto SliceI = Slices.begin() + OldSize;
633 std::stable_sort(first: SliceI, last: Slices.end());
634 std::inplace_merge(first: Slices.begin(), middle: SliceI, last: Slices.end());
635 }
636
637 // Forward declare the iterator and range accessor for walking the
638 // partitions.
639 class partition_iterator;
640 iterator_range<partition_iterator> partitions();
641
642 /// Access the dead users for this alloca.
643 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
644
645 /// Access Uses that should be dropped if the alloca is promotable.
646 ArrayRef<Use *> getDeadUsesIfPromotable() const {
647 return DeadUseIfPromotable;
648 }
649
650 /// Access the dead operands referring to this alloca.
651 ///
  /// These are operands which cannot actually be used to refer to the
653 /// alloca as they are outside its range and the user doesn't correct for
654 /// that. These mostly consist of PHI node inputs and the like which we just
655 /// need to replace with undef.
656 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
657
658#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
659 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
660 void printSlice(raw_ostream &OS, const_iterator I,
661 StringRef Indent = " ") const;
662 void printUse(raw_ostream &OS, const_iterator I,
663 StringRef Indent = " ") const;
664 void print(raw_ostream &OS) const;
665 void dump(const_iterator I) const;
666 void dump() const;
667#endif
668
669private:
670 template <typename DerivedT, typename RetT = void> class BuilderBase;
671 class SliceBuilder;
672
673 friend class AllocaSlices::SliceBuilder;
674
675#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
676 /// Handle to alloca instruction to simplify method interfaces.
677 AllocaInst &AI;
678#endif
679
680 /// The instruction responsible for this alloca not having a known set
681 /// of slices.
682 ///
683 /// When an instruction (potentially) escapes the pointer to the alloca, we
684 /// store a pointer to that here and abort trying to form slices of the
685 /// alloca. This will be null if the alloca slices are analyzed successfully.
686 Instruction *PointerEscapingInstr;
687 Instruction *PointerEscapingInstrReadOnly;
688
689 /// The slices of the alloca.
690 ///
691 /// We store a vector of the slices formed by uses of the alloca here. This
692 /// vector is sorted by increasing begin offset, and then the unsplittable
693 /// slices before the splittable ones. See the Slice inner class for more
694 /// details.
695 SmallVector<Slice, 8> Slices;
696
697 /// Instructions which will become dead if we rewrite the alloca.
698 ///
699 /// Note that these are not separated by slice. This is because we expect an
700 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
701 /// all these instructions can simply be removed and replaced with poison as
702 /// they come from outside of the allocated space.
703 SmallVector<Instruction *, 8> DeadUsers;
704
  /// Uses which will become dead if we can promote the alloca.
706 SmallVector<Use *, 8> DeadUseIfPromotable;
707
708 /// Operands which will become dead if we rewrite the alloca.
709 ///
710 /// These are operands that in their particular use can be replaced with
711 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
712 /// to PHI nodes and the like. They aren't entirely dead (there might be
  /// a GEP back into the bounds using it elsewhere) and neither is the PHI,
  /// but we want to swap this particular input for poison to simplify the use
  /// lists of the alloca.
716 SmallVector<Use *, 8> DeadOperands;
717};
718
719/// A partition of the slices.
720///
721/// An ephemeral representation for a range of slices which can be viewed as
722/// a partition of the alloca. This range represents a span of the alloca's
723/// memory which cannot be split, and provides access to all of the slices
724/// overlapping some part of the partition.
725///
726/// Objects of this type are produced by traversing the alloca's slices, but
727/// are only ephemeral and not persistent.
728class Partition {
729private:
730 friend class AllocaSlices;
731 friend class AllocaSlices::partition_iterator;
732
733 using iterator = AllocaSlices::iterator;
734
735 /// The beginning and ending offsets of the alloca for this
736 /// partition.
737 uint64_t BeginOffset = 0, EndOffset = 0;
738
739 /// The start and end iterators of this partition.
740 iterator SI, SJ;
741
742 /// A collection of split slice tails overlapping the partition.
743 SmallVector<Slice *, 4> SplitTails;
744
745 /// Raw constructor builds an empty partition starting and ending at
746 /// the given iterator.
747 Partition(iterator SI) : SI(SI), SJ(SI) {}
748
749public:
750 /// The start offset of this partition.
751 ///
752 /// All of the contained slices start at or after this offset.
753 uint64_t beginOffset() const { return BeginOffset; }
754
755 /// The end offset of this partition.
756 ///
757 /// All of the contained slices end at or before this offset.
758 uint64_t endOffset() const { return EndOffset; }
759
760 /// The size of the partition.
761 ///
762 /// Note that this can never be zero.
763 uint64_t size() const {
764 assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
765 return EndOffset - BeginOffset;
766 }
767
768 /// Test whether this partition contains no slices, and merely spans
769 /// a region occupied by split slices.
770 bool empty() const { return SI == SJ; }
771
772 /// \name Iterate slices that start within the partition.
773 /// These may be splittable or unsplittable. They have a begin offset >= the
774 /// partition begin offset.
775 /// @{
776 // FIXME: We should probably define a "concat_iterator" helper and use that
777 // to stitch together pointee_iterators over the split tails and the
778 // contiguous iterators of the partition. That would give a much nicer
779 // interface here. We could then additionally expose filtered iterators for
  // split, unsplit, and unsplittable slices based on the usage patterns.
781 iterator begin() const { return SI; }
782 iterator end() const { return SJ; }
783 /// @}
784
785 /// Get the sequence of split slice tails.
786 ///
787 /// These tails are of slices which start before this partition but are
788 /// split and overlap into the partition. We accumulate these while forming
789 /// partitions.
790 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
791};
792
793} // end anonymous namespace
794
795/// An iterator over partitions of the alloca's slices.
796///
797/// This iterator implements the core algorithm for partitioning the alloca's
798/// slices. It is a forward iterator as we don't support backtracking for
799/// efficiency reasons, and re-use a single storage area to maintain the
800/// current set of split slices.
801///
802/// It is templated on the slice iterator type to use so that it can operate
803/// with either const or non-const slice iterators.
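/// As a hypothetical illustration: for slices [0,8) (unsplittable), [0,16)
/// (splittable) and [12,16) (unsplittable), the partitions visited are [0,8),
/// then the empty partition [8,12) covered only by the split tail of [0,16),
/// and finally [12,16).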
804class AllocaSlices::partition_iterator
805 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
806 Partition> {
807 friend class AllocaSlices;
808
809 /// Most of the state for walking the partitions is held in a class
810 /// with a nice interface for examining them.
811 Partition P;
812
813 /// We need to keep the end of the slices to know when to stop.
814 AllocaSlices::iterator SE;
815
816 /// We also need to keep track of the maximum split end offset seen.
817 /// FIXME: Do we really?
818 uint64_t MaxSplitSliceEndOffset = 0;
819
  /// Sets the partition to be empty at the given iterator, and sets the
821 /// end iterator.
822 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
823 : P(SI), SE(SE) {
824 // If not already at the end, advance our state to form the initial
825 // partition.
826 if (SI != SE)
827 advance();
828 }
829
830 /// Advance the iterator to the next partition.
831 ///
832 /// Requires that the iterator not be at the end of the slices.
833 void advance() {
834 assert((P.SI != SE || !P.SplitTails.empty()) &&
835 "Cannot advance past the end of the slices!");
836
837 // Clear out any split uses which have ended.
838 if (!P.SplitTails.empty()) {
839 if (P.EndOffset >= MaxSplitSliceEndOffset) {
840 // If we've finished all splits, this is easy.
841 P.SplitTails.clear();
842 MaxSplitSliceEndOffset = 0;
843 } else {
844 // Remove the uses which have ended in the prior partition. This
845 // cannot change the max split slice end because we just checked that
846 // the prior partition ended prior to that max.
847 llvm::erase_if(C&: P.SplitTails,
848 P: [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
849 assert(llvm::any_of(P.SplitTails,
850 [&](Slice *S) {
851 return S->endOffset() == MaxSplitSliceEndOffset;
852 }) &&
853 "Could not find the current max split slice offset!");
854 assert(llvm::all_of(P.SplitTails,
855 [&](Slice *S) {
856 return S->endOffset() <= MaxSplitSliceEndOffset;
857 }) &&
858 "Max split slice end offset is not actually the max!");
859 }
860 }
861
862 // If P.SI is already at the end, then we've cleared the split tail and
863 // now have an end iterator.
864 if (P.SI == SE) {
865 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
866 return;
867 }
868
869 // If we had a non-empty partition previously, set up the state for
870 // subsequent partitions.
871 if (P.SI != P.SJ) {
872 // Accumulate all the splittable slices which started in the old
873 // partition into the split list.
874 for (Slice &S : P)
875 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
876 P.SplitTails.push_back(Elt: &S);
877 MaxSplitSliceEndOffset =
878 std::max(a: S.endOffset(), b: MaxSplitSliceEndOffset);
879 }
880
881 // Start from the end of the previous partition.
882 P.SI = P.SJ;
883
884 // If P.SI is now at the end, we at most have a tail of split slices.
885 if (P.SI == SE) {
886 P.BeginOffset = P.EndOffset;
887 P.EndOffset = MaxSplitSliceEndOffset;
888 return;
889 }
890
      // If we have split slices and the next slice is after a gap and is
      // not splittable, immediately form an empty partition for the split
      // slices up until the next slice begins.
894 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
895 !P.SI->isSplittable()) {
896 P.BeginOffset = P.EndOffset;
897 P.EndOffset = P.SI->beginOffset();
898 return;
899 }
900 }
901
902 // OK, we need to consume new slices. Set the end offset based on the
903 // current slice, and step SJ past it. The beginning offset of the
904 // partition is the beginning offset of the next slice unless we have
905 // pre-existing split slices that are continuing, in which case we begin
906 // at the prior end offset.
907 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
908 P.EndOffset = P.SI->endOffset();
909 ++P.SJ;
910
911 // There are two strategies to form a partition based on whether the
912 // partition starts with an unsplittable slice or a splittable slice.
913 if (!P.SI->isSplittable()) {
914 // When we're forming an unsplittable region, it must always start at
915 // the first slice and will extend through its end.
916 assert(P.BeginOffset == P.SI->beginOffset());
917
918 // Form a partition including all of the overlapping slices with this
919 // unsplittable slice.
920 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
921 if (!P.SJ->isSplittable())
922 P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
923 ++P.SJ;
924 }
925
926 // We have a partition across a set of overlapping unsplittable
927 // partitions.
928 return;
929 }
930
931 // If we're starting with a splittable slice, then we need to form
932 // a synthetic partition spanning it and any other overlapping splittable
    // slices.
934 assert(P.SI->isSplittable() && "Forming a splittable partition!");
935
936 // Collect all of the overlapping splittable slices.
937 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
938 P.SJ->isSplittable()) {
939 P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
940 ++P.SJ;
941 }
942
    // Back up P.EndOffset if we ended the span early when encountering an
944 // unsplittable slice. This synthesizes the early end offset of
945 // a partition spanning only splittable slices.
946 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
947 assert(!P.SJ->isSplittable());
948 P.EndOffset = P.SJ->beginOffset();
949 }
950 }
951
952public:
953 bool operator==(const partition_iterator &RHS) const {
954 assert(SE == RHS.SE &&
955 "End iterators don't match between compared partition iterators!");
956
    // The observed positions of partitions are marked by the P.SI iterator and
958 // the emptiness of the split slices. The latter is only relevant when
959 // P.SI == SE, as the end iterator will additionally have an empty split
960 // slices list, but the prior may have the same P.SI and a tail of split
961 // slices.
962 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
963 assert(P.SJ == RHS.P.SJ &&
964 "Same set of slices formed two different sized partitions!");
965 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
966 "Same slice position with differently sized non-empty split "
967 "slice tails!");
968 return true;
969 }
970 return false;
971 }
972
973 partition_iterator &operator++() {
974 advance();
975 return *this;
976 }
977
978 Partition &operator*() { return P; }
979};
980
981/// A forward range over the partitions of the alloca's slices.
982///
983/// This accesses an iterator range over the partitions of the alloca's
984/// slices. It computes these partitions on the fly based on the overlapping
985/// offsets of the slices and the ability to split them. It will visit "empty"
986/// partitions to cover regions of the alloca only accessed via split
987/// slices.
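/// A minimal usage sketch, where AS is an AllocaSlices:
///   for (Partition &P : AS.partitions())
///     ... examine P.beginOffset(), P.endOffset(), P.splitSliceTails() ...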
988iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
989 return make_range(x: partition_iterator(begin(), end()),
990 y: partition_iterator(end(), end()));
991}
992
993static Value *foldSelectInst(SelectInst &SI) {
994 // If the condition being selected on is a constant or the same value is
995 // being selected between, fold the select. Yes this does (rarely) happen
996 // early on.
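  // For example (hypothetical IR), "select i1 true, ptr %a, ptr %b" folds to
  // %a, as does "select i1 %c, ptr %a, ptr %a".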
997 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: SI.getCondition()))
998 return SI.getOperand(i_nocapture: 1 + CI->isZero());
999 if (SI.getOperand(i_nocapture: 1) == SI.getOperand(i_nocapture: 2))
1000 return SI.getOperand(i_nocapture: 1);
1001
1002 return nullptr;
1003}
1004
1005/// A helper that folds a PHI node or a select.
1006static Value *foldPHINodeOrSelectInst(Instruction &I) {
1007 if (PHINode *PN = dyn_cast<PHINode>(Val: &I)) {
1008 // If PN merges together the same value, return that value.
1009 return PN->hasConstantValue();
1010 }
1011 return foldSelectInst(SI&: cast<SelectInst>(Val&: I));
1012}
1013
1014/// Builder for the alloca slices.
1015///
1016/// This class builds a set of alloca slices by recursively visiting the uses
1017/// of an alloca and making a slice for each load and store at each offset.
1018class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1019 friend class PtrUseVisitor<SliceBuilder>;
1020 friend class InstVisitor<SliceBuilder>;
1021
1022 using Base = PtrUseVisitor<SliceBuilder>;
1023
1024 const uint64_t AllocSize;
1025 AllocaSlices &AS;
1026
1027 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1028 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1029
1030 /// Set to de-duplicate dead instructions found in the use walk.
1031 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1032
1033public:
1034 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1035 : PtrUseVisitor<SliceBuilder>(DL),
1036 AllocSize(DL.getTypeAllocSize(Ty: AI.getAllocatedType()).getFixedValue()),
1037 AS(AS) {}
1038
1039private:
1040 void markAsDead(Instruction &I) {
1041 if (VisitedDeadInsts.insert(Ptr: &I).second)
1042 AS.DeadUsers.push_back(Elt: &I);
1043 }
1044
1045 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1046 bool IsSplittable = false) {
1047 // Completely skip uses which have a zero size or start either before or
1048 // past the end of the allocation.
1049 if (Size == 0 || Offset.uge(RHS: AllocSize)) {
1050 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1051 << Offset
1052 << " which has zero size or starts outside of the "
1053 << AllocSize << " byte alloca:\n"
1054 << " alloca: " << AS.AI << "\n"
1055 << " use: " << I << "\n");
1056 return markAsDead(I);
1057 }
1058
1059 uint64_t BeginOffset = Offset.getZExtValue();
1060 uint64_t EndOffset = BeginOffset + Size;
1061
1062 // Clamp the end offset to the end of the allocation. Note that this is
1063 // formulated to handle even the case where "BeginOffset + Size" overflows.
1064 // This may appear superficially to be something we could ignore entirely,
1065 // but that is not so! There may be widened loads or PHI-node uses where
1066 // some instructions are dead but not others. We can't completely ignore
1067 // them, and so have to record at least the information here.
1068 assert(AllocSize >= BeginOffset); // Established above.
1069 if (Size > AllocSize - BeginOffset) {
1070 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1071 << Offset << " to remain within the " << AllocSize
1072 << " byte alloca:\n"
1073 << " alloca: " << AS.AI << "\n"
1074 << " use: " << I << "\n");
1075 EndOffset = AllocSize;
1076 }
1077
1078 AS.Slices.push_back(Elt: Slice(BeginOffset, EndOffset, U, IsSplittable));
1079 }
1080
1081 void visitBitCastInst(BitCastInst &BC) {
1082 if (BC.use_empty())
1083 return markAsDead(I&: BC);
1084
1085 return Base::visitBitCastInst(BC);
1086 }
1087
1088 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1089 if (ASC.use_empty())
1090 return markAsDead(I&: ASC);
1091
1092 return Base::visitAddrSpaceCastInst(ASC);
1093 }
1094
1095 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1096 if (GEPI.use_empty())
1097 return markAsDead(I&: GEPI);
1098
1099 return Base::visitGetElementPtrInst(GEPI);
1100 }
1101
1102 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1103 uint64_t Size, bool IsVolatile) {
1104 // We allow splitting of non-volatile loads and stores where the type is an
1105 // integer type. These may be used to implement 'memcpy' or other "transfer
1106 // of bits" patterns.
1107 bool IsSplittable =
1108 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1109
1110 insertUse(I, Offset, Size, IsSplittable);
1111 }
1112
1113 void visitLoadInst(LoadInst &LI) {
1114 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1115 "All simple FCA loads should have been pre-split");
1116
1117 // If there is a load with an unknown offset, we can still perform store
1118 // to load forwarding for other known-offset loads.
1119 if (!IsOffsetKnown)
1120 return PI.setEscapedReadOnly(&LI);
1121
1122 TypeSize Size = DL.getTypeStoreSize(Ty: LI.getType());
1123 if (Size.isScalable()) {
1124 unsigned VScale = LI.getFunction()->getVScaleValue();
1125 if (!VScale)
1126 return PI.setAborted(&LI);
1127
1128 Size = TypeSize::getFixed(ExactSize: Size.getKnownMinValue() * VScale);
1129 }
1130
1131 return handleLoadOrStore(Ty: LI.getType(), I&: LI, Offset, Size: Size.getFixedValue(),
1132 IsVolatile: LI.isVolatile());
1133 }
1134
1135 void visitStoreInst(StoreInst &SI) {
1136 Value *ValOp = SI.getValueOperand();
1137 if (ValOp == *U)
1138 return PI.setEscapedAndAborted(&SI);
1139 if (!IsOffsetKnown)
1140 return PI.setAborted(&SI);
1141
1142 TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValOp->getType());
1143 if (StoreSize.isScalable()) {
1144 unsigned VScale = SI.getFunction()->getVScaleValue();
1145 if (!VScale)
1146 return PI.setAborted(&SI);
1147
1148 StoreSize = TypeSize::getFixed(ExactSize: StoreSize.getKnownMinValue() * VScale);
1149 }
1150
1151 uint64_t Size = StoreSize.getFixedValue();
1152
1153 // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
1155 // ignore it. Note that this is more strict than the generic clamping
1156 // behavior of insertUse. We also try to handle cases which might run the
1157 // risk of overflow.
1158 // FIXME: We should instead consider the pointer to have escaped if this
1159 // function is being instrumented for addressing bugs or race conditions.
1160 if (Size > AllocSize || Offset.ugt(RHS: AllocSize - Size)) {
1161 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1162 << Offset << " which extends past the end of the "
1163 << AllocSize << " byte alloca:\n"
1164 << " alloca: " << AS.AI << "\n"
1165 << " use: " << SI << "\n");
1166 return markAsDead(I&: SI);
1167 }
1168
1169 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1170 "All simple FCA stores should have been pre-split");
1171 handleLoadOrStore(Ty: ValOp->getType(), I&: SI, Offset, Size, IsVolatile: SI.isVolatile());
1172 }
1173
1174 void visitMemSetInst(MemSetInst &II) {
1175 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1176 ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1177 if ((Length && Length->getValue() == 0) ||
1178 (IsOffsetKnown && Offset.uge(RHS: AllocSize)))
1179 // Zero-length mem transfer intrinsics can be ignored entirely.
1180 return markAsDead(I&: II);
1181
1182 if (!IsOffsetKnown)
1183 return PI.setAborted(&II);
1184
1185 insertUse(I&: II, Offset,
1186 Size: Length ? Length->getLimitedValue()
1187 : AllocSize - Offset.getLimitedValue(),
1188 IsSplittable: (bool)Length);
1189 }
1190
1191 void visitMemTransferInst(MemTransferInst &II) {
1192 ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1193 if (Length && Length->getValue() == 0)
1194 // Zero-length mem transfer intrinsics can be ignored entirely.
1195 return markAsDead(I&: II);
1196
1197 // Because we can visit these intrinsics twice, also check to see if the
1198 // first time marked this instruction as dead. If so, skip it.
1199 if (VisitedDeadInsts.count(Ptr: &II))
1200 return;
1201
1202 if (!IsOffsetKnown)
1203 return PI.setAborted(&II);
1204
1205 // This side of the transfer is completely out-of-bounds, and so we can
1206 // nuke the entire transfer. However, we also need to nuke the other side
1207 // if already added to our partitions.
1208 // FIXME: Yet another place we really should bypass this when
1209 // instrumenting for ASan.
1210 if (Offset.uge(RHS: AllocSize)) {
1211 SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
1212 MemTransferSliceMap.find(Val: &II);
1213 if (MTPI != MemTransferSliceMap.end())
1214 AS.Slices[MTPI->second].kill();
1215 return markAsDead(I&: II);
1216 }
1217
1218 uint64_t RawOffset = Offset.getLimitedValue();
1219 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1220
1221 // Check for the special case where the same exact value is used for both
1222 // source and dest.
1223 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1224 // For non-volatile transfers this is a no-op.
1225 if (!II.isVolatile())
1226 return markAsDead(I&: II);
1227
1228 return insertUse(I&: II, Offset, Size, /*IsSplittable=*/false);
1229 }
1230
1231 // If we have seen both source and destination for a mem transfer, then
1232 // they both point to the same alloca.
1233 bool Inserted;
1234 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1235 std::tie(args&: MTPI, args&: Inserted) =
1236 MemTransferSliceMap.insert(KV: std::make_pair(x: &II, y: AS.Slices.size()));
1237 unsigned PrevIdx = MTPI->second;
1238 if (!Inserted) {
1239 Slice &PrevP = AS.Slices[PrevIdx];
1240
1241 // Check if the begin offsets match and this is a non-volatile transfer.
1242 // In that case, we can completely elide the transfer.
1243 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1244 PrevP.kill();
1245 return markAsDead(I&: II);
1246 }
1247
1248 // Otherwise we have an offset transfer within the same alloca. We can't
1249 // split those.
1250 PrevP.makeUnsplittable();
1251 }
1252
1253 // Insert the use now that we've fixed up the splittable nature.
1254 insertUse(I&: II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1255
1256 // Check that we ended up with a valid index in the map.
1257 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1258 "Map index doesn't point back to a slice with this user.");
1259 }
1260
  // Disable SROA for any intrinsics except for lifetime markers and
  // invariant.group intrinsics (launder/strip).
1263 // FIXME: What about debug intrinsics? This matches old behavior, but
1264 // doesn't make sense.
1265 void visitIntrinsicInst(IntrinsicInst &II) {
1266 if (II.isDroppable()) {
1267 AS.DeadUseIfPromotable.push_back(Elt: U);
1268 return;
1269 }
1270
1271 if (!IsOffsetKnown)
1272 return PI.setAborted(&II);
1273
1274 if (II.isLifetimeStartOrEnd()) {
1275 ConstantInt *Length = cast<ConstantInt>(Val: II.getArgOperand(i: 0));
1276 uint64_t Size = std::min(a: AllocSize - Offset.getLimitedValue(),
1277 b: Length->getLimitedValue());
1278 insertUse(I&: II, Offset, Size, IsSplittable: true);
1279 return;
1280 }
1281
1282 if (II.isLaunderOrStripInvariantGroup()) {
1283 insertUse(I&: II, Offset, Size: AllocSize, IsSplittable: true);
1284 enqueueUsers(I&: II);
1285 return;
1286 }
1287
1288 Base::visitIntrinsicInst(II);
1289 }
1290
1291 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1292 // We consider any PHI or select that results in a direct load or store of
1293 // the same offset to be a viable use for slicing purposes. These uses
1294 // are considered unsplittable and the size is the maximum loaded or stored
1295 // size.
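    // For example (hypothetical IR), a select between two pointers into the
    // alloca whose only transitive uses are an i32 load and an i64 store of an
    // unrelated value yields Size = 8 and no unsafe instruction.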
1296 SmallPtrSet<Instruction *, 4> Visited;
1297 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1298 Visited.insert(Ptr: Root);
1299 Uses.push_back(Elt: std::make_pair(x: cast<Instruction>(Val&: *U), y&: Root));
1300 const DataLayout &DL = Root->getDataLayout();
1301 // If there are no loads or stores, the access is dead. We mark that as
1302 // a size zero access.
1303 Size = 0;
1304 do {
1305 Instruction *I, *UsedI;
1306 std::tie(args&: UsedI, args&: I) = Uses.pop_back_val();
1307
1308 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
1309 TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
1310 if (LoadSize.isScalable()) {
1311 PI.setAborted(LI);
1312 return nullptr;
1313 }
1314 Size = std::max(a: Size, b: LoadSize.getFixedValue());
1315 continue;
1316 }
1317 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
1318 Value *Op = SI->getOperand(i_nocapture: 0);
1319 if (Op == UsedI)
1320 return SI;
1321 TypeSize StoreSize = DL.getTypeStoreSize(Ty: Op->getType());
1322 if (StoreSize.isScalable()) {
1323 PI.setAborted(SI);
1324 return nullptr;
1325 }
1326 Size = std::max(a: Size, b: StoreSize.getFixedValue());
1327 continue;
1328 }
1329
1330 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: I)) {
1331 if (!GEP->hasAllZeroIndices())
1332 return GEP;
1333 } else if (!isa<BitCastInst>(Val: I) && !isa<PHINode>(Val: I) &&
1334 !isa<SelectInst>(Val: I) && !isa<AddrSpaceCastInst>(Val: I)) {
1335 return I;
1336 }
1337
1338 for (User *U : I->users())
1339 if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
1340 Uses.push_back(Elt: std::make_pair(x&: I, y: cast<Instruction>(Val: U)));
1341 } while (!Uses.empty());
1342
1343 return nullptr;
1344 }
1345
1346 void visitPHINodeOrSelectInst(Instruction &I) {
1347 assert(isa<PHINode>(I) || isa<SelectInst>(I));
1348 if (I.use_empty())
1349 return markAsDead(I);
1350
1351 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1352 // instructions in this BB, which may be required during rewriting. Bail out
1353 // on these cases.
1354 if (isa<PHINode>(Val: I) &&
1355 I.getParent()->getFirstInsertionPt() == I.getParent()->end())
1356 return PI.setAborted(&I);
1357
1358 // TODO: We could use simplifyInstruction here to fold PHINodes and
    // SelectInsts. However, doing so requires changing the current
1360 // dead-operand-tracking mechanism. For instance, suppose neither loading
1361 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1362 // trap either. However, if we simply replace %U with undef using the
1363 // current dead-operand-tracking mechanism, "load (select undef, undef,
1364 // %other)" may trap because the select may return the first operand
1365 // "undef".
1366 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1367 if (Result == *U)
1368 // If the result of the constant fold will be the pointer, recurse
1369 // through the PHI/select as if we had RAUW'ed it.
1370 enqueueUsers(I);
1371 else
1372 // Otherwise the operand to the PHI/select is dead, and we can replace
1373 // it with poison.
1374 AS.DeadOperands.push_back(Elt: U);
1375
1376 return;
1377 }
1378
1379 if (!IsOffsetKnown)
1380 return PI.setAborted(&I);
1381
1382 // See if we already have computed info on this node.
1383 uint64_t &Size = PHIOrSelectSizes[&I];
1384 if (!Size) {
1385 // This is a new PHI/Select, check for an unsafe use of it.
1386 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(Root: &I, Size))
1387 return PI.setAborted(UnsafeI);
1388 }
1389
1390 // For PHI and select operands outside the alloca, we can't nuke the entire
1391 // phi or select -- the other side might still be relevant, so we special
1392 // case them here and use a separate structure to track the operands
1393 // themselves which should be replaced with poison.
1394 // FIXME: This should instead be escaped in the event we're instrumenting
1395 // for address sanitization.
1396 if (Offset.uge(RHS: AllocSize)) {
1397 AS.DeadOperands.push_back(Elt: U);
1398 return;
1399 }
1400
1401 insertUse(I, Offset, Size);
1402 }
1403
1404 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(I&: PN); }
1405
1406 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(I&: SI); }
1407
1408 /// Disable SROA entirely if there are unhandled users of the alloca.
1409 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1410
1411 void visitCallBase(CallBase &CB) {
    // If the callee only reads memory through this operand and captures at
    // most its address (not the full provenance), mark the pointer as
    // EscapedReadOnly rather than escaped.
1414 if (CB.isDataOperand(U) &&
1415 !capturesFullProvenance(CC: CB.getCaptureInfo(OpNo: U->getOperandNo())) &&
1416 CB.onlyReadsMemory(OpNo: U->getOperandNo())) {
1417 PI.setEscapedReadOnly(&CB);
1418 return;
1419 }
1420
1421 Base::visitCallBase(CB);
1422 }
1423};
1424
1425AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1426 :
1427#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1428 AI(AI),
1429#endif
1430 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1431 SliceBuilder PB(DL, AI, *this);
1432 SliceBuilder::PtrInfo PtrI = PB.visitPtr(I&: AI);
1433 if (PtrI.isEscaped() || PtrI.isAborted()) {
1434 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1435 // possibly by just storing the PtrInfo in the AllocaSlices.
1436 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1437 : PtrI.getAbortingInst();
1438 assert(PointerEscapingInstr && "Did not track a bad instruction");
1439 return;
1440 }
1441 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1442
1443 llvm::erase_if(C&: Slices, P: [](const Slice &S) { return S.isDead(); });
1444
1445 // Sort the uses. This arranges for the offsets to be in ascending order,
1446 // and the sizes to be in descending order.
1447 llvm::stable_sort(Range&: Slices);
1448}
1449
1450#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1451
1452void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1453 StringRef Indent) const {
1454 printSlice(OS, I, Indent);
1455 OS << "\n";
1456 printUse(OS, I, Indent);
1457}
1458
1459void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1460 StringRef Indent) const {
1461 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1462 << " slice #" << (I - begin())
1463 << (I->isSplittable() ? " (splittable)" : "");
1464}
1465
1466void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1467 StringRef Indent) const {
1468 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1469}
1470
1471void AllocaSlices::print(raw_ostream &OS) const {
1472 if (PointerEscapingInstr) {
1473 OS << "Can't analyze slices for alloca: " << AI << "\n"
1474 << " A pointer to this alloca escaped by:\n"
1475 << " " << *PointerEscapingInstr << "\n";
1476 return;
1477 }
1478
1479 if (PointerEscapingInstrReadOnly)
1480 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1481
1482 OS << "Slices of alloca: " << AI << "\n";
1483 for (const_iterator I = begin(), E = end(); I != E; ++I)
1484 print(OS, I);
1485}
1486
1487LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1488 print(dbgs(), I);
1489}
1490LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1491
1492#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1493
1494/// Walk the range of a partitioning looking for a common type to cover this
1495/// sequence of slices.
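/// Returns the common type of those users (or null when they disagree) paired
/// with the widest viable integer type seen. For example (illustrative), a
/// partition whose whole extent is used only by i32 loads and stores yields
/// {i32, i32}, while a mix of float and i32 users yields {nullptr, i32}.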
1496static std::pair<Type *, IntegerType *>
1497findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1498 uint64_t EndOffset) {
1499 Type *Ty = nullptr;
1500 bool TyIsCommon = true;
1501 IntegerType *ITy = nullptr;
1502
1503 // Note that we need to look at *every* alloca slice's Use to ensure we
1504 // always get consistent results regardless of the order of slices.
1505 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1506 Use *U = I->getUse();
1507 if (isa<IntrinsicInst>(Val: *U->getUser()))
1508 continue;
1509 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1510 continue;
1511
1512 Type *UserTy = nullptr;
1513 if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
1514 UserTy = LI->getType();
1515 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
1516 UserTy = SI->getValueOperand()->getType();
1517 }
1518
1519 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(Val: UserTy)) {
1520 // If the type is larger than the partition, skip it. We only encounter
1521 // this for split integer operations where we want to use the type of the
1522 // entity causing the split. Also skip if the type is not a byte width
1523 // multiple.
1524 if (UserITy->getBitWidth() % 8 != 0 ||
1525 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1526 continue;
1527
1528 // Track the largest bitwidth integer type used in this way in case there
1529 // is no common type.
1530 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1531 ITy = UserITy;
1532 }
1533
1534 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1535 // depend on types skipped above.
1536 if (!UserTy || (Ty && Ty != UserTy))
1537 TyIsCommon = false; // Give up on anything but an iN type.
1538 else
1539 Ty = UserTy;
1540 }
1541
1542 return {TyIsCommon ? Ty : nullptr, ITy};
1543}
1544
1545/// PHI instructions that use an alloca and are subsequently loaded can be
1546/// rewritten to load both input pointers in the pred blocks and then PHI the
1547/// results, allowing the load of the alloca to be promoted.
1548/// From this:
1549/// %P2 = phi [i32* %Alloca, i32* %Other]
1550/// %V = load i32* %P2
1551/// to:
1552/// %V1 = load i32* %Alloca -> will be mem2reg'd
1553/// ...
1554/// %V2 = load i32* %Other
1555/// ...
1556/// %V = phi [i32 %V1, i32 %V2]
1557///
1558/// We can do this to a select if its only uses are loads and if the operands
1559/// to the select can be loaded unconditionally.
1560///
1561/// FIXME: This should be hoisted into a generic utility, likely in
1562/// Transforms/Util/Local.h
1563static bool isSafePHIToSpeculate(PHINode &PN) {
1564 const DataLayout &DL = PN.getDataLayout();
1565
1566 // For now, we can only do this promotion if the load is in the same block
1567 // as the PHI, and if there are no stores between the phi and load.
1568 // TODO: Allow recursive phi users.
1569 // TODO: Allow stores.
1570 BasicBlock *BB = PN.getParent();
1571 Align MaxAlign;
1572 uint64_t APWidth = DL.getIndexTypeSizeInBits(Ty: PN.getType());
1573 Type *LoadType = nullptr;
1574 for (User *U : PN.users()) {
1575 LoadInst *LI = dyn_cast<LoadInst>(Val: U);
1576 if (!LI || !LI->isSimple())
1577 return false;
1578
1579 // For now we only allow loads in the same block as the PHI. This is
1580 // a common case that happens when instcombine merges two loads through
1581 // a PHI.
1582 if (LI->getParent() != BB)
1583 return false;
1584
1585 if (LoadType) {
1586 if (LoadType != LI->getType())
1587 return false;
1588 } else {
1589 LoadType = LI->getType();
1590 }
1591
1592 // Ensure that there are no instructions between the PHI and the load that
1593 // could store.
1594 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1595 if (BBI->mayWriteToMemory())
1596 return false;
1597
1598 MaxAlign = std::max(a: MaxAlign, b: LI->getAlign());
1599 }
1600
1601 if (!LoadType)
1602 return false;
1603
1604 APInt LoadSize =
1605 APInt(APWidth, DL.getTypeStoreSize(Ty: LoadType).getFixedValue());
1606
  // We can only transform this if it is safe to push the loads into the
  // predecessor blocks. The only thing to watch out for is that we can't put
  // a possibly trapping load in the predecessor if the incoming edge is
  // critical.
1610 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1611 Instruction *TI = PN.getIncomingBlock(i: Idx)->getTerminator();
1612 Value *InVal = PN.getIncomingValue(i: Idx);
1613
1614 // If the value is produced by the terminator of the predecessor (an
1615 // invoke) or it has side-effects, there is no valid place to put a load
1616 // in the predecessor.
1617 if (TI == InVal || TI->mayHaveSideEffects())
1618 return false;
1619
1620 // If the predecessor has a single successor, then the edge isn't
1621 // critical.
1622 if (TI->getNumSuccessors() == 1)
1623 continue;
1624
1625 // If this pointer is always safe to load, or if we can prove that there
1626 // is already a load in the block, then we can move the load to the pred
1627 // block.
1628 if (isSafeToLoadUnconditionally(V: InVal, Alignment: MaxAlign, Size: LoadSize, DL, ScanFrom: TI))
1629 continue;
1630
1631 return false;
1632 }
1633
1634 return true;
1635}
1636
1637static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1638 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1639
1640 LoadInst *SomeLoad = cast<LoadInst>(Val: PN.user_back());
1641 Type *LoadTy = SomeLoad->getType();
1642 IRB.SetInsertPoint(&PN);
1643 PHINode *NewPN = IRB.CreatePHI(Ty: LoadTy, NumReservedValues: PN.getNumIncomingValues(),
1644 Name: PN.getName() + ".sroa.speculated");
1645
  // Get the AA tags and alignment to use from one of the loads. It does not
  // matter which one we use, even if they differ.
1648 AAMDNodes AATags = SomeLoad->getAAMetadata();
1649 Align Alignment = SomeLoad->getAlign();
1650
1651 // Rewrite all loads of the PN to use the new PHI.
1652 while (!PN.use_empty()) {
1653 LoadInst *LI = cast<LoadInst>(Val: PN.user_back());
1654 LI->replaceAllUsesWith(V: NewPN);
1655 LI->eraseFromParent();
1656 }
1657
1658 // Inject loads into all of the pred blocks.
1659 DenseMap<BasicBlock *, Value *> InjectedLoads;
1660 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1661 BasicBlock *Pred = PN.getIncomingBlock(i: Idx);
1662 Value *InVal = PN.getIncomingValue(i: Idx);
1663
1664 // A PHI node is allowed to have multiple (duplicated) entries for the same
1665 // basic block, as long as the value is the same. So if we already injected
1666 // a load in the predecessor, then we should reuse the same load for all
1667 // duplicated entries.
1668 if (Value *V = InjectedLoads.lookup(Val: Pred)) {
1669 NewPN->addIncoming(V, BB: Pred);
1670 continue;
1671 }
1672
1673 Instruction *TI = Pred->getTerminator();
1674 IRB.SetInsertPoint(TI);
1675
1676 LoadInst *Load = IRB.CreateAlignedLoad(
1677 Ty: LoadTy, Ptr: InVal, Align: Alignment,
1678 Name: (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1679 ++NumLoadsSpeculated;
1680 if (AATags)
1681 Load->setAAMetadata(AATags);
1682 NewPN->addIncoming(V: Load, BB: Pred);
1683 InjectedLoads[Pred] = Load;
1684 }
1685
1686 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1687 PN.eraseFromParent();
1688}
1689
1690SelectHandSpeculativity &
1691SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1692 if (isTrueVal)
1693 Bitfield::set<SelectHandSpeculativity::TrueVal>(Packed&: Storage, Value: true);
1694 else
1695 Bitfield::set<SelectHandSpeculativity::FalseVal>(Packed&: Storage, Value: true);
1696 return *this;
1697}
1698
1699bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1700 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Packed: Storage)
1701 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Packed: Storage);
1702}
1703
1704bool SelectHandSpeculativity::areAllSpeculatable() const {
1705 return isSpeculatable(/*isTrueVal=*/true) &&
1706 isSpeculatable(/*isTrueVal=*/false);
1707}
1708
1709bool SelectHandSpeculativity::areAnySpeculatable() const {
1710 return isSpeculatable(/*isTrueVal=*/true) ||
1711 isSpeculatable(/*isTrueVal=*/false);
1712}
1713bool SelectHandSpeculativity::areNoneSpeculatable() const {
1714 return !areAnySpeculatable();
1715}
1716
1717static SelectHandSpeculativity
1718isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1719 assert(LI.isSimple() && "Only for simple loads");
1720 SelectHandSpeculativity Spec;
1721
1722 const DataLayout &DL = SI.getDataLayout();
1723 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1724 if (isSafeToLoadUnconditionally(V: Value, Ty: LI.getType(), Alignment: LI.getAlign(), DL,
1725 ScanFrom: &LI))
1726 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1727 else if (PreserveCFG)
1728 return Spec;
1729
1730 return Spec;
1731}
1732
1733std::optional<RewriteableMemOps>
1734SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1735 RewriteableMemOps Ops;
1736
1737 for (User *U : SI.users()) {
1738 if (auto *BC = dyn_cast<BitCastInst>(Val: U); BC && BC->hasOneUse())
1739 U = *BC->user_begin();
1740
1741 if (auto *Store = dyn_cast<StoreInst>(Val: U)) {
1742 // Note that atomic stores can be transformed; atomic semantics do not
1743 // have any meaning for a local alloca. Stores are not speculatable,
1744 // however, so if we can't turn it into a predicated store, we are done.
1745 if (Store->isVolatile() || PreserveCFG)
1746 return {}; // Give up on this `select`.
1747 Ops.emplace_back(Args&: Store);
1748 continue;
1749 }
1750
1751 auto *LI = dyn_cast<LoadInst>(Val: U);
1752
    // Note that atomic loads can be transformed; atomic semantics do not have
    // any meaning for a local alloca.
1755 if (!LI || LI->isVolatile())
1756 return {}; // Give up on this `select`.
1757
1758 PossiblySpeculatableLoad Load(LI);
1759 if (!LI->isSimple()) {
1760 // If the `load` is not simple, we can't speculatively execute it,
1761 // but we could handle this via a CFG modification. But can we?
1762 if (PreserveCFG)
1763 return {}; // Give up on this `select`.
1764 Ops.emplace_back(Args&: Load);
1765 continue;
1766 }
1767
1768 SelectHandSpeculativity Spec =
1769 isSafeLoadOfSelectToSpeculate(LI&: *LI, SI, PreserveCFG);
1770 if (PreserveCFG && !Spec.areAllSpeculatable())
1771 return {}; // Give up on this `select`.
1772
1773 Load.setInt(Spec);
1774 Ops.emplace_back(Args&: Load);
1775 }
1776
1777 return Ops;
1778}
1779
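/// Speculate a load of a select whose two pointer operands are both safe to
/// load from unconditionally. For example (illustrative, value names
/// simplified):
///   %P = select i1 %cond, ptr %Alloca, ptr %Other
///   %V = load i32, ptr %P
/// becomes
///   %V.true = load i32, ptr %Alloca
///   %V.false = load i32, ptr %Other
///   %V = select i1 %cond, i32 %V.true, i32 %V.false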
1780static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1781 IRBuilderTy &IRB) {
1782 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1783
1784 Value *TV = SI.getTrueValue();
1785 Value *FV = SI.getFalseValue();
1786 // Replace the given load of the select with a select of two loads.
1787
1788 assert(LI.isSimple() && "We only speculate simple loads");
1789
1790 IRB.SetInsertPoint(&LI);
1791
1792 LoadInst *TL =
1793 IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: TV, Align: LI.getAlign(),
1794 Name: LI.getName() + ".sroa.speculate.load.true");
1795 LoadInst *FL =
1796 IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: FV, Align: LI.getAlign(),
1797 Name: LI.getName() + ".sroa.speculate.load.false");
1798 NumLoadsSpeculated += 2;
1799
1800 // Transfer alignment and AA info if present.
1801 TL->setAlignment(LI.getAlign());
1802 FL->setAlignment(LI.getAlign());
1803
1804 AAMDNodes Tags = LI.getAAMetadata();
1805 if (Tags) {
1806 TL->setAAMetadata(Tags);
1807 FL->setAAMetadata(Tags);
1808 }
1809
1810 Value *V = IRB.CreateSelect(C: SI.getCondition(), True: TL, False: FL,
1811 Name: LI.getName() + ".sroa.speculated");
1812
1813 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1814 LI.replaceAllUsesWith(V);
1815}
1816
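/// Rewrite a load or store of a select by branching on the select's condition:
/// each hand of the select gets a clone of the memory operation rewritten to
/// use that hand's pointer, placed either in a new conditional block or, when
/// that hand is speculatable, directly before the branch. For example
/// (illustrative), a store of a select with no speculatable hands becomes an
/// if/then/else with one rewritten store per arm; a rewritten load
/// additionally feeds a PHI in the tail block that replaces the original load.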
1817template <typename T>
1818static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1819 SelectHandSpeculativity Spec,
1820 DomTreeUpdater &DTU) {
1821 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1822 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1823 BasicBlock *Head = I.getParent();
1824 Instruction *ThenTerm = nullptr;
1825 Instruction *ElseTerm = nullptr;
1826 if (Spec.areNoneSpeculatable())
1827 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1828 SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU);
1829 else {
1830 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1831 SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU,
1832 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1833 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1834 cast<BranchInst>(Val: Head->getTerminator())->swapSuccessors();
1835 }
1836 auto *HeadBI = cast<BranchInst>(Val: Head->getTerminator());
1837 Spec = {}; // Do not use `Spec` beyond this point.
1838 BasicBlock *Tail = I.getParent();
1839 Tail->setName(Head->getName() + ".cont");
1840 PHINode *PN;
1841 if (isa<LoadInst>(I))
1842 PN = PHINode::Create(Ty: I.getType(), NumReservedValues: 2, NameStr: "", InsertBefore: I.getIterator());
1843 for (BasicBlock *SuccBB : successors(BB: Head)) {
1844 bool IsThen = SuccBB == HeadBI->getSuccessor(i: 0);
1845 int SuccIdx = IsThen ? 0 : 1;
1846 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1847 auto &CondMemOp = cast<T>(*I.clone());
1848 if (NewMemOpBB != Head) {
1849 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1850 if (isa<LoadInst>(I))
1851 ++NumLoadsPredicated;
1852 else
1853 ++NumStoresPredicated;
1854 } else {
1855 CondMemOp.dropUBImplyingAttrsAndMetadata();
1856 ++NumLoadsSpeculated;
1857 }
1858 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1859 Value *Ptr = SI.getOperand(i_nocapture: 1 + SuccIdx);
1860 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1861 if (isa<LoadInst>(I)) {
1862 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1863 PN->addIncoming(V: &CondMemOp, BB: NewMemOpBB);
1864 } else
1865 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1866 }
1867 if (isa<LoadInst>(I)) {
1868 PN->takeName(V: &I);
1869 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1870 I.replaceAllUsesWith(PN);
1871 }
1872}
1873
1874static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1875 SelectHandSpeculativity Spec,
1876 DomTreeUpdater &DTU) {
1877 if (auto *LI = dyn_cast<LoadInst>(Val: &I))
1878 rewriteMemOpOfSelect(SI&: SelInst, I&: *LI, Spec, DTU);
1879 else if (auto *SI = dyn_cast<StoreInst>(Val: &I))
1880 rewriteMemOpOfSelect(SI&: SelInst, I&: *SI, Spec, DTU);
1881 else
1882 llvm_unreachable_internal(msg: "Only for load and store.");
1883}
1884
1885static bool rewriteSelectInstMemOps(SelectInst &SI,
1886 const RewriteableMemOps &Ops,
1887 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1888 bool CFGChanged = false;
1889 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1890
1891 for (const RewriteableMemOp &Op : Ops) {
1892 SelectHandSpeculativity Spec;
1893 Instruction *I;
1894 if (auto *const *US = std::get_if<UnspeculatableStore>(ptr: &Op)) {
1895 I = *US;
1896 } else {
1897 auto PSL = std::get<PossiblySpeculatableLoad>(v: Op);
1898 I = PSL.getPointer();
1899 Spec = PSL.getInt();
1900 }
1901 if (Spec.areAllSpeculatable()) {
1902 speculateSelectInstLoads(SI, LI&: cast<LoadInst>(Val&: *I), IRB);
1903 } else {
1904 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1905 rewriteMemOpOfSelect(SelInst&: SI, I&: *I, Spec, DTU&: *DTU);
1906 CFGChanged = true;
1907 }
1908 I->eraseFromParent();
1909 }
1910
1911 for (User *U : make_early_inc_range(Range: SI.users()))
1912 cast<BitCastInst>(Val: U)->eraseFromParent();
1913 SI.eraseFromParent();
1914 return CFGChanged;
1915}
1916
1917/// Compute an adjusted pointer from Ptr by Offset bytes where the
1918/// resulting pointer has PointerTy.
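/// With opaque pointers this typically emits a single byte-offset GEP, e.g.
/// (illustrative)
///   %<prefix>sroa_idx = getelementptr inbounds i8, ptr %Ptr, i64 <Offset>
/// and, when the destination address space differs, an addrspacecast.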
1919static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1920 APInt Offset, Type *PointerTy,
1921 const Twine &NamePrefix) {
1922 if (Offset != 0)
1923 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, Offset: IRB.getInt(AI: Offset),
1924 Name: NamePrefix + "sroa_idx");
1925 return IRB.CreatePointerBitCastOrAddrSpaceCast(V: Ptr, DestTy: PointerTy,
1926 Name: NamePrefix + "sroa_cast");
1927}
1928
1929/// Compute the adjusted alignment for a load or store from an offset.
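/// For example, a load or store with align 16 that is rewritten to start at
/// byte offset 4 can be given at most align 4 (commonAlignment(16, 4) == 4),
/// while at offset 0 it keeps the full align 16.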
1930static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1931 return commonAlignment(A: getLoadStoreAlignment(I), Offset);
1932}
1933
1934/// Test whether we can convert a value from the old to the new type.
1935///
1936/// This predicate should be used to guard calls to convertValue in order to
1937/// ensure that we only try to convert viable values. The strategy is that we
1938/// will peel off single element struct and array wrappings to get to an
1939/// underlying value, and convert that value.
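/// For example (illustrative), i64 <-> <2 x i32> (same 64 bits) and i64 <->
/// ptr (for 64-bit pointers in an integral address space) are convertible,
/// while i32 <-> i64 (different sizes) and any aggregate type are not.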
1940static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1941 unsigned VScale = 0) {
1942 if (OldTy == NewTy)
1943 return true;
1944
1945 // For integer types, we can't handle any bit-width differences. This would
1946 // break both vector conversions with extension and introduce endianness
1947 // issues when in conjunction with loads and stores.
1948 if (isa<IntegerType>(Val: OldTy) && isa<IntegerType>(Val: NewTy)) {
1949 assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1950 cast<IntegerType>(NewTy)->getBitWidth() &&
1951 "We can't have the same bitwidth for different int types");
1952 return false;
1953 }
1954
1955 TypeSize NewSize = DL.getTypeSizeInBits(Ty: NewTy);
1956 TypeSize OldSize = DL.getTypeSizeInBits(Ty: OldTy);
1957
1958 if ((isa<ScalableVectorType>(Val: NewTy) && isa<FixedVectorType>(Val: OldTy)) ||
1959 (isa<ScalableVectorType>(Val: OldTy) && isa<FixedVectorType>(Val: NewTy))) {
1960 // Conversion is only possible when the size of scalable vectors is known.
1961 if (!VScale)
1962 return false;
1963
1964 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1965 // a single domain (either fixed or scalable). Any additional conversion
1966 // between fixed and scalable types is handled through integer types.
1967 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1968 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1969
1970 if (isa<ScalableVectorType>(Val: NewTy)) {
1971 if (!VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: NewVTy), EltTy: OldVTy))
1972 return false;
1973
1974 NewSize = TypeSize::getFixed(ExactSize: NewSize.getKnownMinValue() * VScale);
1975 } else {
1976 if (!VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: OldVTy), EltTy: NewVTy))
1977 return false;
1978
1979 OldSize = TypeSize::getFixed(ExactSize: OldSize.getKnownMinValue() * VScale);
1980 }
1981 }
1982
1983 if (NewSize != OldSize)
1984 return false;
1985 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1986 return false;
1987
1988 // We can convert pointers to integers and vice-versa. Same for vectors
1989 // of pointers and integers.
1990 OldTy = OldTy->getScalarType();
1991 NewTy = NewTy->getScalarType();
1992 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1993 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1994 unsigned OldAS = OldTy->getPointerAddressSpace();
1995 unsigned NewAS = NewTy->getPointerAddressSpace();
1996 // Convert pointers if they are pointers from the same address space or
1997 // different integral (not non-integral) address spaces with the same
1998 // pointer size.
1999 return OldAS == NewAS ||
2000 (!DL.isNonIntegralAddressSpace(AddrSpace: OldAS) &&
2001 !DL.isNonIntegralAddressSpace(AddrSpace: NewAS) &&
2002 DL.getPointerSize(AS: OldAS) == DL.getPointerSize(AS: NewAS));
2003 }
2004
2005 // We can convert integers to integral pointers, but not to non-integral
2006 // pointers.
2007 if (OldTy->isIntegerTy())
2008 return !DL.isNonIntegralPointerType(Ty: NewTy);
2009
2010 // We can convert integral pointers to integers, but non-integral pointers
2011 // need to remain pointers.
2012 if (!DL.isNonIntegralPointerType(Ty: OldTy))
2013 return NewTy->isIntegerTy();
2014
2015 return false;
2016 }
2017
2018 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
2019 return false;
2020
2021 return true;
2022}
2023
2024/// Generic routine to convert an SSA value to a value of a different
2025/// type.
2026///
2027/// This will try various different casting techniques, such as bitcasts,
2028/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
2029/// two types for viability with this routine.
2030static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2031 Type *NewTy) {
2032 Type *OldTy = V->getType();
2033
2034#ifndef NDEBUG
2035 BasicBlock *BB = IRB.GetInsertBlock();
2036 assert(BB && BB->getParent() && "VScale unknown!");
2037 unsigned VScale = BB->getParent()->getVScaleValue();
2038 assert(canConvertValue(DL, OldTy, NewTy, VScale) &&
2039 "Value not convertable to type");
2040#endif
2041
2042 if (OldTy == NewTy)
2043 return V;
2044
2045 assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
2046 "Integer types must be the exact same to convert.");
2047
  // A variant of bitcast that supports a mixture of fixed and scalable types
  // that are known to have the same size.
2050 auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * {
2051 Type *InTy = In->getType();
2052 if (InTy == Ty)
2053 return In;
2054
2055 if (isa<FixedVectorType>(Val: InTy) && isa<ScalableVectorType>(Val: Ty)) {
2056 // For vscale_range(2) expand <4 x i32> to <vscale x 4 x i16> -->
2057 // <4 x i32> to <vscale x 2 x i32> to <vscale x 4 x i16>
2058 auto *VTy = VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: Ty), EltTy: InTy);
2059 return IRB.CreateBitCast(V: IRB.CreateInsertVector(DstType: VTy,
2060 SrcVec: PoisonValue::get(T: VTy), SubVec: In,
2061 Idx: IRB.getInt64(C: 0)),
2062 DestTy: Ty);
2063 }
2064
2065 if (isa<ScalableVectorType>(Val: InTy) && isa<FixedVectorType>(Val: Ty)) {
2066 // For vscale_range(2) expand <vscale x 4 x i16> to <4 x i32> -->
2067 // <vscale x 4 x i16> to <vscale x 2 x i32> to <4 x i32>
2068 auto *VTy = VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: InTy), EltTy: Ty);
2069 return IRB.CreateExtractVector(DstType: Ty, SrcVec: IRB.CreateBitCast(V: In, DestTy: VTy),
2070 Idx: IRB.getInt64(C: 0));
2071 }
2072
2073 return IRB.CreateBitCast(V: In, DestTy: Ty);
2074 };
2075
2076 // See if we need inttoptr for this type pair. May require additional bitcast.
2077 if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2078 // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
2079 // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
2080 // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
2081 // Directly handle i64 to i8*
2082 return IRB.CreateIntToPtr(V: CreateBitCastLike(V, DL.getIntPtrType(NewTy)),
2083 DestTy: NewTy);
2084 }
2085
2086 // See if we need ptrtoint for this type pair. May require additional bitcast.
2087 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
2088 // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
2089 // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
2090 // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
2091 // Expand i8* to i64 --> i8* to i64 to i64
2092 return CreateBitCastLike(IRB.CreatePtrToInt(V, DestTy: DL.getIntPtrType(OldTy)),
2093 NewTy);
2094 }
2095
2096 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2097 unsigned OldAS = OldTy->getPointerAddressSpace();
2098 unsigned NewAS = NewTy->getPointerAddressSpace();
    // To convert pointers with different address spaces (already checked to be
    // convertible, i.e. they have the same pointer size), we cannot currently
    // use `bitcast` (which requires the source and destination address spaces
    // to match) or `addrspacecast` (which is not always a no-op cast). Instead,
    // use a pair of no-op `ptrtoint`/`inttoptr` casts through an integer with
    // the same bit width.
2105 if (OldAS != NewAS) {
2106 assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
2107 return IRB.CreateIntToPtr(
2108 V: CreateBitCastLike(IRB.CreatePtrToInt(V, DestTy: DL.getIntPtrType(OldTy)),
2109 DL.getIntPtrType(NewTy)),
2110 DestTy: NewTy);
2111 }
2112 }
2113
2114 return CreateBitCastLike(V, NewTy);
2115}
2116
2117/// Test whether the given slice use can be promoted to a vector.
2118///
2119/// This function is called to test each entry in a partition which is slated
2120/// for a single slice.
2121static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2122 VectorType *Ty,
2123 uint64_t ElementSize,
2124 const DataLayout &DL,
2125 unsigned VScale) {
2126 // First validate the slice offsets.
2127 uint64_t BeginOffset =
2128 std::max(a: S.beginOffset(), b: P.beginOffset()) - P.beginOffset();
2129 uint64_t BeginIndex = BeginOffset / ElementSize;
2130 if (BeginIndex * ElementSize != BeginOffset ||
2131 BeginIndex >= cast<FixedVectorType>(Val: Ty)->getNumElements())
2132 return false;
2133 uint64_t EndOffset = std::min(a: S.endOffset(), b: P.endOffset()) - P.beginOffset();
2134 uint64_t EndIndex = EndOffset / ElementSize;
2135 if (EndIndex * ElementSize != EndOffset ||
2136 EndIndex > cast<FixedVectorType>(Val: Ty)->getNumElements())
2137 return false;
2138
2139 assert(EndIndex > BeginIndex && "Empty vector!");
2140 uint64_t NumElements = EndIndex - BeginIndex;
2141 Type *SliceTy = (NumElements == 1)
2142 ? Ty->getElementType()
2143 : FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElements);
2144
2145 Type *SplitIntTy =
2146 Type::getIntNTy(C&: Ty->getContext(), N: NumElements * ElementSize * 8);
2147
2148 Use *U = S.getUse();
2149
2150 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
2151 if (MI->isVolatile())
2152 return false;
2153 if (!S.isSplittable())
2154 return false; // Skip any unsplittable intrinsics.
2155 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
2156 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2157 return false;
2158 } else if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
2159 if (LI->isVolatile())
2160 return false;
2161 Type *LTy = LI->getType();
2162 // Disable vector promotion when there are loads or stores of an FCA.
2163 if (LTy->isStructTy())
2164 return false;
2165 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2166 assert(LTy->isIntegerTy());
2167 LTy = SplitIntTy;
2168 }
2169 if (!canConvertValue(DL, OldTy: SliceTy, NewTy: LTy, VScale))
2170 return false;
2171 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
2172 if (SI->isVolatile())
2173 return false;
2174 Type *STy = SI->getValueOperand()->getType();
2175 // Disable vector promotion when there are loads or stores of an FCA.
2176 if (STy->isStructTy())
2177 return false;
2178 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2179 assert(STy->isIntegerTy());
2180 STy = SplitIntTy;
2181 }
2182 if (!canConvertValue(DL, OldTy: STy, NewTy: SliceTy, VScale))
2183 return false;
2184 } else {
2185 return false;
2186 }
2187
2188 return true;
2189}
2190
2191/// Test whether a vector type is viable for promotion.
2192///
2193/// This implements the necessary checking for \c checkVectorTypesForPromotion
2194/// (and thus isVectorPromotionViable) over all slices of the alloca for the
2195/// given VectorType.
2196static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
2197 const DataLayout &DL, unsigned VScale) {
2198 uint64_t ElementSize =
2199 DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
2200
  // While LLVM vectors are defined as bit-packed, we don't support element
  // sizes that aren't a whole number of bytes.
2203 if (ElementSize % 8)
2204 return false;
2205 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2206 "vector size not a multiple of element size?");
2207 ElementSize /= 8;
2208
2209 for (const Slice &S : P)
2210 if (!isVectorPromotionViableForSlice(P, S, Ty: VTy, ElementSize, DL, VScale))
2211 return false;
2212
2213 for (const Slice *S : P.splitSliceTails())
2214 if (!isVectorPromotionViableForSlice(P, S: *S, Ty: VTy, ElementSize, DL, VScale))
2215 return false;
2216
2217 return true;
2218}
2219
2220/// Test whether any vector type in \p CandidateTys is viable for promotion.
2221///
2222/// This implements the necessary checking for \c isVectorPromotionViable over
2223/// all slices of the alloca for the given VectorType.
2224static VectorType *
2225checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2226 SmallVectorImpl<VectorType *> &CandidateTys,
2227 bool HaveCommonEltTy, Type *CommonEltTy,
2228 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2229 VectorType *CommonVecPtrTy, unsigned VScale) {
2230 // If we didn't find a vector type, nothing to do here.
2231 if (CandidateTys.empty())
2232 return nullptr;
2233
2234 // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
2235 // then we should choose it, not some other alternative.
2236 // But, we can't perform a no-op pointer address space change via bitcast,
2237 // so if we didn't have a common pointer element type, bail.
2238 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2239 return nullptr;
2240
2241 // Try to pick the "best" element type out of the choices.
2242 if (!HaveCommonEltTy && HaveVecPtrTy) {
2243 // If there was a pointer element type, there's really only one choice.
2244 CandidateTys.clear();
2245 CandidateTys.push_back(Elt: CommonVecPtrTy);
2246 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2247 // Integer-ify vector types.
2248 for (VectorType *&VTy : CandidateTys) {
2249 if (!VTy->getElementType()->isIntegerTy())
2250 VTy = cast<VectorType>(Val: VTy->getWithNewType(EltTy: IntegerType::getIntNTy(
2251 C&: VTy->getContext(), N: VTy->getScalarSizeInBits())));
2252 }
2253
2254 // Rank the remaining candidate vector types. This is easy because we know
2255 // they're all integer vectors. We sort by ascending number of elements.
2256 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2257 (void)DL;
2258 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2259 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2260 "Cannot have vector types of different sizes!");
2261 assert(RHSTy->getElementType()->isIntegerTy() &&
2262 "All non-integer types eliminated!");
2263 assert(LHSTy->getElementType()->isIntegerTy() &&
2264 "All non-integer types eliminated!");
2265 return cast<FixedVectorType>(Val: RHSTy)->getNumElements() <
2266 cast<FixedVectorType>(Val: LHSTy)->getNumElements();
2267 };
2268 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2269 (void)DL;
2270 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2271 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2272 "Cannot have vector types of different sizes!");
2273 assert(RHSTy->getElementType()->isIntegerTy() &&
2274 "All non-integer types eliminated!");
2275 assert(LHSTy->getElementType()->isIntegerTy() &&
2276 "All non-integer types eliminated!");
2277 return cast<FixedVectorType>(Val: RHSTy)->getNumElements() ==
2278 cast<FixedVectorType>(Val: LHSTy)->getNumElements();
2279 };
2280 llvm::sort(C&: CandidateTys, Comp: RankVectorTypesComp);
2281 CandidateTys.erase(CS: llvm::unique(R&: CandidateTys, P: RankVectorTypesEq),
2282 CE: CandidateTys.end());
2283 } else {
2284// The only way to have the same element type in every vector type is to
2285// have the same vector type. Check that and remove all but one.
2286#ifndef NDEBUG
2287 for (VectorType *VTy : CandidateTys) {
2288 assert(VTy->getElementType() == CommonEltTy &&
2289 "Unaccounted for element type!");
2290 assert(VTy == CandidateTys[0] &&
2291 "Different vector types with the same element type!");
2292 }
2293#endif
2294 CandidateTys.resize(N: 1);
2295 }
2296
2297 // FIXME: hack. Do we have a named constant for this?
2298 // SDAG SDNode can't have more than 65535 operands.
2299 llvm::erase_if(C&: CandidateTys, P: [](VectorType *VTy) {
2300 return cast<FixedVectorType>(Val: VTy)->getNumElements() >
2301 std::numeric_limits<unsigned short>::max();
2302 });
2303
2304 for (VectorType *VTy : CandidateTys)
2305 if (checkVectorTypeForPromotion(P, VTy, DL, VScale))
2306 return VTy;
2307
2308 return nullptr;
2309}
2310
2311static VectorType *createAndCheckVectorTypesForPromotion(
2312 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2313 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2314 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2315 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2316 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2317 [[maybe_unused]] VectorType *OriginalElt =
2318 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2319 // Consider additional vector types where the element type size is a
2320 // multiple of load/store element size.
2321 for (Type *Ty : OtherTys) {
2322 if (!VectorType::isValidElementType(ElemTy: Ty))
2323 continue;
2324 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2325 // Make a copy of CandidateTys and iterate through it, because we
2326 // might append to CandidateTys in the loop.
2327 for (VectorType *const VTy : CandidateTysCopy) {
2328 // The elements in the copy should remain invariant throughout the loop
2329 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2330 unsigned VectorSize = DL.getTypeSizeInBits(Ty: VTy).getFixedValue();
2331 unsigned ElementSize =
2332 DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
2333 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2334 VectorSize % TypeSize == 0) {
2335 VectorType *NewVTy = VectorType::get(ElementType: Ty, NumElements: VectorSize / TypeSize, Scalable: false);
2336 CheckCandidateType(NewVTy);
2337 }
2338 }
2339 }
2340
2341 return checkVectorTypesForPromotion(
2342 P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2343 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2344}
2345
2346/// Test whether the given alloca partitioning and range of slices can be
2347/// promoted to a vector.
2348///
2349/// This is a quick test to check whether we can rewrite a particular alloca
2350/// partition (and its newly formed alloca) into a vector alloca with only
2351/// whole-vector loads and stores such that it could be promoted to a vector
2352/// SSA value. We only can ensure this for a limited set of operations, and we
2353/// don't want to do the rewrites unless we are confident that the result will
2354/// be promotable, so we have an early test here.
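/// For example (illustrative), a 16-byte partition accessed by whole-partition
/// <4 x float> loads and stores plus single-element float accesses at offsets
/// 0, 4, 8 and 12 can be rewritten as a promotable <4 x float> alloca.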
2355static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2356 unsigned VScale) {
2357 // Collect the candidate types for vector-based promotion. Also track whether
2358 // we have different element types.
2359 SmallVector<VectorType *, 4> CandidateTys;
2360 SetVector<Type *> LoadStoreTys;
2361 SetVector<Type *> DeferredTys;
2362 Type *CommonEltTy = nullptr;
2363 VectorType *CommonVecPtrTy = nullptr;
2364 bool HaveVecPtrTy = false;
2365 bool HaveCommonEltTy = true;
2366 bool HaveCommonVecPtrTy = true;
2367 auto CheckCandidateType = [&](Type *Ty) {
2368 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
      // If this candidate's total size in bits differs from the candidates
      // seen so far, no single bitcast-compatible vector type exists; drop
      // all candidates.
2370 if (!CandidateTys.empty()) {
2371 VectorType *V = CandidateTys[0];
2372 if (DL.getTypeSizeInBits(Ty: VTy).getFixedValue() !=
2373 DL.getTypeSizeInBits(Ty: V).getFixedValue()) {
2374 CandidateTys.clear();
2375 return;
2376 }
2377 }
2378 CandidateTys.push_back(Elt: VTy);
2379 Type *EltTy = VTy->getElementType();
2380
2381 if (!CommonEltTy)
2382 CommonEltTy = EltTy;
2383 else if (CommonEltTy != EltTy)
2384 HaveCommonEltTy = false;
2385
2386 if (EltTy->isPointerTy()) {
2387 HaveVecPtrTy = true;
2388 if (!CommonVecPtrTy)
2389 CommonVecPtrTy = VTy;
2390 else if (CommonVecPtrTy != VTy)
2391 HaveCommonVecPtrTy = false;
2392 }
2393 }
2394 };
2395
2396 // Put load and store types into a set for de-duplication.
2397 for (const Slice &S : P) {
2398 Type *Ty;
2399 if (auto *LI = dyn_cast<LoadInst>(Val: S.getUse()->getUser()))
2400 Ty = LI->getType();
2401 else if (auto *SI = dyn_cast<StoreInst>(Val: S.getUse()->getUser()))
2402 Ty = SI->getValueOperand()->getType();
2403 else
2404 continue;
2405
2406 auto CandTy = Ty->getScalarType();
2407 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2408 S.endOffset() != P.endOffset())) {
2409 DeferredTys.insert(X: Ty);
2410 continue;
2411 }
2412
2413 LoadStoreTys.insert(X: Ty);
    // Only loads and stores that cover the exact extent of the partition can
    // seed candidate vector types.
2415 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2416 CheckCandidateType(Ty);
2417 }
2418
2419 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2420 if (auto *VTy = createAndCheckVectorTypesForPromotion(
2421 OtherTys&: LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2422 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2423 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2424 return VTy;
2425
2426 CandidateTys.clear();
2427 return createAndCheckVectorTypesForPromotion(
2428 OtherTys&: DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2429 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2430 CommonVecPtrTy, VScale);
2431}
2432
2433/// Test whether a slice of an alloca is valid for integer widening.
2434///
2435/// This implements the necessary checking for the \c isIntegerWideningViable
2436/// test below on a single slice of the alloca.
2437static bool isIntegerWideningViableForSlice(const Slice &S,
2438 uint64_t AllocBeginOffset,
2439 Type *AllocaTy,
2440 const DataLayout &DL,
2441 bool &WholeAllocaOp) {
2442 uint64_t Size = DL.getTypeStoreSize(Ty: AllocaTy).getFixedValue();
2443
2444 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2445 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2446
2447 Use *U = S.getUse();
2448
  // Lifetime intrinsics operate over the whole alloca, whose size is usually
  // larger than other load/store slices (RelEnd > Size). But lifetime
  // intrinsics are always promotable and should not impact the promotability
  // of the partition's other slices.
2453 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
2454 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2455 return true;
2456 }
2457
2458 // We can't reasonably handle cases where the load or store extends past
2459 // the end of the alloca's type and into its padding.
2460 if (RelEnd > Size)
2461 return false;
2462
2463 if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
2464 if (LI->isVolatile())
2465 return false;
2466 // We can't handle loads that extend past the allocated memory.
2467 TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
2468 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2469 return false;
2470 // So far, AllocaSliceRewriter does not support widening split slice tails
2471 // in rewriteIntegerLoad.
2472 if (S.beginOffset() < AllocBeginOffset)
2473 return false;
2474 // Note that we don't count vector loads or stores as whole-alloca
2475 // operations which enable integer widening because we would prefer to use
2476 // vector widening instead.
2477 if (!isa<VectorType>(Val: LI->getType()) && RelBegin == 0 && RelEnd == Size)
2478 WholeAllocaOp = true;
2479 if (IntegerType *ITy = dyn_cast<IntegerType>(Val: LI->getType())) {
2480 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
2481 return false;
2482 } else if (RelBegin != 0 || RelEnd != Size ||
2483 !canConvertValue(DL, OldTy: AllocaTy, NewTy: LI->getType())) {
2484 // Non-integer loads need to be convertible from the alloca type so that
2485 // they are promotable.
2486 return false;
2487 }
2488 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
2489 Type *ValueTy = SI->getValueOperand()->getType();
2490 if (SI->isVolatile())
2491 return false;
2492 // We can't handle stores that extend past the allocated memory.
2493 TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValueTy);
2494 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2495 return false;
2496 // So far, AllocaSliceRewriter does not support widening split slice tails
2497 // in rewriteIntegerStore.
2498 if (S.beginOffset() < AllocBeginOffset)
2499 return false;
2500 // Note that we don't count vector loads or stores as whole-alloca
2501 // operations which enable integer widening because we would prefer to use
2502 // vector widening instead.
2503 if (!isa<VectorType>(Val: ValueTy) && RelBegin == 0 && RelEnd == Size)
2504 WholeAllocaOp = true;
2505 if (IntegerType *ITy = dyn_cast<IntegerType>(Val: ValueTy)) {
2506 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
2507 return false;
2508 } else if (RelBegin != 0 || RelEnd != Size ||
2509 !canConvertValue(DL, OldTy: ValueTy, NewTy: AllocaTy)) {
2510 // Non-integer stores need to be convertible to the alloca type so that
2511 // they are promotable.
2512 return false;
2513 }
2514 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
2515 if (MI->isVolatile() || !isa<Constant>(Val: MI->getLength()))
2516 return false;
2517 if (!S.isSplittable())
2518 return false; // Skip any unsplittable intrinsics.
2519 } else {
2520 return false;
2521 }
2522
2523 return true;
2524}
2525
2526/// Test whether the given alloca partition's integer operations can be
2527/// widened to promotable ones.
2528///
2529/// This is a quick test to check whether we can rewrite the integer loads and
2530/// stores to a particular alloca into wider loads and stores and be able to
2531/// promote the resulting alloca.
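/// For example (illustrative), an i64 alloca accessed via two i32 loads at
/// offsets 0 and 4 plus one full-width i64 store is viable: the store covers
/// the whole alloca and the narrow loads can be rewritten as shift/truncate
/// extractions from a promoted i64.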
2532static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2533 const DataLayout &DL) {
2534 uint64_t SizeInBits = DL.getTypeSizeInBits(Ty: AllocaTy).getFixedValue();
2535 // Don't create integer types larger than the maximum bitwidth.
2536 if (SizeInBits > IntegerType::MAX_INT_BITS)
2537 return false;
2538
2539 // Don't try to handle allocas with bit-padding.
2540 if (SizeInBits != DL.getTypeStoreSizeInBits(Ty: AllocaTy).getFixedValue())
2541 return false;
2542
2543 // We need to ensure that an integer type with the appropriate bitwidth can
2544 // be converted to the alloca type, whatever that is. We don't want to force
2545 // the alloca itself to have an integer type if there is a more suitable one.
2546 Type *IntTy = Type::getIntNTy(C&: AllocaTy->getContext(), N: SizeInBits);
2547 if (!canConvertValue(DL, OldTy: AllocaTy, NewTy: IntTy) ||
2548 !canConvertValue(DL, OldTy: IntTy, NewTy: AllocaTy))
2549 return false;
2550
2551 // While examining uses, we ensure that the alloca has a covering load or
2552 // store. We don't want to widen the integer operations only to fail to
2553 // promote due to some other unsplittable entry (which we may make splittable
2554 // later). However, if there are only splittable uses, go ahead and assume
2555 // that we cover the alloca.
2556 // FIXME: We shouldn't consider split slices that happen to start in the
2557 // partition here...
2558 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(Width: SizeInBits);
2559
2560 for (const Slice &S : P)
2561 if (!isIntegerWideningViableForSlice(S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2562 WholeAllocaOp))
2563 return false;
2564
2565 for (const Slice *S : P.splitSliceTails())
2566 if (!isIntegerWideningViableForSlice(S: *S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2567 WholeAllocaOp))
2568 return false;
2569
2570 return WholeAllocaOp;
2571}
2572
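/// Extract the integer value of type \p Ty that lives \p Offset bytes into the
/// wider integer \p V. For example (illustrative), extracting an i16 at byte
/// offset 2 from an i64 shifts right by 16 bits on little-endian targets (by
/// 32 bits, i.e. 8 * (8 - 2 - 2), on big-endian) and then truncates to i16.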
2573static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2574 IntegerType *Ty, uint64_t Offset,
2575 const Twine &Name) {
2576 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2577 IntegerType *IntTy = cast<IntegerType>(Val: V->getType());
2578 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2579 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2580 "Element extends past full value");
2581 uint64_t ShAmt = 8 * Offset;
2582 if (DL.isBigEndian())
2583 ShAmt = 8 * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2584 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2585 if (ShAmt) {
2586 V = IRB.CreateLShr(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2587 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2588 }
2589 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2590 "Cannot extract to a larger integer!");
2591 if (Ty != IntTy) {
2592 V = IRB.CreateTrunc(V, DestTy: Ty, Name: Name + ".trunc");
2593 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2594 }
2595 return V;
2596}
2597
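/// Insert the narrower integer \p V into \p Old at byte \p Offset: \p V is
/// zero-extended, shifted into position (mirroring extractInteger's endian
/// handling), and merged with \p Old after masking out the replaced bits. For
/// example (illustrative), inserting an i8 at offset 1 into an i32 on a
/// little-endian target shifts by 8 and masks \p Old with ~0x0000ff00.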
2598static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2599 Value *V, uint64_t Offset, const Twine &Name) {
2600 IntegerType *IntTy = cast<IntegerType>(Val: Old->getType());
2601 IntegerType *Ty = cast<IntegerType>(Val: V->getType());
2602 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2603 "Cannot insert a larger integer!");
2604 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2605 if (Ty != IntTy) {
2606 V = IRB.CreateZExt(V, DestTy: IntTy, Name: Name + ".ext");
2607 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2608 }
2609 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2610 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2611 "Element store outside of alloca store");
2612 uint64_t ShAmt = 8 * Offset;
2613 if (DL.isBigEndian())
2614 ShAmt = 8 * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2615 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2616 if (ShAmt) {
2617 V = IRB.CreateShl(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2618 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2619 }
2620
2621 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2622 APInt Mask = ~Ty->getMask().zext(width: IntTy->getBitWidth()).shl(shiftAmt: ShAmt);
2623 Old = IRB.CreateAnd(LHS: Old, RHS: Mask, Name: Name + ".mask");
2624 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2625 V = IRB.CreateOr(LHS: Old, RHS: V, Name: Name + ".insert");
2626 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2627 }
2628 return V;
2629}
2630
2631static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2632 unsigned EndIndex, const Twine &Name) {
2633 auto *VecTy = cast<FixedVectorType>(Val: V->getType());
2634 unsigned NumElements = EndIndex - BeginIndex;
2635 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2636
2637 if (NumElements == VecTy->getNumElements())
2638 return V;
2639
2640 if (NumElements == 1) {
2641 V = IRB.CreateExtractElement(Vec: V, Idx: IRB.getInt32(C: BeginIndex),
2642 Name: Name + ".extract");
2643 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2644 return V;
2645 }
2646
2647 auto Mask = llvm::to_vector<8>(Range: llvm::seq<int>(Begin: BeginIndex, End: EndIndex));
2648 V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".extract");
2649 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2650 return V;
2651}
2652
2653static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2654 unsigned BeginIndex, const Twine &Name) {
2655 VectorType *VecTy = cast<VectorType>(Val: Old->getType());
2656 assert(VecTy && "Can only insert a vector into a vector");
2657
2658 VectorType *Ty = dyn_cast<VectorType>(Val: V->getType());
2659 if (!Ty) {
2660 // Single element to insert.
2661 V = IRB.CreateInsertElement(Vec: Old, NewElt: V, Idx: IRB.getInt32(C: BeginIndex),
2662 Name: Name + ".insert");
2663 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2664 return V;
2665 }
2666
2667 assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2668 cast<FixedVectorType>(VecTy)->getNumElements() &&
2669 "Too many elements!");
2670 if (cast<FixedVectorType>(Val: Ty)->getNumElements() ==
2671 cast<FixedVectorType>(Val: VecTy)->getNumElements()) {
2672 assert(V->getType() == VecTy && "Vector type mismatch");
2673 return V;
2674 }
2675 unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Val: Ty)->getNumElements();
2676
  // When inserting a smaller vector into the larger one to store, we first
  // use a shuffle vector to widen it with poison elements, and then a select
  // to choose between the loaded vector and the incoming vector.
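  // For example (illustrative), inserting a <2 x i32> at BeginIndex 1 into a
  // <4 x i32> uses the shuffle mask <-1, 0, 1, -1> (poison, V[0], V[1],
  // poison) followed by a select with condition <0, 1, 1, 0>.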
2681 SmallVector<int, 8> Mask;
2682 Mask.reserve(N: cast<FixedVectorType>(Val: VecTy)->getNumElements());
2683 for (unsigned i = 0; i != cast<FixedVectorType>(Val: VecTy)->getNumElements(); ++i)
2684 if (i >= BeginIndex && i < EndIndex)
2685 Mask.push_back(Elt: i - BeginIndex);
2686 else
2687 Mask.push_back(Elt: -1);
2688 V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".expand");
2689 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2690
2691 SmallVector<Constant *, 8> Mask2;
2692 Mask2.reserve(N: cast<FixedVectorType>(Val: VecTy)->getNumElements());
2693 for (unsigned i = 0; i != cast<FixedVectorType>(Val: VecTy)->getNumElements(); ++i)
2694 Mask2.push_back(Elt: IRB.getInt1(V: i >= BeginIndex && i < EndIndex));
2695
2696 V = IRB.CreateSelect(C: ConstantVector::get(V: Mask2), True: V, False: Old, Name: Name + "blend");
2697
2698 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2699 return V;
2700}
2701
2702namespace {
2703
/// Visitor to rewrite instructions using a particular slice of an alloca
/// to use a new alloca.
2706///
2707/// Also implements the rewriting to vector-based accesses when the partition
2708/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2709/// lives here.
2710class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2711 // Befriend the base class so it can delegate to private visit methods.
2712 friend class InstVisitor<AllocaSliceRewriter, bool>;
2713
2714 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2715
2716 const DataLayout &DL;
2717 AllocaSlices &AS;
2718 SROA &Pass;
2719 AllocaInst &OldAI, &NewAI;
2720 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2721 Type *NewAllocaTy;
2722
2723 // This is a convenience and flag variable that will be null unless the new
2724 // alloca's integer operations should be widened to this integer type due to
2725 // passing isIntegerWideningViable above. If it is non-null, the desired
2726 // integer type will be stored here for easy access during rewriting.
2727 IntegerType *IntTy;
2728
2729 // If we are rewriting an alloca partition which can be written as pure
2730 // vector operations, we stash extra information here. When VecTy is
2731 // non-null, we have some strict guarantees about the rewritten alloca:
2732 // - The new alloca is exactly the size of the vector type here.
2733 // - The accesses all either map to the entire vector or to a single
2734 // element.
2735 // - The set of accessing instructions is only one of those handled above
2736 // in isVectorPromotionViable. Generally these are the same access kinds
2737 // which are promotable via mem2reg.
2738 VectorType *VecTy;
2739 Type *ElementTy;
2740 uint64_t ElementSize;
2741
2742 // The original offset of the slice currently being rewritten relative to
2743 // the original alloca.
2744 uint64_t BeginOffset = 0;
2745 uint64_t EndOffset = 0;
2746
2747 // The new offsets of the slice currently being rewritten relative to the
2748 // original alloca.
2749 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2750
2751 uint64_t SliceSize = 0;
2752 bool IsSplittable = false;
2753 bool IsSplit = false;
2754 Use *OldUse = nullptr;
2755 Instruction *OldPtr = nullptr;
2756
2757 // Track post-rewrite users which are PHI nodes and Selects.
2758 SmallSetVector<PHINode *, 8> &PHIUsers;
2759 SmallSetVector<SelectInst *, 8> &SelectUsers;
2760
  // Utility IR builder, whose name prefix is set up for each visited use, and
  // whose insertion point is set to point to the user.
2763 IRBuilderTy IRB;
2764
2765 // Return the new alloca, addrspacecasted if required to avoid changing the
2766 // addrspace of a volatile access.
2767 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2768 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2769 return &NewAI;
2770
2771 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2772 return IRB.CreateAddrSpaceCast(V: &NewAI, DestTy: AccessTy);
2773 }
2774
2775public:
2776 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2777 AllocaInst &OldAI, AllocaInst &NewAI,
2778 uint64_t NewAllocaBeginOffset,
2779 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2780 VectorType *PromotableVecTy,
2781 SmallSetVector<PHINode *, 8> &PHIUsers,
2782 SmallSetVector<SelectInst *, 8> &SelectUsers)
2783 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2784 NewAllocaBeginOffset(NewAllocaBeginOffset),
2785 NewAllocaEndOffset(NewAllocaEndOffset),
2786 NewAllocaTy(NewAI.getAllocatedType()),
2787 IntTy(
2788 IsIntegerPromotable
2789 ? Type::getIntNTy(C&: NewAI.getContext(),
2790 N: DL.getTypeSizeInBits(Ty: NewAI.getAllocatedType())
2791 .getFixedValue())
2792 : nullptr),
2793 VecTy(PromotableVecTy),
2794 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2795 ElementSize(VecTy ? DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / 8
2796 : 0),
2797 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2798 IRB(NewAI.getContext(), ConstantFolder()) {
2799 if (VecTy) {
2800 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2801 "Only multiple-of-8 sized vector elements are viable");
2802 ++NumVectorized;
2803 }
2804 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2805 }
2806
2807 bool visit(AllocaSlices::const_iterator I) {
2808 bool CanSROA = true;
2809 BeginOffset = I->beginOffset();
2810 EndOffset = I->endOffset();
2811 IsSplittable = I->isSplittable();
2812 IsSplit =
2813 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2814 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2815 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2816 LLVM_DEBUG(dbgs() << "\n");
2817
2818 // Compute the intersecting offset range.
2819 assert(BeginOffset < NewAllocaEndOffset);
2820 assert(EndOffset > NewAllocaBeginOffset);
2821 NewBeginOffset = std::max(a: BeginOffset, b: NewAllocaBeginOffset);
2822 NewEndOffset = std::min(a: EndOffset, b: NewAllocaEndOffset);
2823
2824 SliceSize = NewEndOffset - NewBeginOffset;
2825 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2826 << ") NewBegin:(" << NewBeginOffset << ", "
2827 << NewEndOffset << ") NewAllocaBegin:("
2828 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2829 << ")\n");
2830 assert(IsSplit || NewBeginOffset == BeginOffset);
2831 OldUse = I->getUse();
2832 OldPtr = cast<Instruction>(Val: OldUse->get());
2833
2834 Instruction *OldUserI = cast<Instruction>(Val: OldUse->getUser());
2835 IRB.SetInsertPoint(OldUserI);
2836 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2837 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2838 Twine(BeginOffset) + ".");
2839
2840 CanSROA &= visit(I: cast<Instruction>(Val: OldUse->getUser()));
2841 if (VecTy || IntTy)
2842 assert(CanSROA);
2843 return CanSROA;
2844 }
2845
2846private:
2847 // Make sure the other visit overloads are visible.
2848 using Base::visit;
2849
2850 // Every instruction which can end up as a user must have a rewrite rule.
2851 bool visitInstruction(Instruction &I) {
2852 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
2853 llvm_unreachable("No rewrite rule for this instruction!");
2854 }
2855
2856 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
2857 // Note that the offset computation can use BeginOffset or NewBeginOffset
2858 // interchangeably for unsplit slices.
2859 assert(IsSplit || BeginOffset == NewBeginOffset);
2860 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2861
2862#ifndef NDEBUG
2863 StringRef OldName = OldPtr->getName();
2864 // Skip through the last '.sroa.' component of the name.
2865 size_t LastSROAPrefix = OldName.rfind(".sroa.");
2866 if (LastSROAPrefix != StringRef::npos) {
2867 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
2868 // Look for an SROA slice index.
2869 size_t IndexEnd = OldName.find_first_not_of("0123456789");
2870 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
2871 // Strip the index and look for the offset.
2872 OldName = OldName.substr(IndexEnd + 1);
2873 size_t OffsetEnd = OldName.find_first_not_of("0123456789");
2874 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
2875 // Strip the offset.
2876 OldName = OldName.substr(OffsetEnd + 1);
2877 }
2878 }
2879 // Strip any SROA suffixes as well.
2880 OldName = OldName.substr(0, OldName.find(".sroa_"));
2881#endif
2882
2883 return getAdjustedPtr(IRB, DL, Ptr: &NewAI,
2884 Offset: APInt(DL.getIndexTypeSizeInBits(Ty: PointerTy), Offset),
2885 PointerTy,
2886#ifndef NDEBUG
2887 Twine(OldName) + "."
2888#else
2889 NamePrefix: Twine()
2890#endif
2891 );
2892 }
2893
2894 /// Compute a suitable alignment to access this slice of the *new*
2895 /// alloca.
2896 ///
2897 /// This is the alignment of the new alloca itself, adjusted for the byte
2898 /// offset of the slice within it.
2899 Align getSliceAlign() {
2900 return commonAlignment(A: NewAI.getAlign(),
2901 Offset: NewBeginOffset - NewAllocaBeginOffset);
2902 }
2903
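// Map an offset within the new alloca to the vector element index it
// addresses. For example (illustrative), with a <4 x i32> alloca and
// ElementSize == 4, an offset 8 bytes past NewAllocaBeginOffset maps to
// element index 2; offsets that do not fall on an element boundary are
// rejected by the assertion below.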
2904 unsigned getIndex(uint64_t Offset) {
2905 assert(VecTy && "Can only call getIndex when rewriting a vector");
2906 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
2907 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
2908 uint32_t Index = RelOffset / ElementSize;
2909 assert(Index * ElementSize == RelOffset);
2910 return Index;
2911 }
2912
2913 void deleteIfTriviallyDead(Value *V) {
2914 Instruction *I = cast<Instruction>(Val: V);
2915 if (isInstructionTriviallyDead(I))
2916 Pass.DeadInsts.push_back(Elt: I);
2917 }
2918
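// Rewrite a load of a slice of a vector-promotable alloca. Conceptually the
// slice load becomes a load of the whole new vector followed by an extract
// of the covered elements; e.g. (illustrative) loading bytes [4, 12) of a
// <4 x i32> alloca loads the full vector and extracts elements 1 and 2.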
2919 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
2920 unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
2921 unsigned EndIndex = getIndex(Offset: NewEndOffset);
2922 assert(EndIndex > BeginIndex && "Empty vector!");
2923
2924 LoadInst *Load = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
2925 Align: NewAI.getAlign(), Name: "load");
2926
2927 Load->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
2928 LLVMContext::MD_access_group});
2929 return extractVector(IRB, V: Load, BeginIndex, EndIndex, Name: "vec");
2930 }
2931
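// Rewrite a load of a slice of an integer-widened alloca. Conceptually this
// loads the whole wide integer and extracts the covered bits; e.g.
// (illustrative) an i16 load at byte offset 2 of an i64-widened alloca loads
// the i64 and extracts the corresponding 16 bits, zero-extending afterwards
// if the original load read past the end of the slice.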
2932 Value *rewriteIntegerLoad(LoadInst &LI) {
2933 assert(IntTy && "We cannot insert an integer to the alloca");
2934 assert(!LI.isVolatile());
2935 Value *V = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
2936 Align: NewAI.getAlign(), Name: "load");
2937 V = convertValue(DL, IRB, V, NewTy: IntTy);
2938 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2939 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2940 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
2941 IntegerType *ExtractTy = Type::getIntNTy(C&: LI.getContext(), N: SliceSize * 8);
2942 V = extractInteger(DL, IRB, V, Ty: ExtractTy, Offset, Name: "extract");
2943 }
2944 // It is possible that the extracted type is not the load type. This
2945 // happens if there is a load past the end of the alloca, and as
2946 // a consequence the slice is narrower but still a candidate for integer
2947 // lowering. To handle this case, we just zero extend the extracted
2948 // integer.
2949 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
2950 "Can only handle an extract for an overly wide load");
2951 if (cast<IntegerType>(Val: LI.getType())->getBitWidth() > SliceSize * 8)
2952 V = IRB.CreateZExt(V, DestTy: LI.getType());
2953 return V;
2954 }
2955
2956 bool visitLoadInst(LoadInst &LI) {
2957 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
2958 Value *OldOp = LI.getOperand(i_nocapture: 0);
2959 assert(OldOp == OldPtr);
2960
2961 AAMDNodes AATags = LI.getAAMetadata();
2962
2963 unsigned AS = LI.getPointerAddressSpace();
2964
2965 Type *TargetTy = IsSplit ? Type::getIntNTy(C&: LI.getContext(), N: SliceSize * 8)
2966 : LI.getType();
2967 bool IsPtrAdjusted = false;
2968 Value *V;
2969 if (VecTy) {
2970 V = rewriteVectorizedLoadInst(LI);
2971 } else if (IntTy && LI.getType()->isIntegerTy()) {
2972 V = rewriteIntegerLoad(LI);
2973 } else if (NewBeginOffset == NewAllocaBeginOffset &&
2974 NewEndOffset == NewAllocaEndOffset &&
2975 (canConvertValue(DL, OldTy: NewAllocaTy, NewTy: TargetTy) ||
2976 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
2977 DL.getTypeStoreSize(Ty: TargetTy).getFixedValue() > SliceSize &&
2978 !LI.isVolatile()))) {
2979 Value *NewPtr =
2980 getPtrToNewAI(AddrSpace: LI.getPointerAddressSpace(), IsVolatile: LI.isVolatile());
2981 LoadInst *NewLI = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: NewPtr,
2982 Align: NewAI.getAlign(), isVolatile: LI.isVolatile(),
2983 Name: LI.getName());
2984 if (LI.isVolatile())
2985 NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
2986 if (NewLI->isAtomic())
2987 NewLI->setAlignment(LI.getAlign());
2988
2989 // Copy any metadata that is valid for the new load. This may require
2990 // conversion to a different kind of metadata, e.g. !nonnull might change
2991 // to !range or vice versa.
2992 copyMetadataForLoad(Dest&: *NewLI, Source: LI);
2993
2994 // Do this after copyMetadataForLoad() to preserve the TBAA shift.
2995 if (AATags)
2996 NewLI->setAAMetadata(AATags.adjustForAccess(
2997 Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));
2998
2999 // !nonnull metadata is preserved (or converted) by copyMetadataForLoad.
3000 V = NewLI;
3001
3002 // If this is an integer load past the end of the slice (which means the
3003 // bytes outside the slice are undef or this load is dead) just forcibly
3004 // fix the integer size with correct handling of endianness.
3005 if (auto *AITy = dyn_cast<IntegerType>(Val: NewAllocaTy))
3006 if (auto *TITy = dyn_cast<IntegerType>(Val: TargetTy))
3007 if (AITy->getBitWidth() < TITy->getBitWidth()) {
3008 V = IRB.CreateZExt(V, DestTy: TITy, Name: "load.ext");
3009 if (DL.isBigEndian())
3010 V = IRB.CreateShl(LHS: V, RHS: TITy->getBitWidth() - AITy->getBitWidth(),
3011 Name: "endian_shift");
3012 }
3013 } else {
3014 Type *LTy = IRB.getPtrTy(AddrSpace: AS);
3015 LoadInst *NewLI =
3016 IRB.CreateAlignedLoad(Ty: TargetTy, Ptr: getNewAllocaSlicePtr(IRB, PointerTy: LTy),
3017 Align: getSliceAlign(), isVolatile: LI.isVolatile(), Name: LI.getName());
3018
3019 if (AATags)
3020 NewLI->setAAMetadata(AATags.adjustForAccess(
3021 Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));
3022
3023 if (LI.isVolatile())
3024 NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
3025 NewLI->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3026 LLVMContext::MD_access_group});
3027
3028 V = NewLI;
3029 IsPtrAdjusted = true;
3030 }
3031 V = convertValue(DL, IRB, V, NewTy: TargetTy);
3032
3033 if (IsSplit) {
3034 assert(!LI.isVolatile());
3035 assert(LI.getType()->isIntegerTy() &&
3036 "Only integer type loads and stores are split");
3037 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3038 "Split load isn't smaller than original load");
3039 assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3040 "Non-byte-multiple bit width");
3041 // Move the insertion point just past the load so that we can refer to it.
3042 BasicBlock::iterator LIIt = std::next(x: LI.getIterator());
3043 // Ensure the insertion point comes before any debug-info immediately
3044 // after the load, so that variable values referring to the load are
3045 // dominated by it.
3046 LIIt.setHeadBit(true);
3047 IRB.SetInsertPoint(TheBB: LI.getParent(), IP: LIIt);
3048 // Create a placeholder value with the same type as LI to use as the
3049 // basis for the new value. This allows us to replace the uses of LI with
3050 // the computed value, and then replace the placeholder with LI, leaving
3051 // LI only used for this computation.
3052 Value *Placeholder =
3053 new LoadInst(LI.getType(), PoisonValue::get(T: IRB.getPtrTy(AddrSpace: AS)), "",
3054 false, Align(1));
3055 V = insertInteger(DL, IRB, Old: Placeholder, V, Offset: NewBeginOffset - BeginOffset,
3056 Name: "insert");
3057 LI.replaceAllUsesWith(V);
3058 Placeholder->replaceAllUsesWith(V: &LI);
3059 Placeholder->deleteValue();
3060 } else {
3061 LI.replaceAllUsesWith(V);
3062 }
3063
3064 Pass.DeadInsts.push_back(Elt: &LI);
3065 deleteIfTriviallyDead(V: OldOp);
3066 LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
3067 return !LI.isVolatile() && !IsPtrAdjusted;
3068 }
3069
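// Rewrite a store to a slice of a vector-promotable alloca. When the store
// covers only some of the elements this is a read-modify-write: load the
// current vector, insert the stored value over the covered lanes, and store
// the whole vector back.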
3070 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3071 AAMDNodes AATags) {
3072 // Capture V for the purpose of debug-info accounting once it's converted
3073 // to a vector store.
3074 Value *OrigV = V;
3075 if (V->getType() != VecTy) {
3076 unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3077 unsigned EndIndex = getIndex(Offset: NewEndOffset);
3078 assert(EndIndex > BeginIndex && "Empty vector!");
3079 unsigned NumElements = EndIndex - BeginIndex;
3080 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3081 "Too many elements!");
3082 Type *SliceTy = (NumElements == 1)
3083 ? ElementTy
3084 : FixedVectorType::get(ElementType: ElementTy, NumElts: NumElements);
3085 if (V->getType() != SliceTy)
3086 V = convertValue(DL, IRB, V, NewTy: SliceTy);
3087
3088 // Mix in the existing elements.
3089 Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3090 Align: NewAI.getAlign(), Name: "load");
3091 V = insertVector(IRB, Old, V, BeginIndex, Name: "vec");
3092 }
3093 StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
3094 Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3095 LLVMContext::MD_access_group});
3096 if (AATags)
3097 Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3098 AccessTy: V->getType(), DL));
3099 Pass.DeadInsts.push_back(Elt: &SI);
3100
3101 // NOTE: Careful to use OrigV rather than V.
3102 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
3103 Inst: Store, Dest: Store->getPointerOperand(), Value: OrigV, DL);
3104 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3105 return true;
3106 }
3107
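// Rewrite a store to a slice of an integer-widened alloca. A partial store
// is likewise a read-modify-write; e.g. (illustrative) an i16 store at byte
// offset 2 of an i64-widened alloca loads the i64, inserts the 16 stored
// bits at the matching bit offset, and stores the i64 back.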
3108 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3109 assert(IntTy && "We cannot extract an integer from the alloca");
3110 assert(!SI.isVolatile());
3111 if (DL.getTypeSizeInBits(Ty: V->getType()).getFixedValue() !=
3112 IntTy->getBitWidth()) {
3113 Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3114 Align: NewAI.getAlign(), Name: "oldload");
3115 Old = convertValue(DL, IRB, V: Old, NewTy: IntTy);
3116 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3117 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3118 V = insertInteger(DL, IRB, Old, V: SI.getValueOperand(), Offset, Name: "insert");
3119 }
3120 V = convertValue(DL, IRB, V, NewTy: NewAllocaTy);
3121 StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
3122 Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3123 LLVMContext::MD_access_group});
3124 if (AATags)
3125 Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3126 AccessTy: V->getType(), DL));
3127
3128 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
3129 Inst: Store, Dest: Store->getPointerOperand(),
3130 Value: Store->getValueOperand(), DL);
3131
3132 Pass.DeadInsts.push_back(Elt: &SI);
3133 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3134 return true;
3135 }
3136
3137 bool visitStoreInst(StoreInst &SI) {
3138 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3139 Value *OldOp = SI.getOperand(i_nocapture: 1);
3140 assert(OldOp == OldPtr);
3141
3142 AAMDNodes AATags = SI.getAAMetadata();
3143 Value *V = SI.getValueOperand();
3144
3145 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3146 // alloca that should be re-examined after promoting this alloca.
3147 if (V->getType()->isPointerTy())
3148 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: V->stripInBoundsOffsets()))
3149 Pass.PostPromotionWorklist.insert(X: AI);
3150
3151 TypeSize StoreSize = DL.getTypeStoreSize(Ty: V->getType());
3152 if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3153 assert(!SI.isVolatile());
3154 assert(V->getType()->isIntegerTy() &&
3155 "Only integer type loads and stores are split");
3156 assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3157 "Non-byte-multiple bit width");
3158 IntegerType *NarrowTy = Type::getIntNTy(C&: SI.getContext(), N: SliceSize * 8);
3159 V = extractInteger(DL, IRB, V, Ty: NarrowTy, Offset: NewBeginOffset - BeginOffset,
3160 Name: "extract");
3161 }
3162
3163 if (VecTy)
3164 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3165 if (IntTy && V->getType()->isIntegerTy())
3166 return rewriteIntegerStore(V, SI, AATags);
3167
3168 StoreInst *NewSI;
3169 if (NewBeginOffset == NewAllocaBeginOffset &&
3170 NewEndOffset == NewAllocaEndOffset &&
3171 canConvertValue(DL, OldTy: V->getType(), NewTy: NewAllocaTy)) {
3172 V = convertValue(DL, IRB, V, NewTy: NewAllocaTy);
3173 Value *NewPtr =
3174 getPtrToNewAI(AddrSpace: SI.getPointerAddressSpace(), IsVolatile: SI.isVolatile());
3175
3176 NewSI =
3177 IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: SI.isVolatile());
3178 } else {
3179 unsigned AS = SI.getPointerAddressSpace();
3180 Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: IRB.getPtrTy(AddrSpace: AS));
3181 NewSI =
3182 IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: getSliceAlign(), isVolatile: SI.isVolatile());
3183 }
3184 NewSI->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3185 LLVMContext::MD_access_group});
3186 if (AATags)
3187 NewSI->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3188 AccessTy: V->getType(), DL));
3189 if (SI.isVolatile())
3190 NewSI->setAtomic(Ordering: SI.getOrdering(), SSID: SI.getSyncScopeID());
3191 if (NewSI->isAtomic())
3192 NewSI->setAlignment(SI.getAlign());
3193
3194 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
3195 Inst: NewSI, Dest: NewSI->getPointerOperand(),
3196 Value: NewSI->getValueOperand(), DL);
3197
3198 Pass.DeadInsts.push_back(Elt: &SI);
3199 deleteIfTriviallyDead(V: OldOp);
3200
3201 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
3202 return NewSI->getPointerOperand() == &NewAI &&
3203 NewSI->getValueOperand()->getType() == NewAllocaTy &&
3204 !SI.isVolatile();
3205 }
3206
3207 /// Compute an integer value from splatting an i8 across the given
3208 /// number of bytes.
3209 ///
3210 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3211 /// call this routine.
3212 /// FIXME: Heed the advice above.
3213 ///
3214 /// \param V The i8 value to splat.
3215 /// \param Size The number of bytes in the output (assuming i8 is one byte)
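///
/// For example (illustrative), splatting the byte 0xAB across 4 bytes
/// computes zext(0xAB) * (0xFFFFFFFF udiv 0xFF) = 0xAB * 0x01010101 =
/// 0xABABABAB.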
3216 Value *getIntegerSplat(Value *V, unsigned Size) {
3217 assert(Size > 0 && "Expected a positive number of bytes.");
3218 IntegerType *VTy = cast<IntegerType>(Val: V->getType());
3219 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3220 if (Size == 1)
3221 return V;
3222
3223 Type *SplatIntTy = Type::getIntNTy(C&: VTy->getContext(), N: Size * 8);
3224 V = IRB.CreateMul(
3225 LHS: IRB.CreateZExt(V, DestTy: SplatIntTy, Name: "zext"),
3226 RHS: IRB.CreateUDiv(LHS: Constant::getAllOnesValue(Ty: SplatIntTy),
3227 RHS: IRB.CreateZExt(V: Constant::getAllOnesValue(Ty: V->getType()),
3228 DestTy: SplatIntTy)),
3229 Name: "isplat");
3230 return V;
3231 }
3232
3233 /// Compute a vector splat for a given element value.
3234 Value *getVectorSplat(Value *V, unsigned NumElements) {
3235 V = IRB.CreateVectorSplat(NumElts: NumElements, V, Name: "vsplat");
3236 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3237 return V;
3238 }
3239
3240 bool visitMemSetInst(MemSetInst &II) {
3241 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3242 assert(II.getRawDest() == OldPtr);
3243
3244 AAMDNodes AATags = II.getAAMetadata();
3245
3246 // If the memset has a variable size, it cannot be split, just adjust the
3247 // pointer to the new alloca.
3248 if (!isa<ConstantInt>(Val: II.getLength())) {
3249 assert(!IsSplit);
3250 assert(NewBeginOffset == BeginOffset);
3251 II.setDest(getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()));
3252 II.setDestAlignment(getSliceAlign());
3253 // In theory we should call migrateDebugInfo here. However, we do not
3254 // emit dbg.assign intrinsics for mem intrinsics storing through non-
3255 // constant geps, or storing a variable number of bytes.
3256 assert(at::getAssignmentMarkers(&II).empty() &&
3257 at::getDVRAssignmentMarkers(&II).empty() &&
3258 "AT: Unexpected link to non-const GEP");
3259 deleteIfTriviallyDead(V: OldPtr);
3260 return false;
3261 }
3262
3263 // Record this instruction for deletion.
3264 Pass.DeadInsts.push_back(Elt: &II);
3265
3266 Type *AllocaTy = NewAI.getAllocatedType();
3267 Type *ScalarTy = AllocaTy->getScalarType();
3268
3269 const bool CanContinue = [&]() {
3270 if (VecTy || IntTy)
3271 return true;
3272 if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3273 return false;
3274 // Length must be in range for FixedVectorType.
3275 auto *C = cast<ConstantInt>(Val: II.getLength());
3276 const uint64_t Len = C->getLimitedValue();
3277 if (Len > std::numeric_limits<unsigned>::max())
3278 return false;
3279 auto *Int8Ty = IntegerType::getInt8Ty(C&: NewAI.getContext());
3280 auto *SrcTy = FixedVectorType::get(ElementType: Int8Ty, NumElts: Len);
3281 return canConvertValue(DL, OldTy: SrcTy, NewTy: AllocaTy) &&
3282 DL.isLegalInteger(Width: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue());
3283 }();
3284
3285 // If this doesn't map cleanly onto the alloca type, and that type isn't
3286 // a single value type, just emit a memset.
3287 if (!CanContinue) {
3288 Type *SizeTy = II.getLength()->getType();
3289 unsigned Sz = NewEndOffset - NewBeginOffset;
3290 Constant *Size = ConstantInt::get(Ty: SizeTy, V: Sz);
3291 MemIntrinsic *New = cast<MemIntrinsic>(Val: IRB.CreateMemSet(
3292 Ptr: getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()), Val: II.getValue(), Size,
3293 Align: MaybeAlign(getSliceAlign()), isVolatile: II.isVolatile()));
3294 if (AATags)
3295 New->setAAMetadata(
3296 AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset, AccessSize: Sz));
3297
3298 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
3299 Inst: New, Dest: New->getRawDest(), Value: nullptr, DL);
3300
3301 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3302 return false;
3303 }
3304
3305 // If we can represent this as a simple value, we have to build the actual
3306 // value to store, which requires expanding the byte present in memset to
3307 // a sensible representation for the alloca type. This is essentially
3308 // splatting the byte to a sufficiently wide integer, splatting it across
3309 // any desired vector width, and bitcasting to the final type.
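// For example (illustrative), a memset of the byte 0x2A over an alloca
// promoted to <2 x float> splats the byte into the i32 0x2A2A2A2A, bitcasts
// that to float, splats the float across both lanes, and stores the
// resulting vector.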
3310 Value *V;
3311
3312 if (VecTy) {
3313 // If this is a memset of a vectorized alloca, insert it.
3314 assert(ElementTy == ScalarTy);
3315
3316 unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3317 unsigned EndIndex = getIndex(Offset: NewEndOffset);
3318 assert(EndIndex > BeginIndex && "Empty vector!");
3319 unsigned NumElements = EndIndex - BeginIndex;
3320 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3321 "Too many elements!");
3322
3323 Value *Splat = getIntegerSplat(
3324 V: II.getValue(), Size: DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / 8);
3325 Splat = convertValue(DL, IRB, V: Splat, NewTy: ElementTy);
3326 if (NumElements > 1)
3327 Splat = getVectorSplat(V: Splat, NumElements);
3328
3329 Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3330 Align: NewAI.getAlign(), Name: "oldload");
3331 V = insertVector(IRB, Old, V: Splat, BeginIndex, Name: "vec");
3332 } else if (IntTy) {
3333 // If this is a memset on an alloca where we can widen stores, insert the
3334 // set integer.
3335 assert(!II.isVolatile());
3336
3337 uint64_t Size = NewEndOffset - NewBeginOffset;
3338 V = getIntegerSplat(V: II.getValue(), Size);
3339
3340 if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
3341 EndOffset != NewAllocaEndOffset)) {
3342 Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3343 Align: NewAI.getAlign(), Name: "oldload");
3344 Old = convertValue(DL, IRB, V: Old, NewTy: IntTy);
3345 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3346 V = insertInteger(DL, IRB, Old, V, Offset, Name: "insert");
3347 } else {
3348 assert(V->getType() == IntTy &&
3349 "Wrong type for an alloca wide integer!");
3350 }
3351 V = convertValue(DL, IRB, V, NewTy: AllocaTy);
3352 } else {
3353 // Established these invariants above.
3354 assert(NewBeginOffset == NewAllocaBeginOffset);
3355 assert(NewEndOffset == NewAllocaEndOffset);
3356
3357 V = getIntegerSplat(V: II.getValue(),
3358 Size: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue() / 8);
3359 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(Val: AllocaTy))
3360 V = getVectorSplat(
3361 V, NumElements: cast<FixedVectorType>(Val: AllocaVecTy)->getNumElements());
3362
3363 V = convertValue(DL, IRB, V, NewTy: AllocaTy);
3364 }
3365
3366 Value *NewPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
3367 StoreInst *New =
3368 IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: II.isVolatile());
3369 New->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3370 LLVMContext::MD_access_group});
3371 if (AATags)
3372 New->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3373 AccessTy: V->getType(), DL));
3374
3375 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
3376 Inst: New, Dest: New->getPointerOperand(), Value: V, DL);
3377
3378 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3379 return !II.isVolatile();
3380 }
3381
3382 bool visitMemTransferInst(MemTransferInst &II) {
3383 // Rewriting of memory transfer instructions can be a bit tricky. We break
3384 // them into two categories: split intrinsics and unsplit intrinsics.
3385
3386 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3387
3388 AAMDNodes AATags = II.getAAMetadata();
3389
3390 bool IsDest = &II.getRawDestUse() == OldUse;
3391 assert((IsDest && II.getRawDest() == OldPtr) ||
3392 (!IsDest && II.getRawSource() == OldPtr));
3393
3394 Align SliceAlign = getSliceAlign();
3395 // For unsplit intrinsics, we simply modify the source and destination
3396 // pointers in place. This isn't just an optimization, it is a matter of
3397 // correctness. With unsplit intrinsics we may be dealing with transfers
3398 // within a single alloca before SROA ran, or with transfers that have
3399 // a variable length. We may also be dealing with memmove instead of
3400 // memcpy, and so simply updating the pointers is all that is necessary for
3401 // us to update both source and dest of a single call.
3402 if (!IsSplittable) {
3403 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3404 if (IsDest) {
3405 // Update the address component of linked dbg.assigns.
3406 auto UpdateAssignAddress = [&](auto *DbgAssign) {
3407 if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3408 DbgAssign->getAddress() == II.getDest())
3409 DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3410 };
3411 for_each(Range: at::getAssignmentMarkers(Inst: &II), F: UpdateAssignAddress);
3412 for_each(Range: at::getDVRAssignmentMarkers(Inst: &II), F: UpdateAssignAddress);
3413 II.setDest(AdjustedPtr);
3414 II.setDestAlignment(SliceAlign);
3415 } else {
3416 II.setSource(AdjustedPtr);
3417 II.setSourceAlignment(SliceAlign);
3418 }
3419
3420 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3421 deleteIfTriviallyDead(V: OldPtr);
3422 return false;
3423 }
3424 // For split transfer intrinsics we have an incredibly useful assurance:
3425 // the source and destination do not reside within the same alloca, and at
3426 // least one of them does not escape. This means that we can replace
3427 // memmove with memcpy, and we don't need to worry about all manner of
3428 // downsides to splitting and transforming the operations.
3429
3430 // If this doesn't map cleanly onto the alloca type, and that type isn't
3431 // a single value type, just emit a memcpy.
3432 bool EmitMemCpy =
3433 !VecTy && !IntTy &&
3434 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3435 SliceSize !=
3436 DL.getTypeStoreSize(Ty: NewAI.getAllocatedType()).getFixedValue() ||
3437 !DL.typeSizeEqualsStoreSize(Ty: NewAI.getAllocatedType()) ||
3438 !NewAI.getAllocatedType()->isSingleValueType());
3439
3440 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3441 // size hasn't been shrunk based on analysis of the viable range, this is
3442 // a no-op.
3443 if (EmitMemCpy && &OldAI == &NewAI) {
3444 // Ensure the start lines up.
3445 assert(NewBeginOffset == BeginOffset);
3446
3447 // Rewrite the size as needed.
3448 if (NewEndOffset != EndOffset)
3449 II.setLength(ConstantInt::get(Ty: II.getLength()->getType(),
3450 V: NewEndOffset - NewBeginOffset));
3451 return false;
3452 }
3453 // Record this instruction for deletion.
3454 Pass.DeadInsts.push_back(Elt: &II);
3455
3456 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3457 // alloca that should be re-examined after rewriting this instruction.
3458 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3459 if (AllocaInst *AI =
3460 dyn_cast<AllocaInst>(Val: OtherPtr->stripInBoundsOffsets())) {
3461 assert(AI != &OldAI && AI != &NewAI &&
3462 "Splittable transfers cannot reach the same alloca on both ends.");
3463 Pass.Worklist.insert(X: AI);
3464 }
3465
3466 Type *OtherPtrTy = OtherPtr->getType();
3467 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3468
3469 // Compute the relative offset for the other pointer within the transfer.
3470 unsigned OffsetWidth = DL.getIndexSizeInBits(AS: OtherAS);
3471 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3472 Align OtherAlign =
3473 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3474 OtherAlign =
3475 commonAlignment(A: OtherAlign, Offset: OtherOffset.zextOrTrunc(width: 64).getZExtValue());
3476
3477 if (EmitMemCpy) {
3478 // Compute the other pointer, folding as much as possible to produce
3479 // a single, simple GEP in most cases.
3480 OtherPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
3481 NamePrefix: OtherPtr->getName() + ".");
3482
3483 Value *OurPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3484 Type *SizeTy = II.getLength()->getType();
3485 Constant *Size = ConstantInt::get(Ty: SizeTy, V: NewEndOffset - NewBeginOffset);
3486
3487 Value *DestPtr, *SrcPtr;
3488 MaybeAlign DestAlign, SrcAlign;
3489 // Note: IsDest is true iff we're copying into the new alloca slice
3490 if (IsDest) {
3491 DestPtr = OurPtr;
3492 DestAlign = SliceAlign;
3493 SrcPtr = OtherPtr;
3494 SrcAlign = OtherAlign;
3495 } else {
3496 DestPtr = OtherPtr;
3497 DestAlign = OtherAlign;
3498 SrcPtr = OurPtr;
3499 SrcAlign = SliceAlign;
3500 }
3501 CallInst *New = IRB.CreateMemCpy(Dst: DestPtr, DstAlign: DestAlign, Src: SrcPtr, SrcAlign,
3502 Size, isVolatile: II.isVolatile());
3503 if (AATags)
3504 New->setAAMetadata(AATags.shift(Offset: NewBeginOffset - BeginOffset));
3505
3506 APInt Offset(DL.getIndexTypeSizeInBits(Ty: DestPtr->getType()), 0);
3507 if (IsDest) {
3508 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8,
3509 OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
3510 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3511 Val: DestPtr->stripAndAccumulateConstantOffsets(
3512 DL, Offset, /*AllowNonInbounds*/ true))) {
3513 migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * 8,
3514 SliceSizeInBits: SliceSize * 8, OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
3515 }
3516 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3517 return false;
3518 }
3519
3520 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3521 NewEndOffset == NewAllocaEndOffset;
3522 uint64_t Size = NewEndOffset - NewBeginOffset;
3523 unsigned BeginIndex = VecTy ? getIndex(Offset: NewBeginOffset) : 0;
3524 unsigned EndIndex = VecTy ? getIndex(Offset: NewEndOffset) : 0;
3525 unsigned NumElements = EndIndex - BeginIndex;
3526 IntegerType *SubIntTy =
3527 IntTy ? Type::getIntNTy(C&: IntTy->getContext(), N: Size * 8) : nullptr;
3528
3529 // Reset the other pointer type to match the register type we're going to
3530 // use, but using the address space of the original other pointer.
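// For example (illustrative), copying 8 bytes out of a <4 x i32> promoted
// alloca loads the whole vector, extracts the covered <2 x i32> subvector,
// and stores it through the adjusted other pointer rather than emitting a
// memcpy.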
3531 Type *OtherTy;
3532 if (VecTy && !IsWholeAlloca) {
3533 if (NumElements == 1)
3534 OtherTy = VecTy->getElementType();
3535 else
3536 OtherTy = FixedVectorType::get(ElementType: VecTy->getElementType(), NumElts: NumElements);
3537 } else if (IntTy && !IsWholeAlloca) {
3538 OtherTy = SubIntTy;
3539 } else {
3540 OtherTy = NewAllocaTy;
3541 }
3542
3543 Value *AdjPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
3544 NamePrefix: OtherPtr->getName() + ".");
3545 MaybeAlign SrcAlign = OtherAlign;
3546 MaybeAlign DstAlign = SliceAlign;
3547 if (!IsDest)
3548 std::swap(a&: SrcAlign, b&: DstAlign);
3549
3550 Value *SrcPtr;
3551 Value *DstPtr;
3552
3553 if (IsDest) {
3554 DstPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
3555 SrcPtr = AdjPtr;
3556 } else {
3557 DstPtr = AdjPtr;
3558 SrcPtr = getPtrToNewAI(AddrSpace: II.getSourceAddressSpace(), IsVolatile: II.isVolatile());
3559 }
3560
3561 Value *Src;
3562 if (VecTy && !IsWholeAlloca && !IsDest) {
3563 Src = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3564 Align: NewAI.getAlign(), Name: "load");
3565 Src = extractVector(IRB, V: Src, BeginIndex, EndIndex, Name: "vec");
3566 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3567 Src = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3568 Align: NewAI.getAlign(), Name: "load");
3569 Src = convertValue(DL, IRB, V: Src, NewTy: IntTy);
3570 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3571 Src = extractInteger(DL, IRB, V: Src, Ty: SubIntTy, Offset, Name: "extract");
3572 } else {
3573 LoadInst *Load = IRB.CreateAlignedLoad(Ty: OtherTy, Ptr: SrcPtr, Align: SrcAlign,
3574 isVolatile: II.isVolatile(), Name: "copyload");
3575 Load->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3576 LLVMContext::MD_access_group});
3577 if (AATags)
3578 Load->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3579 AccessTy: Load->getType(), DL));
3580 Src = Load;
3581 }
3582
3583 if (VecTy && !IsWholeAlloca && IsDest) {
3584 Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3585 Align: NewAI.getAlign(), Name: "oldload");
3586 Src = insertVector(IRB, Old, V: Src, BeginIndex, Name: "vec");
3587 } else if (IntTy && !IsWholeAlloca && IsDest) {
3588 Value *Old = IRB.CreateAlignedLoad(Ty: NewAI.getAllocatedType(), Ptr: &NewAI,
3589 Align: NewAI.getAlign(), Name: "oldload");
3590 Old = convertValue(DL, IRB, V: Old, NewTy: IntTy);
3591 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3592 Src = insertInteger(DL, IRB, Old, V: Src, Offset, Name: "insert");
3593 Src = convertValue(DL, IRB, V: Src, NewTy: NewAllocaTy);
3594 }
3595
3596 StoreInst *Store = cast<StoreInst>(
3597 Val: IRB.CreateAlignedStore(Val: Src, Ptr: DstPtr, Align: DstAlign, isVolatile: II.isVolatile()));
3598 Store->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3599 LLVMContext::MD_access_group});
3600 if (AATags)
3601 Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3602 AccessTy: Src->getType(), DL));
3603
3604 APInt Offset(DL.getIndexTypeSizeInBits(Ty: DstPtr->getType()), 0);
3605 if (IsDest) {
3606
3607 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
3608 Inst: Store, Dest: DstPtr, Value: Src, DL);
3609 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3610 Val: DstPtr->stripAndAccumulateConstantOffsets(
3611 DL, Offset, /*AllowNonInbounds*/ true))) {
3612 migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * 8, SliceSizeInBits: SliceSize * 8,
3613 OldInst: &II, Inst: Store, Dest: DstPtr, Value: Src, DL);
3614 }
3615
3616 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3617 return !II.isVolatile();
3618 }
3619
3620 bool visitIntrinsicInst(IntrinsicInst &II) {
3621 assert((II.isLifetimeStartOrEnd() || II.isLaunderOrStripInvariantGroup() ||
3622 II.isDroppable()) &&
3623 "Unexpected intrinsic!");
3624 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3625
3626 // Record this instruction for deletion.
3627 Pass.DeadInsts.push_back(Elt: &II);
3628
3629 if (II.isDroppable()) {
3630 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3631 // TODO For now we forget assumed information, this can be improved.
3632 OldPtr->dropDroppableUsesIn(Usr&: II);
3633 return true;
3634 }
3635
3636 if (II.isLaunderOrStripInvariantGroup())
3637 return true;
3638
3639 assert(II.getArgOperand(1) == OldPtr);
3640 // Lifetime intrinsics are only promotable if they cover the whole alloca.
3641 // Therefore, we drop lifetime intrinsics which don't cover the whole
3642 // alloca.
3643 // (In theory, intrinsics which partially cover an alloca could be
3644 // promoted, but PromoteMemToReg doesn't handle that case.)
3645 // FIXME: Check whether the alloca is promotable before dropping the
3646 // lifetime intrinsics?
3647 if (NewBeginOffset != NewAllocaBeginOffset ||
3648 NewEndOffset != NewAllocaEndOffset)
3649 return true;
3650
3651 ConstantInt *Size =
3652 ConstantInt::get(Ty: cast<IntegerType>(Val: II.getArgOperand(i: 0)->getType()),
3653 V: NewEndOffset - NewBeginOffset);
3654 // Lifetime intrinsics always expect an i8* so directly get such a pointer
3655 // for the new alloca slice.
3656 Type *PointerTy = IRB.getPtrTy(AddrSpace: OldPtr->getType()->getPointerAddressSpace());
3657 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3658 Value *New;
3659 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3660 New = IRB.CreateLifetimeStart(Ptr, Size);
3661 else
3662 New = IRB.CreateLifetimeEnd(Ptr, Size);
3663
3664 (void)New;
3665 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3666
3667 return true;
3668 }
3669
3670 void fixLoadStoreAlign(Instruction &Root) {
3671 // This algorithm implements the same visitor loop as
3672 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3673 // or store found.
3674 SmallPtrSet<Instruction *, 4> Visited;
3675 SmallVector<Instruction *, 4> Uses;
3676 Visited.insert(Ptr: &Root);
3677 Uses.push_back(Elt: &Root);
3678 do {
3679 Instruction *I = Uses.pop_back_val();
3680
3681 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
3682 LI->setAlignment(std::min(a: LI->getAlign(), b: getSliceAlign()));
3683 continue;
3684 }
3685 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
3686 SI->setAlignment(std::min(a: SI->getAlign(), b: getSliceAlign()));
3687 continue;
3688 }
3689
3690 assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3691 isa<PHINode>(I) || isa<SelectInst>(I) ||
3692 isa<GetElementPtrInst>(I));
3693 for (User *U : I->users())
3694 if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
3695 Uses.push_back(Elt: cast<Instruction>(Val: U));
3696 } while (!Uses.empty());
3697 }
3698
3699 bool visitPHINode(PHINode &PN) {
3700 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
3701 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3702 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3703
3704 // We would like to compute a new pointer in only one place, but have it be
3705 // as local as possible to the PHI. To do that, we re-use the location of
3706 // the old pointer, which necessarily must be in the right position to
3707 // dominate the PHI.
3708 IRBuilderBase::InsertPointGuard Guard(IRB);
3709 if (isa<PHINode>(Val: OldPtr))
3710 IRB.SetInsertPoint(TheBB: OldPtr->getParent(),
3711 IP: OldPtr->getParent()->getFirstInsertionPt());
3712 else
3713 IRB.SetInsertPoint(OldPtr);
3714 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3715
3716 Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3717 // Replace the operands which were using the old pointer.
3718 std::replace(first: PN.op_begin(), last: PN.op_end(), old_value: cast<Value>(Val: OldPtr), new_value: NewPtr);
3719
3720 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
3721 deleteIfTriviallyDead(V: OldPtr);
3722
3723 // Fix the alignment of any loads or stores using this PHI node.
3724 fixLoadStoreAlign(Root&: PN);
3725
3726 // PHIs can't be promoted on their own, but often can be speculated. We
3727 // check the speculation outside of the rewriter so that we see the
3728 // fully-rewritten alloca.
3729 PHIUsers.insert(X: &PN);
3730 return true;
3731 }
3732
3733 bool visitSelectInst(SelectInst &SI) {
3734 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
3735 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3736 "Pointer isn't an operand!");
3737 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3738 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3739
3740 Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3741 // Replace the operands which were using the old pointer.
3742 if (SI.getOperand(i_nocapture: 1) == OldPtr)
3743 SI.setOperand(i_nocapture: 1, Val_nocapture: NewPtr);
3744 if (SI.getOperand(i_nocapture: 2) == OldPtr)
3745 SI.setOperand(i_nocapture: 2, Val_nocapture: NewPtr);
3746
3747 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
3748 deleteIfTriviallyDead(V: OldPtr);
3749
3750 // Fix the alignment of any loads or stores using this select.
3751 fixLoadStoreAlign(Root&: SI);
3752
3753 // Selects can't be promoted on their own, but often can be speculated. We
3754 // check the speculation outside of the rewriter so that we see the
3755 // fully-rewritten alloca.
3756 SelectUsers.insert(X: &SI);
3757 return true;
3758 }
3759};
3760
3761/// Visitor to rewrite aggregate loads and stores as scalar.
3762///
3763/// This pass aggressively rewrites all aggregate loads and stores on
3764/// a particular pointer (or any pointer derived from it which we can identify)
3765/// with scalar loads and stores.
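///
/// For example (illustrative), a load of a {i32, float} aggregate value is
/// rewritten into an i32 load and a float load through GEPs to the two
/// fields, recombined with insertvalue instructions.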
3766class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3767 // Befriend the base class so it can delegate to private visit methods.
3768 friend class InstVisitor<AggLoadStoreRewriter, bool>;
3769
3770 /// Queue of pointer uses to analyze and potentially rewrite.
3771 SmallVector<Use *, 8> Queue;
3772
3773 /// Set to prevent us from cycling with phi nodes and loops.
3774 SmallPtrSet<User *, 8> Visited;
3775
3776 /// The current pointer use being rewritten. This is used to dig up the used
3777 /// value (as opposed to the user).
3778 Use *U = nullptr;
3779
3780 /// Used to calculate offsets, and hence alignment, of subobjects.
3781 const DataLayout &DL;
3782
3783 IRBuilderTy &IRB;
3784
3785public:
3786 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
3787 : DL(DL), IRB(IRB) {}
3788
3789 /// Rewrite loads and stores through a pointer and all pointers derived from
3790 /// it.
3791 bool rewrite(Instruction &I) {
3792 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
3793 enqueueUsers(I);
3794 bool Changed = false;
3795 while (!Queue.empty()) {
3796 U = Queue.pop_back_val();
3797 Changed |= visit(I: cast<Instruction>(Val: U->getUser()));
3798 }
3799 return Changed;
3800 }
3801
3802private:
3803 /// Enqueue all the users of the given instruction for further processing.
3804 /// This uses a set to de-duplicate users.
3805 void enqueueUsers(Instruction &I) {
3806 for (Use &U : I.uses())
3807 if (Visited.insert(Ptr: U.getUser()).second)
3808 Queue.push_back(Elt: &U);
3809 }
3810
3811 // Conservative default is to not rewrite anything.
3812 bool visitInstruction(Instruction &I) { return false; }
3813
3814 /// Generic recursive split emission class.
3815 template <typename Derived> class OpSplitter {
3816 protected:
3817 /// The builder used to form new instructions.
3818 IRBuilderTy &IRB;
3819
3820 /// The indices to be used with insert- or extractvalue to select the
3821 /// appropriate value within the aggregate.
3822 SmallVector<unsigned, 4> Indices;
3823
3824 /// The indices to a GEP instruction which will move Ptr to the correct slot
3825 /// within the aggregate.
3826 SmallVector<Value *, 4> GEPIndices;
3827
3828 /// The base pointer of the original op, used as a base for GEPing the
3829 /// split operations.
3830 Value *Ptr;
3831
3832 /// The base pointee type being GEPed into.
3833 Type *BaseTy;
3834
3835 /// Known alignment of the base pointer.
3836 Align BaseAlign;
3837
3838 /// Used to calculate the offset of each component so we can correctly
3839 /// deduce its alignment.
3840 const DataLayout &DL;
3841
3842 /// Initialize the splitter with an insertion point, Ptr and start with a
3843 /// single zero GEP index.
3844 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3845 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
3846 : IRB(IRB), GEPIndices(1, IRB.getInt32(C: 0)), Ptr(Ptr), BaseTy(BaseTy),
3847 BaseAlign(BaseAlign), DL(DL) {
3848 IRB.SetInsertPoint(InsertionPoint);
3849 }
3850
3851 public:
3852 /// Generic recursive split emission routine.
3853 ///
3854 /// This method recursively splits an aggregate op (load or store) into
3855 /// scalar or vector ops. It splits recursively until it hits a single value
3856 /// and emits that single value operation via the template argument.
3857 ///
3858 /// The logic of this routine relies on GEPs and insertvalue and
3859 /// extractvalue all operating with the same fundamental index list, merely
3860 /// formatted differently (GEPs need actual values).
3861 ///
3862 /// \param Ty The type being split recursively into smaller ops.
3863 /// \param Agg The aggregate value being built up or stored, depending on
3864 /// whether this is splitting a load or a store respectively.
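///
/// For example (illustrative), splitting an op over { i32, [2 x float] }
/// emits leaf operations for the index lists {0}, {1, 0} and {1, 1}, each
/// addressed through a GEP of the form (0, <indices...>) off the base
/// pointer.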
3865 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
3866 if (Ty->isSingleValueType()) {
3867 unsigned Offset = DL.getIndexedOffsetInType(ElemTy: BaseTy, Indices: GEPIndices);
3868 return static_cast<Derived *>(this)->emitFunc(
3869 Ty, Agg, commonAlignment(A: BaseAlign, Offset), Name);
3870 }
3871
3872 if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) {
3873 unsigned OldSize = Indices.size();
3874 (void)OldSize;
3875 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
3876 ++Idx) {
3877 assert(Indices.size() == OldSize && "Did not return to the old size");
3878 Indices.push_back(Elt: Idx);
3879 GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
3880 emitSplitOps(Ty: ATy->getElementType(), Agg, Name: Name + "." + Twine(Idx));
3881 GEPIndices.pop_back();
3882 Indices.pop_back();
3883 }
3884 return;
3885 }
3886
3887 if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
3888 unsigned OldSize = Indices.size();
3889 (void)OldSize;
3890 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
3891 ++Idx) {
3892 assert(Indices.size() == OldSize && "Did not return to the old size");
3893 Indices.push_back(Elt: Idx);
3894 GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
3895 emitSplitOps(Ty: STy->getElementType(N: Idx), Agg, Name: Name + "." + Twine(Idx));
3896 GEPIndices.pop_back();
3897 Indices.pop_back();
3898 }
3899 return;
3900 }
3901
3902 llvm_unreachable("Only arrays and structs are aggregate loadable types");
3903 }
3904 };
3905
3906 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
3907 AAMDNodes AATags;
3908 // A vector to hold the split components that we want to emit
3909 // separate fake uses for.
3910 SmallVector<Value *, 4> Components;
3911 // A vector to hold all the fake uses of the struct that we are splitting.
3912 // Usually there should only be one, but we are handling the general case.
3913 SmallVector<Instruction *, 1> FakeUses;
3914
3915 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3916 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
3917 IRBuilderTy &IRB)
3918 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
3919 IRB),
3920 AATags(AATags) {}
3921
3922 /// Emit a leaf load of a single value. This is called at the leaves of the
3923 /// recursive emission to actually load values.
3924 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3925 assert(Ty->isSingleValueType());
3926 // Load the single value and insert it using the indices.
3927 Value *GEP =
3928 IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
3929 LoadInst *Load =
3930 IRB.CreateAlignedLoad(Ty, Ptr: GEP, Align: Alignment, Name: Name + ".load");
3931
3932 APInt Offset(
3933 DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), 0);
3934 if (AATags &&
3935 GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset))
3936 Load->setAAMetadata(
3937 AATags.adjustForAccess(Offset: Offset.getZExtValue(), AccessTy: Load->getType(), DL));
3938 // Record the load so we can generate a fake use for this aggregate
3939 // component.
3940 Components.push_back(Elt: Load);
3941
3942 Agg = IRB.CreateInsertValue(Agg, Val: Load, Idxs: Indices, Name: Name + ".insert");
3943 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
3944 }
3945
3946 // Stash the fake uses that use the value generated by this instruction.
3947 void recordFakeUses(LoadInst &LI) {
3948 for (Use &U : LI.uses())
3949 if (auto *II = dyn_cast<IntrinsicInst>(Val: U.getUser()))
3950 if (II->getIntrinsicID() == Intrinsic::fake_use)
3951 FakeUses.push_back(Elt: II);
3952 }
3953
3954 // Replace all fake uses of the aggregate with a series of fake uses, one
3955 // for each split component.
3956 void emitFakeUses() {
3957 for (Instruction *I : FakeUses) {
3958 IRB.SetInsertPoint(I);
3959 for (auto *V : Components)
3960 IRB.CreateIntrinsic(ID: Intrinsic::fake_use, Args: {V});
3961 I->eraseFromParent();
3962 }
3963 }
3964 };
3965
3966 bool visitLoadInst(LoadInst &LI) {
3967 assert(LI.getPointerOperand() == *U);
3968 if (!LI.isSimple() || LI.getType()->isSingleValueType())
3969 return false;
3970
3971 // We have an aggregate being loaded, split it apart.
3972 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
3973 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
3974 getAdjustedAlignment(I: &LI, Offset: 0), DL, IRB);
3975 Splitter.recordFakeUses(LI);
3976 Value *V = PoisonValue::get(T: LI.getType());
3977 Splitter.emitSplitOps(Ty: LI.getType(), Agg&: V, Name: LI.getName() + ".fca");
3978 Splitter.emitFakeUses();
3979 Visited.erase(Ptr: &LI);
3980 LI.replaceAllUsesWith(V);
3981 LI.eraseFromParent();
3982 return true;
3983 }
3984
3985 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
3986 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3987 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
3988 const DataLayout &DL, IRBuilderTy &IRB)
3989 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3990 DL, IRB),
3991 AATags(AATags), AggStore(AggStore) {}
3992 AAMDNodes AATags;
3993 StoreInst *AggStore;
3994 /// Emit a leaf store of a single value. This is called at the leaves of the
3995 /// recursive emission to actually produce stores.
3996 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3997 assert(Ty->isSingleValueType());
3998 // Extract the single value and store it using the indices.
3999 //
4000 // The gep and extractvalue values are factored out of the CreateStore
4001 // call to make the output independent of the argument evaluation order.
4002 Value *ExtractValue =
4003 IRB.CreateExtractValue(Agg, Idxs: Indices, Name: Name + ".extract");
4004 Value *InBoundsGEP =
4005 IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
4006 StoreInst *Store =
4007 IRB.CreateAlignedStore(Val: ExtractValue, Ptr: InBoundsGEP, Align: Alignment);
4008
4009 APInt Offset(
4010 DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), 0);
4011 GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset);
4012 if (AATags) {
4013 Store->setAAMetadata(AATags.adjustForAccess(
4014 Offset: Offset.getZExtValue(), AccessTy: ExtractValue->getType(), DL));
4015 }
4016
4017 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4018 // If we cannot (because there's an intervening non-const or unbounded
4019 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4020 // this instruction.
4021 Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4022 if (auto *OldAI = dyn_cast<AllocaInst>(Val: Base)) {
4023 uint64_t SizeInBits =
4024 DL.getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
4025 migrateDebugInfo(OldAlloca: OldAI, /*IsSplit*/ true, OldAllocaOffsetInBits: Offset.getZExtValue() * 8,
4026 SliceSizeInBits: SizeInBits, OldInst: AggStore, Inst: Store,
4027 Dest: Store->getPointerOperand(), Value: Store->getValueOperand(),
4028 DL);
4029 } else {
4030 assert(at::getAssignmentMarkers(Store).empty() &&
4031 at::getDVRAssignmentMarkers(Store).empty() &&
4032 "AT: unexpected debug.assign linked to store through "
4033 "unbounded GEP");
4034 }
4035 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4036 }
4037 };
4038
4039 bool visitStoreInst(StoreInst &SI) {
4040 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4041 return false;
4042 Value *V = SI.getValueOperand();
4043 if (V->getType()->isSingleValueType())
4044 return false;
4045
4046 // We have an aggregate being stored, split it apart.
4047 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4048 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4049 getAdjustedAlignment(I: &SI, Offset: 0), DL, IRB);
4050 Splitter.emitSplitOps(Ty: V->getType(), Agg&: V, Name: V->getName() + ".fca");
4051 Visited.erase(Ptr: &SI);
4052 // The stores replacing SI each have markers describing fragments of the
4053 // assignment so delete the assignment markers linked to SI.
4054 at::deleteAssignmentMarkers(Inst: &SI);
4055 SI.eraseFromParent();
4056 return true;
4057 }
4058
4059 bool visitBitCastInst(BitCastInst &BC) {
4060 enqueueUsers(I&: BC);
4061 return false;
4062 }
4063
4064 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4065 enqueueUsers(I&: ASC);
4066 return false;
4067 }
4068
4069 // Unfold gep (select cond, ptr1, ptr2), idx
4070 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4071 // and gep ptr, (select cond, idx1, idx2)
4072 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
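//
// For example (illustrative IR), given
//   %sel = select i1 %c, ptr %a, ptr %b
//   %gep = getelementptr i32, ptr %sel, i64 1
// the GEP is rewritten to roughly
//   %a.sroa.gep = getelementptr i32, ptr %a, i64 1
//   %b.sroa.gep = getelementptr i32, ptr %b, i64 1
//   %sel.sroa.sel = select i1 %c, ptr %a.sroa.gep, ptr %b.sroa.gep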
4073 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4074 // Check whether the GEP has exactly one select operand and all indices
4075 // will become constant after the transform.
4076 SelectInst *Sel = dyn_cast<SelectInst>(Val: GEPI.getPointerOperand());
4077 for (Value *Op : GEPI.indices()) {
4078 if (auto *SI = dyn_cast<SelectInst>(Val: Op)) {
4079 if (Sel)
4080 return false;
4081
4082 Sel = SI;
4083 if (!isa<ConstantInt>(Val: Sel->getTrueValue()) ||
4084 !isa<ConstantInt>(Val: Sel->getFalseValue()))
4085 return false;
4086 continue;
4087 }
4088
4089 if (!isa<ConstantInt>(Val: Op))
4090 return false;
4091 }
4092
4093 if (!Sel)
4094 return false;
4095
4096 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4097 dbgs() << " original: " << *Sel << "\n";
4098 dbgs() << " " << GEPI << "\n";);
4099
4100 auto GetNewOps = [&](Value *SelOp) {
4101 SmallVector<Value *> NewOps;
4102 for (Value *Op : GEPI.operands())
4103 if (Op == Sel)
4104 NewOps.push_back(Elt: SelOp);
4105 else
4106 NewOps.push_back(Elt: Op);
4107 return NewOps;
4108 };
4109
4110 Value *True = Sel->getTrueValue();
4111 Value *False = Sel->getFalseValue();
4112 SmallVector<Value *> TrueOps = GetNewOps(True);
4113 SmallVector<Value *> FalseOps = GetNewOps(False);
4114
4115 IRB.SetInsertPoint(&GEPI);
4116 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4117
4118 Type *Ty = GEPI.getSourceElementType();
4119 Value *NTrue = IRB.CreateGEP(Ty, Ptr: TrueOps[0], IdxList: ArrayRef(TrueOps).drop_front(),
4120 Name: True->getName() + ".sroa.gep", NW);
4121
4122 Value *NFalse =
4123 IRB.CreateGEP(Ty, Ptr: FalseOps[0], IdxList: ArrayRef(FalseOps).drop_front(),
4124 Name: False->getName() + ".sroa.gep", NW);
4125
4126 Value *NSel = IRB.CreateSelect(C: Sel->getCondition(), True: NTrue, False: NFalse,
4127 Name: Sel->getName() + ".sroa.sel");
4128 Visited.erase(Ptr: &GEPI);
4129 GEPI.replaceAllUsesWith(V: NSel);
4130 GEPI.eraseFromParent();
4131 Instruction *NSelI = cast<Instruction>(Val: NSel);
4132 Visited.insert(Ptr: NSelI);
4133 enqueueUsers(I&: *NSelI);
4134
4135 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4136 dbgs() << " " << *NFalse << "\n";
4137 dbgs() << " " << *NSel << "\n";);
4138
4139 return true;
4140 }
4141
4142 // Unfold gep (phi ptr1, ptr2), idx
4143 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4144 // and gep ptr, (phi idx1, idx2)
4145 // => phi ((gep ptr, idx1), (gep ptr, idx2))
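//
// As with the select case above, this conceptually turns a GEP of a pointer
// phi into a new phi whose incoming values are GEPs emitted at the end of
// the entry block (which is why only arguments, constants, and static
// allocas are accepted as incoming pointers).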
4146 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4147 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4148 // operand (looking through the phi if it is the phi we want to unfold) is
4149 // an instruction besides a static alloca.
4150 PHINode *Phi = dyn_cast<PHINode>(Val: GEPI.getPointerOperand());
4151 auto IsInvalidPointerOperand = [](Value *V) {
4152 if (!isa<Instruction>(Val: V))
4153 return false;
4154 if (auto *AI = dyn_cast<AllocaInst>(Val: V))
4155 return !AI->isStaticAlloca();
4156 return true;
4157 };
4158 if (Phi) {
4159 if (any_of(Range: Phi->operands(), P: IsInvalidPointerOperand))
4160 return false;
4161 } else {
4162 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4163 return false;
4164 }
4165 // Check whether the GEP has exactly one phi operand (including the pointer
4166 // operand) and all indices will become constant after the transform.
4167 for (Value *Op : GEPI.indices()) {
4168 if (auto *SI = dyn_cast<PHINode>(Val: Op)) {
4169 if (Phi)
4170 return false;
4171
4172 Phi = SI;
4173 if (!all_of(Range: Phi->incoming_values(),
4174 P: [](Value *V) { return isa<ConstantInt>(Val: V); }))
4175 return false;
4176 continue;
4177 }
4178
4179 if (!isa<ConstantInt>(Val: Op))
4180 return false;
4181 }
4182
4183 if (!Phi)
4184 return false;
4185
4186 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4187 dbgs() << " original: " << *Phi << "\n";
4188 dbgs() << " " << GEPI << "\n";);
4189
4190 auto GetNewOps = [&](Value *PhiOp) {
4191 SmallVector<Value *> NewOps;
4192 for (Value *Op : GEPI.operands())
4193 if (Op == Phi)
4194 NewOps.push_back(Elt: PhiOp);
4195 else
4196 NewOps.push_back(Elt: Op);
4197 return NewOps;
4198 };
4199
4200 IRB.SetInsertPoint(Phi);
4201 PHINode *NewPhi = IRB.CreatePHI(Ty: GEPI.getType(), NumReservedValues: Phi->getNumIncomingValues(),
4202 Name: Phi->getName() + ".sroa.phi");
4203
4204 Type *SourceTy = GEPI.getSourceElementType();
4205 // We only handle arguments, constants, and static allocas here, so we can
4206 // insert GEPs at the end of the entry block.
4207 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4208 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4209 Value *Op = Phi->getIncomingValue(i: I);
4210 BasicBlock *BB = Phi->getIncomingBlock(i: I);
4211 Value *NewGEP;
4212 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4213 NewGEP = NewPhi->getIncomingValue(i: NI);
4214 } else {
4215 SmallVector<Value *> NewOps = GetNewOps(Op);
4216 NewGEP =
4217 IRB.CreateGEP(Ty: SourceTy, Ptr: NewOps[0], IdxList: ArrayRef(NewOps).drop_front(),
4218 Name: Phi->getName() + ".sroa.gep", NW: GEPI.getNoWrapFlags());
4219 }
4220 NewPhi->addIncoming(V: NewGEP, BB);
4221 }
4222
4223 Visited.erase(Ptr: &GEPI);
4224 GEPI.replaceAllUsesWith(V: NewPhi);
4225 GEPI.eraseFromParent();
4226 Visited.insert(Ptr: NewPhi);
4227 enqueueUsers(I&: *NewPhi);
4228
4229 LLVM_DEBUG(dbgs() << " to: ";
4230 for (Value *In
4231 : NewPhi->incoming_values()) dbgs()
4232 << "\n " << *In;
4233 dbgs() << "\n " << *NewPhi << '\n');
4234
4235 return true;
4236 }
4237
4238 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4239 if (unfoldGEPSelect(GEPI))
4240 return true;
4241
4242 if (unfoldGEPPhi(GEPI))
4243 return true;
4244
4245 enqueueUsers(I&: GEPI);
4246 return false;
4247 }
4248
4249 bool visitPHINode(PHINode &PN) {
4250 enqueueUsers(I&: PN);
4251 return false;
4252 }
4253
4254 bool visitSelectInst(SelectInst &SI) {
4255 enqueueUsers(I&: SI);
4256 return false;
4257 }
4258};
4259
4260} // end anonymous namespace
4261
4262/// Strip aggregate type wrapping.
4263///
4264/// This removes no-op aggregate types wrapping an underlying type. It will
4265/// strip as many layers of types as it can without changing either the type
4266/// size or the allocated size.
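///
/// For example (illustrative): a wrapper such as { { float } } or [1 x i64]
/// strips down to the inner float or i64, while { i32, i32 } is returned
/// unchanged because no single element covers the whole size.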
4267static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4268 if (Ty->isSingleValueType())
4269 return Ty;
4270
4271 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4272 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4273
4274 Type *InnerTy;
4275 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Val: Ty)) {
4276 InnerTy = ArrTy->getElementType();
4277 } else if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
4278 const StructLayout *SL = DL.getStructLayout(Ty: STy);
4279 unsigned Index = SL->getElementContainingOffset(FixedOffset: 0);
4280 InnerTy = STy->getElementType(N: Index);
4281 } else {
4282 return Ty;
4283 }
4284
4285 if (AllocSize > DL.getTypeAllocSize(Ty: InnerTy).getFixedValue() ||
4286 TypeSize > DL.getTypeSizeInBits(Ty: InnerTy).getFixedValue())
4287 return Ty;
4288
4289 return stripAggregateTypeWrapping(DL, Ty: InnerTy);
4290}
4291
4292/// Try to find a partition of the aggregate type passed in for a given
4293/// offset and size.
4294///
4295/// This recurses through the aggregate type and tries to compute a subtype
4296/// based on the offset and size. When the offset and size span a sub-section
4297/// of an array, it will even compute a new array type for that sub-section,
4298/// and the same for structs.
4299///
4300/// Note that this routine is very strict and tries to find a partition of the
4301/// type which produces the *exact* right offset and size. It is not forgiving
4302 /// when the size or offset causes either end of the type-based partition to be off.
4303/// Also, this is a best-effort routine. It is reasonable to give up and not
4304/// return a type if necessary.
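///
/// As an illustration (assuming a typical data layout): for [4 x i32] with
/// Offset 8 and Size 8 this returns [2 x i32]; for { i32, i32, i64 } with
/// Offset 0 and Size 8 it returns the sub-struct { i32, i32 }; an offset or
/// size that lands in the middle of an element yields nullptr instead.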
4305static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4306 uint64_t Size) {
4307 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4308 return stripAggregateTypeWrapping(DL, Ty);
4309 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4310 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4311 return nullptr;
4312
4313 if (isa<ArrayType>(Val: Ty) || isa<VectorType>(Val: Ty)) {
4314 Type *ElementTy;
4315 uint64_t TyNumElements;
4316 if (auto *AT = dyn_cast<ArrayType>(Val: Ty)) {
4317 ElementTy = AT->getElementType();
4318 TyNumElements = AT->getNumElements();
4319 } else {
4320 // FIXME: This isn't right for vectors with non-byte-sized or
4321 // non-power-of-two sized elements.
4322 auto *VT = cast<FixedVectorType>(Val: Ty);
4323 ElementTy = VT->getElementType();
4324 TyNumElements = VT->getNumElements();
4325 }
4326 uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
4327 uint64_t NumSkippedElements = Offset / ElementSize;
4328 if (NumSkippedElements >= TyNumElements)
4329 return nullptr;
4330 Offset -= NumSkippedElements * ElementSize;
4331
4332 // First check if we need to recurse.
4333 if (Offset > 0 || Size < ElementSize) {
4334 // Bail if the partition ends in a different array element.
4335 if ((Offset + Size) > ElementSize)
4336 return nullptr;
4337 // Recurse through the element type trying to peel off offset bytes.
4338 return getTypePartition(DL, Ty: ElementTy, Offset, Size);
4339 }
4340 assert(Offset == 0);
4341
4342 if (Size == ElementSize)
4343 return stripAggregateTypeWrapping(DL, Ty: ElementTy);
4344 assert(Size > ElementSize);
4345 uint64_t NumElements = Size / ElementSize;
4346 if (NumElements * ElementSize != Size)
4347 return nullptr;
4348 return ArrayType::get(ElementType: ElementTy, NumElements);
4349 }
4350
4351 StructType *STy = dyn_cast<StructType>(Val: Ty);
4352 if (!STy)
4353 return nullptr;
4354
4355 const StructLayout *SL = DL.getStructLayout(Ty: STy);
4356
4357 if (SL->getSizeInBits().isScalable())
4358 return nullptr;
4359
4360 if (Offset >= SL->getSizeInBytes())
4361 return nullptr;
4362 uint64_t EndOffset = Offset + Size;
4363 if (EndOffset > SL->getSizeInBytes())
4364 return nullptr;
4365
4366 unsigned Index = SL->getElementContainingOffset(FixedOffset: Offset);
4367 Offset -= SL->getElementOffset(Idx: Index);
4368
4369 Type *ElementTy = STy->getElementType(N: Index);
4370 uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
4371 if (Offset >= ElementSize)
4372 return nullptr; // The offset points into alignment padding.
4373
4374 // See if any partition must be contained by the element.
4375 if (Offset > 0 || Size < ElementSize) {
4376 if ((Offset + Size) > ElementSize)
4377 return nullptr;
4378 return getTypePartition(DL, Ty: ElementTy, Offset, Size);
4379 }
4380 assert(Offset == 0);
4381
4382 if (Size == ElementSize)
4383 return stripAggregateTypeWrapping(DL, Ty: ElementTy);
4384
4385 StructType::element_iterator EI = STy->element_begin() + Index,
4386 EE = STy->element_end();
4387 if (EndOffset < SL->getSizeInBytes()) {
4388 unsigned EndIndex = SL->getElementContainingOffset(FixedOffset: EndOffset);
4389 if (Index == EndIndex)
4390 return nullptr; // Within a single element and its padding.
4391
4392 // Don't try to form "natural" types if the elements don't line up with the
4393 // expected size.
4394 // FIXME: We could potentially recurse down through the last element in the
4395 // sub-struct to find a natural end point.
4396 if (SL->getElementOffset(Idx: EndIndex) != EndOffset)
4397 return nullptr;
4398
4399 assert(Index < EndIndex);
4400 EE = STy->element_begin() + EndIndex;
4401 }
4402
4403 // Try to build up a sub-structure.
4404 StructType *SubTy =
4405 StructType::get(Context&: STy->getContext(), Elements: ArrayRef(EI, EE), isPacked: STy->isPacked());
4406 const StructLayout *SubSL = DL.getStructLayout(Ty: SubTy);
4407 if (Size != SubSL->getSizeInBytes())
4408 return nullptr; // The sub-struct doesn't have quite the size needed.
4409
4410 return SubTy;
4411}
4412
4413/// Pre-split loads and stores to simplify rewriting.
4414///
4415/// We want to break up the splittable load+store pairs as much as
4416/// possible. This is important to do as a preprocessing step, as once we
4417/// start rewriting the accesses to partitions of the alloca we lose the
4418/// necessary information to correctly split apart paired loads and stores
4419/// which both point into this alloca. The case to consider is something like
4420/// the following:
4421///
4422/// %a = alloca [12 x i8]
4423/// %gep1 = getelementptr i8, ptr %a, i32 0
4424/// %gep2 = getelementptr i8, ptr %a, i32 4
4425/// %gep3 = getelementptr i8, ptr %a, i32 8
4426/// store float 0.0, ptr %gep1
4427/// store float 1.0, ptr %gep2
4428/// %v = load i64, ptr %gep1
4429/// store i64 %v, ptr %gep2
4430/// %f1 = load float, ptr %gep2
4431/// %f2 = load float, ptr %gep3
4432///
4433/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4434/// promote everything so we recover the 2 SSA values that should have been
4435/// there all along.
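///
/// As a sketch of the intended outcome for the example above: the i64 load of
/// %gep1 covers bytes [0, 8) and crosses the partition boundary at offset 4,
/// so it is pre-split into two 4-byte (i32) pieces; the paired i64 store to
/// %gep2 is split at the same relative offset so the pieces line up.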
4436///
4437/// \returns true if any changes are made.
4438bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4439 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4440
4441 // Track the loads and stores which are candidates for pre-splitting here, in
4442 // the order they first appear during the partition scan. These give stable
4443 // iteration order and a basis for tracking which loads and stores we
4444 // actually split.
4445 SmallVector<LoadInst *, 4> Loads;
4446 SmallVector<StoreInst *, 4> Stores;
4447
4448 // We need to accumulate the splits required of each load or store where we
4449 // can find them via a direct lookup. This is important to cross-check loads
4450 // and stores against each other. We also track the slice so that we can kill
4451 // all the slices that end up split.
4452 struct SplitOffsets {
4453 Slice *S;
4454 std::vector<uint64_t> Splits;
4455 };
4456 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4457
4458 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4459 // This is important as we also cannot pre-split stores of those loads!
4460 // FIXME: This is all pretty gross. It means that we can be more aggressive
4461 // in pre-splitting when the load feeding the store happens to come from
4462 // a separate alloca. Put another way, the effectiveness of SROA would be
4463 // decreased by a frontend which just concatenated all of its local allocas
4464 // into one big flat alloca. But defeating such patterns is exactly the job
4465   // SROA is tasked with! Sadly, to not have this discrepancy we would have to
4466 // change store pre-splitting to actually force pre-splitting of the load
4467 // that feeds it *and all stores*. That makes pre-splitting much harder, but
4468 // maybe it would make it more principled?
4469 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4470
4471 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4472 for (auto &P : AS.partitions()) {
4473 for (Slice &S : P) {
4474 Instruction *I = cast<Instruction>(Val: S.getUse()->getUser());
4475 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4476 // If this is a load we have to track that it can't participate in any
4477 // pre-splitting. If this is a store of a load we have to track that
4478 // that load also can't participate in any pre-splitting.
4479 if (auto *LI = dyn_cast<LoadInst>(Val: I))
4480 UnsplittableLoads.insert(Ptr: LI);
4481 else if (auto *SI = dyn_cast<StoreInst>(Val: I))
4482 if (auto *LI = dyn_cast<LoadInst>(Val: SI->getValueOperand()))
4483 UnsplittableLoads.insert(Ptr: LI);
4484 continue;
4485 }
4486 assert(P.endOffset() > S.beginOffset() &&
4487 "Empty or backwards partition!");
4488
4489 // Determine if this is a pre-splittable slice.
4490 if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
4491 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4492
4493 // The load must be used exclusively to store into other pointers for
4494 // us to be able to arbitrarily pre-split it. The stores must also be
4495 // simple to avoid changing semantics.
4496 auto IsLoadSimplyStored = [](LoadInst *LI) {
4497 for (User *LU : LI->users()) {
4498 auto *SI = dyn_cast<StoreInst>(Val: LU);
4499 if (!SI || !SI->isSimple())
4500 return false;
4501 }
4502 return true;
4503 };
4504 if (!IsLoadSimplyStored(LI)) {
4505 UnsplittableLoads.insert(Ptr: LI);
4506 continue;
4507 }
4508
4509 Loads.push_back(Elt: LI);
4510 } else if (auto *SI = dyn_cast<StoreInst>(Val: I)) {
4511 if (S.getUse() != &SI->getOperandUse(i: SI->getPointerOperandIndex()))
4512 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4513 continue;
4514 auto *StoredLoad = dyn_cast<LoadInst>(Val: SI->getValueOperand());
4515 if (!StoredLoad || !StoredLoad->isSimple())
4516 continue;
4517 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4518
4519 Stores.push_back(Elt: SI);
4520 } else {
4521 // Other uses cannot be pre-split.
4522 continue;
4523 }
4524
4525 // Record the initial split.
4526 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4527 auto &Offsets = SplitOffsetsMap[I];
4528 assert(Offsets.Splits.empty() &&
4529 "Should not have splits the first time we see an instruction!");
4530 Offsets.S = &S;
4531 Offsets.Splits.push_back(x: P.endOffset() - S.beginOffset());
4532 }
4533
4534 // Now scan the already split slices, and add a split for any of them which
4535 // we're going to pre-split.
4536 for (Slice *S : P.splitSliceTails()) {
4537 auto SplitOffsetsMapI =
4538 SplitOffsetsMap.find(Val: cast<Instruction>(Val: S->getUse()->getUser()));
4539 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4540 continue;
4541 auto &Offsets = SplitOffsetsMapI->second;
4542
4543 assert(Offsets.S == S && "Found a mismatched slice!");
4544 assert(!Offsets.Splits.empty() &&
4545 "Cannot have an empty set of splits on the second partition!");
4546 assert(Offsets.Splits.back() ==
4547 P.beginOffset() - Offsets.S->beginOffset() &&
4548 "Previous split does not end where this one begins!");
4549
4550 // Record each split. The last partition's end isn't needed as the size
4551 // of the slice dictates that.
4552 if (S->endOffset() > P.endOffset())
4553 Offsets.Splits.push_back(x: P.endOffset() - Offsets.S->beginOffset());
4554 }
4555 }
4556
4557 // We may have split loads where some of their stores are split stores. For
4558 // such loads and stores, we can only pre-split them if their splits exactly
4559 // match relative to their starting offset. We have to verify this prior to
4560 // any rewriting.
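  //
  // For instance (hypothetical offsets): a load split at relative offsets {4}
  // cannot feed a store that would have to be split at {2, 6}; both are
  // dropped from the pre-splitting candidates and the load is marked
  // unsplittable.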
4561 llvm::erase_if(C&: Stores, P: [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4562 // Lookup the load we are storing in our map of split
4563 // offsets.
4564 auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4565 // If it was completely unsplittable, then we're done,
4566 // and this store can't be pre-split.
4567 if (UnsplittableLoads.count(Ptr: LI))
4568 return true;
4569
4570 auto LoadOffsetsI = SplitOffsetsMap.find(Val: LI);
4571 if (LoadOffsetsI == SplitOffsetsMap.end())
4572 return false; // Unrelated loads are definitely safe.
4573 auto &LoadOffsets = LoadOffsetsI->second;
4574
4575 // Now lookup the store's offsets.
4576 auto &StoreOffsets = SplitOffsetsMap[SI];
4577
4578 // If the relative offsets of each split in the load and
4579 // store match exactly, then we can split them and we
4580 // don't need to remove them here.
4581 if (LoadOffsets.Splits == StoreOffsets.Splits)
4582 return false;
4583
4584 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4585 << " " << *LI << "\n"
4586 << " " << *SI << "\n");
4587
4588 // We've found a store and load that we need to split
4589 // with mismatched relative splits. Just give up on them
4590 // and remove both instructions from our list of
4591 // candidates.
4592 UnsplittableLoads.insert(Ptr: LI);
4593 return true;
4594 });
4595 // Now we have to go *back* through all the stores, because a later store may
4596 // have caused an earlier store's load to become unsplittable and if it is
4597 // unsplittable for the later store, then we can't rely on it being split in
4598 // the earlier store either.
4599 llvm::erase_if(C&: Stores, P: [&UnsplittableLoads](StoreInst *SI) {
4600 auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4601 return UnsplittableLoads.count(Ptr: LI);
4602 });
4603 // Once we've established all the loads that can't be split for some reason,
4604 // filter any that made it into our list out.
4605 llvm::erase_if(C&: Loads, P: [&UnsplittableLoads](LoadInst *LI) {
4606 return UnsplittableLoads.count(Ptr: LI);
4607 });
4608
4609 // If no loads or stores are left, there is no pre-splitting to be done for
4610 // this alloca.
4611 if (Loads.empty() && Stores.empty())
4612 return false;
4613
4614 // From here on, we can't fail and will be building new accesses, so rig up
4615 // an IR builder.
4616 IRBuilderTy IRB(&AI);
4617
4618 // Collect the new slices which we will merge into the alloca slices.
4619 SmallVector<Slice, 4> NewSlices;
4620
4621 // Track any allocas we end up splitting loads and stores for so we iterate
4622 // on them.
4623 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4624
4625 // At this point, we have collected all of the loads and stores we can
4626 // pre-split, and the specific splits needed for them. We actually do the
4627   // splitting in a specific order so we can handle the case when one of the
4628   // loads is the value operand to one of the stores.
4629 //
4630 // First, we rewrite all of the split loads, and just accumulate each split
4631 // load in a parallel structure. We also build the slices for them and append
4632 // them to the alloca slices.
4633 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4634 std::vector<LoadInst *> SplitLoads;
4635 const DataLayout &DL = AI.getDataLayout();
4636 for (LoadInst *LI : Loads) {
4637 SplitLoads.clear();
4638
4639 auto &Offsets = SplitOffsetsMap[LI];
4640 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4641 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4642 "Load must have type size equal to store size");
4643 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4644 "Load must be >= slice size");
4645
4646 uint64_t BaseOffset = Offsets.S->beginOffset();
4647 assert(BaseOffset + SliceSize > BaseOffset &&
4648 "Cannot represent alloca access size using 64-bit integers!");
4649
4650 Instruction *BasePtr = cast<Instruction>(Val: LI->getPointerOperand());
4651 IRB.SetInsertPoint(LI);
4652
4653 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
4654
4655 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4656 int Idx = 0, Size = Offsets.Splits.size();
4657 for (;;) {
4658 auto *PartTy = Type::getIntNTy(C&: LI->getContext(), N: PartSize * 8);
4659 auto AS = LI->getPointerAddressSpace();
4660 auto *PartPtrTy = LI->getPointerOperandType();
4661 LoadInst *PLoad = IRB.CreateAlignedLoad(
4662 Ty: PartTy,
4663 Ptr: getAdjustedPtr(IRB, DL, Ptr: BasePtr,
4664 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
4665 PointerTy: PartPtrTy, NamePrefix: BasePtr->getName() + "."),
4666 Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
4667 /*IsVolatile*/ isVolatile: false, Name: LI->getName());
4668 PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4669 LLVMContext::MD_access_group});
4670
4671 // Append this load onto the list of split loads so we can find it later
4672 // to rewrite the stores.
4673 SplitLoads.push_back(x: PLoad);
4674
4675 // Now build a new slice for the alloca.
4676 NewSlices.push_back(
4677 Elt: Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4678 &PLoad->getOperandUse(i: PLoad->getPointerOperandIndex()),
4679 /*IsSplittable*/ false));
4680 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4681 << ", " << NewSlices.back().endOffset()
4682 << "): " << *PLoad << "\n");
4683
4684 // See if we've handled all the splits.
4685 if (Idx >= Size)
4686 break;
4687
4688 // Setup the next partition.
4689 PartOffset = Offsets.Splits[Idx];
4690 ++Idx;
4691 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4692 }
4693
4694 // Now that we have the split loads, do the slow walk over all uses of the
4695 // load and rewrite them as split stores, or save the split loads to use
4696 // below if the store is going to be split there anyways.
4697 bool DeferredStores = false;
4698 for (User *LU : LI->users()) {
4699 StoreInst *SI = cast<StoreInst>(Val: LU);
4700 if (!Stores.empty() && SplitOffsetsMap.count(Val: SI)) {
4701 DeferredStores = true;
4702 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
4703 << "\n");
4704 continue;
4705 }
4706
4707 Value *StoreBasePtr = SI->getPointerOperand();
4708 IRB.SetInsertPoint(SI);
4709 AAMDNodes AATags = SI->getAAMetadata();
4710
4711 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
4712
4713 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4714 LoadInst *PLoad = SplitLoads[Idx];
4715 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4716 auto *PartPtrTy = SI->getPointerOperandType();
4717
4718 auto AS = SI->getPointerAddressSpace();
4719 StoreInst *PStore = IRB.CreateAlignedStore(
4720 Val: PLoad,
4721 Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
4722 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
4723 PointerTy: PartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
4724 Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
4725 /*IsVolatile*/ isVolatile: false);
4726 PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4727 LLVMContext::MD_access_group,
4728 LLVMContext::MD_DIAssignID});
4729
4730 if (AATags)
4731 PStore->setAAMetadata(
4732 AATags.adjustForAccess(Offset: PartOffset, AccessTy: PLoad->getType(), DL));
4733 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
4734 }
4735
4736 // We want to immediately iterate on any allocas impacted by splitting
4737 // this store, and we have to track any promotable alloca (indicated by
4738 // a direct store) as needing to be resplit because it is no longer
4739 // promotable.
4740 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: StoreBasePtr)) {
4741 ResplitPromotableAllocas.insert(Ptr: OtherAI);
4742 Worklist.insert(X: OtherAI);
4743 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4744 Val: StoreBasePtr->stripInBoundsOffsets())) {
4745 Worklist.insert(X: OtherAI);
4746 }
4747
4748 // Mark the original store as dead.
4749 DeadInsts.push_back(Elt: SI);
4750 }
4751
4752 // Save the split loads if there are deferred stores among the users.
4753 if (DeferredStores)
4754 SplitLoadsMap.insert(KV: std::make_pair(x&: LI, y: std::move(SplitLoads)));
4755
4756 // Mark the original load as dead and kill the original slice.
4757 DeadInsts.push_back(Elt: LI);
4758 Offsets.S->kill();
4759 }
4760
4761 // Second, we rewrite all of the split stores. At this point, we know that
4762 // all loads from this alloca have been split already. For stores of such
4763 // loads, we can simply look up the pre-existing split loads. For stores of
4764 // other loads, we split those loads first and then write split stores of
4765 // them.
4766 for (StoreInst *SI : Stores) {
4767 auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4768 IntegerType *Ty = cast<IntegerType>(Val: LI->getType());
4769 assert(Ty->getBitWidth() % 8 == 0);
4770 uint64_t StoreSize = Ty->getBitWidth() / 8;
4771 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4772
4773 auto &Offsets = SplitOffsetsMap[SI];
4774 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4775 "Slice size should always match load size exactly!");
4776 uint64_t BaseOffset = Offsets.S->beginOffset();
4777 assert(BaseOffset + StoreSize > BaseOffset &&
4778 "Cannot represent alloca access size using 64-bit integers!");
4779
4780 Value *LoadBasePtr = LI->getPointerOperand();
4781 Instruction *StoreBasePtr = cast<Instruction>(Val: SI->getPointerOperand());
4782
4783 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
4784
4785 // Check whether we have an already split load.
4786 auto SplitLoadsMapI = SplitLoadsMap.find(Val: LI);
4787 std::vector<LoadInst *> *SplitLoads = nullptr;
4788 if (SplitLoadsMapI != SplitLoadsMap.end()) {
4789 SplitLoads = &SplitLoadsMapI->second;
4790 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
4791 "Too few split loads for the number of splits in the store!");
4792 } else {
4793 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
4794 }
4795
4796 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4797 int Idx = 0, Size = Offsets.Splits.size();
4798 for (;;) {
4799 auto *PartTy = Type::getIntNTy(C&: Ty->getContext(), N: PartSize * 8);
4800 auto *LoadPartPtrTy = LI->getPointerOperandType();
4801 auto *StorePartPtrTy = SI->getPointerOperandType();
4802
4803 // Either lookup a split load or create one.
4804 LoadInst *PLoad;
4805 if (SplitLoads) {
4806 PLoad = (*SplitLoads)[Idx];
4807 } else {
4808 IRB.SetInsertPoint(LI);
4809 auto AS = LI->getPointerAddressSpace();
4810 PLoad = IRB.CreateAlignedLoad(
4811 Ty: PartTy,
4812 Ptr: getAdjustedPtr(IRB, DL, Ptr: LoadBasePtr,
4813 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
4814 PointerTy: LoadPartPtrTy, NamePrefix: LoadBasePtr->getName() + "."),
4815 Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
4816 /*IsVolatile*/ isVolatile: false, Name: LI->getName());
4817 PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4818 LLVMContext::MD_access_group});
4819 }
4820
4821 // And store this partition.
4822 IRB.SetInsertPoint(SI);
4823 auto AS = SI->getPointerAddressSpace();
4824 StoreInst *PStore = IRB.CreateAlignedStore(
4825 Val: PLoad,
4826 Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
4827 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
4828 PointerTy: StorePartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
4829 Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
4830 /*IsVolatile*/ isVolatile: false);
4831 PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
4832 LLVMContext::MD_access_group});
4833
4834 // Now build a new slice for the alloca.
4835 NewSlices.push_back(
4836 Elt: Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4837 &PStore->getOperandUse(i: PStore->getPointerOperandIndex()),
4838 /*IsSplittable*/ false));
4839 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
4840 << ", " << NewSlices.back().endOffset()
4841 << "): " << *PStore << "\n");
4842 if (!SplitLoads) {
4843 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
4844 }
4845
4846 // See if we've finished all the splits.
4847 if (Idx >= Size)
4848 break;
4849
4850 // Setup the next partition.
4851 PartOffset = Offsets.Splits[Idx];
4852 ++Idx;
4853 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
4854 }
4855
4856 // We want to immediately iterate on any allocas impacted by splitting
4857 // this load, which is only relevant if it isn't a load of this alloca and
4858 // thus we didn't already split the loads above. We also have to keep track
4859 // of any promotable allocas we split loads on as they can no longer be
4860 // promoted.
4861 if (!SplitLoads) {
4862 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: LoadBasePtr)) {
4863 assert(OtherAI != &AI && "We can't re-split our own alloca!");
4864 ResplitPromotableAllocas.insert(Ptr: OtherAI);
4865 Worklist.insert(X: OtherAI);
4866 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4867 Val: LoadBasePtr->stripInBoundsOffsets())) {
4868 assert(OtherAI != &AI && "We can't re-split our own alloca!");
4869 Worklist.insert(X: OtherAI);
4870 }
4871 }
4872
4873 // Mark the original store as dead now that we've split it up and kill its
4874 // slice. Note that we leave the original load in place unless this store
4875 // was its only use. It may in turn be split up if it is an alloca load
4876 // for some other alloca, but it may be a normal load. This may introduce
4877 // redundant loads, but where those can be merged the rest of the optimizer
4878 // should handle the merging, and this uncovers SSA splits which is more
4879 // important. In practice, the original loads will almost always be fully
4880 // split and removed eventually, and the splits will be merged by any
4881 // trivial CSE, including instcombine.
4882 if (LI->hasOneUse()) {
4883 assert(*LI->user_begin() == SI && "Single use isn't this store!");
4884 DeadInsts.push_back(Elt: LI);
4885 }
4886 DeadInsts.push_back(Elt: SI);
4887 Offsets.S->kill();
4888 }
4889
4890   // Remove the killed slices that have been pre-split.
4891 llvm::erase_if(C&: AS, P: [](const Slice &S) { return S.isDead(); });
4892
4893 // Insert our new slices. This will sort and merge them into the sorted
4894 // sequence.
4895 AS.insert(NewSlices);
4896
4897 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
4898#ifndef NDEBUG
4899 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
4900 LLVM_DEBUG(AS.print(dbgs(), I, " "));
4901#endif
4902
4903   // Finally, don't try to promote any allocas that now require re-splitting.
4904 // They have already been added to the worklist above.
4905 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
4906
4907 return true;
4908}
4909
4910/// Rewrite an alloca partition's users.
4911///
4912/// This routine drives both of the rewriting goals of the SROA pass. It tries
4913/// to rewrite uses of an alloca partition to be conducive for SSA value
4914/// promotion. If the partition needs a new, more refined alloca, this will
4915/// build that new alloca, preserving as much type information as possible, and
4916/// rewrite the uses of the old alloca to point at the new one and have the
4917/// appropriate new offsets. It also evaluates how successful the rewrite was
4918/// at enabling promotion and if it was successful queues the alloca to be
4919/// promoted.
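///
/// For instance (illustrative only), a 4-byte partition of
/// "%a = alloca [12 x i8]" whose uses all operate on float may be rebuilt as
/// something like "%a.sroa.0 = alloca float", with the partition's loads and
/// stores rewritten to address the new alloca directly.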
4920AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
4921 Partition &P) {
4922 // Try to compute a friendly type for this partition of the alloca. This
4923 // won't always succeed, in which case we fall back to a legal integer type
4924 // or an i8 array of an appropriate size.
4925 Type *SliceTy = nullptr;
4926 VectorType *SliceVecTy = nullptr;
4927 const DataLayout &DL = AI.getDataLayout();
4928 unsigned VScale = AI.getFunction()->getVScaleValue();
4929
4930 std::pair<Type *, IntegerType *> CommonUseTy =
4931 findCommonType(B: P.begin(), E: P.end(), EndOffset: P.endOffset());
4932 // Do all uses operate on the same type?
4933 if (CommonUseTy.first) {
4934 TypeSize CommonUseSize = DL.getTypeAllocSize(Ty: CommonUseTy.first);
4935 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
4936 SliceTy = CommonUseTy.first;
4937 SliceVecTy = dyn_cast<VectorType>(Val: SliceTy);
4938 }
4939 }
4940 // If not, can we find an appropriate subtype in the original allocated type?
4941 if (!SliceTy)
4942 if (Type *TypePartitionTy = getTypePartition(DL, Ty: AI.getAllocatedType(),
4943 Offset: P.beginOffset(), Size: P.size()))
4944 SliceTy = TypePartitionTy;
4945
4946 // If still not, can we use the largest bitwidth integer type used?
4947 if (!SliceTy && CommonUseTy.second)
4948 if (DL.getTypeAllocSize(Ty: CommonUseTy.second).getFixedValue() >= P.size()) {
4949 SliceTy = CommonUseTy.second;
4950 SliceVecTy = dyn_cast<VectorType>(Val: SliceTy);
4951 }
4952 if ((!SliceTy || (SliceTy->isArrayTy() &&
4953 SliceTy->getArrayElementType()->isIntegerTy())) &&
4954 DL.isLegalInteger(Width: P.size() * 8)) {
4955 SliceTy = Type::getIntNTy(C&: *C, N: P.size() * 8);
4956 }
4957
4958 // If the common use types are not viable for promotion then attempt to find
4959 // another type that is viable.
4960 if (SliceVecTy && !checkVectorTypeForPromotion(P, VTy: SliceVecTy, DL, VScale))
4961 if (Type *TypePartitionTy = getTypePartition(DL, Ty: AI.getAllocatedType(),
4962 Offset: P.beginOffset(), Size: P.size())) {
4963 VectorType *TypePartitionVecTy = dyn_cast<VectorType>(Val: TypePartitionTy);
4964 if (TypePartitionVecTy &&
4965 checkVectorTypeForPromotion(P, VTy: TypePartitionVecTy, DL, VScale))
4966 SliceTy = TypePartitionTy;
4967 }
4968
4969 if (!SliceTy)
4970 SliceTy = ArrayType::get(ElementType: Type::getInt8Ty(C&: *C), NumElements: P.size());
4971 assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
4972
4973 bool IsIntegerPromotable = isIntegerWideningViable(P, AllocaTy: SliceTy, DL);
4974
4975 VectorType *VecTy =
4976 IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
4977 if (VecTy)
4978 SliceTy = VecTy;
4979
4980 // Check for the case where we're going to rewrite to a new alloca of the
4981 // exact same type as the original, and with the same access offsets. In that
4982 // case, re-use the existing alloca, but still run through the rewriter to
4983 // perform phi and select speculation.
4984 // P.beginOffset() can be non-zero even with the same type in a case with
4985 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
4986 AllocaInst *NewAI;
4987 if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
4988 NewAI = &AI;
4989 // FIXME: We should be able to bail at this point with "nothing changed".
4990 // FIXME: We might want to defer PHI speculation until after here.
4991 // FIXME: return nullptr;
4992 } else {
4993 // Make sure the alignment is compatible with P.beginOffset().
4994 const Align Alignment = commonAlignment(A: AI.getAlign(), Offset: P.beginOffset());
4995 // If we will get at least this much alignment from the type alone, leave
4996 // the alloca's alignment unconstrained.
4997 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(Ty: SliceTy);
4998 NewAI = new AllocaInst(
4999 SliceTy, AI.getAddressSpace(), nullptr,
5000 IsUnconstrained ? DL.getPrefTypeAlign(Ty: SliceTy) : Alignment,
5001 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5002 AI.getIterator());
5003 // Copy the old AI debug location over to the new one.
5004 NewAI->setDebugLoc(AI.getDebugLoc());
5005 ++NumNewAllocas;
5006 }
5007
5008 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5009 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5010
5011 // Track the high watermark on the worklist as it is only relevant for
5012 // promoted allocas. We will reset it to this point if the alloca is not in
5013 // fact scheduled for promotion.
5014 unsigned PPWOldSize = PostPromotionWorklist.size();
5015 unsigned NumUses = 0;
5016 SmallSetVector<PHINode *, 8> PHIUsers;
5017 SmallSetVector<SelectInst *, 8> SelectUsers;
5018
5019 AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
5020 P.endOffset(), IsIntegerPromotable, VecTy,
5021 PHIUsers, SelectUsers);
5022 bool Promotable = true;
5023 for (Slice *S : P.splitSliceTails()) {
5024 Promotable &= Rewriter.visit(I: S);
5025 ++NumUses;
5026 }
5027 for (Slice &S : P) {
5028 Promotable &= Rewriter.visit(I: &S);
5029 ++NumUses;
5030 }
5031
5032 NumAllocaPartitionUses += NumUses;
5033 MaxUsesPerAllocaPartition.updateMax(V: NumUses);
5034
5035 // Now that we've processed all the slices in the new partition, check if any
5036 // PHIs or Selects would block promotion.
5037 for (PHINode *PHI : PHIUsers)
5038 if (!isSafePHIToSpeculate(PN&: *PHI)) {
5039 Promotable = false;
5040 PHIUsers.clear();
5041 SelectUsers.clear();
5042 break;
5043 }
5044
5045 SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
5046 NewSelectsToRewrite;
5047 NewSelectsToRewrite.reserve(N: SelectUsers.size());
5048 for (SelectInst *Sel : SelectUsers) {
5049 std::optional<RewriteableMemOps> Ops =
5050 isSafeSelectToSpeculate(SI&: *Sel, PreserveCFG);
5051 if (!Ops) {
5052 Promotable = false;
5053 PHIUsers.clear();
5054 SelectUsers.clear();
5055 NewSelectsToRewrite.clear();
5056 break;
5057 }
5058 NewSelectsToRewrite.emplace_back(Args: std::make_pair(x&: Sel, y&: *Ops));
5059 }
5060
5061 if (Promotable) {
5062 for (Use *U : AS.getDeadUsesIfPromotable()) {
5063 auto *OldInst = dyn_cast<Instruction>(Val: U->get());
5064 Value::dropDroppableUse(U&: *U);
5065 if (OldInst)
5066 if (isInstructionTriviallyDead(I: OldInst))
5067 DeadInsts.push_back(Elt: OldInst);
5068 }
5069 if (PHIUsers.empty() && SelectUsers.empty()) {
5070 // Promote the alloca.
5071 PromotableAllocas.insert(X: NewAI);
5072 } else {
5073 // If we have either PHIs or Selects to speculate, add them to those
5074       // worklists and re-queue the new alloca so that we promote it on the
5075 // next iteration.
5076 SpeculatablePHIs.insert_range(R&: PHIUsers);
5077 SelectsToRewrite.reserve(NumEntries: SelectsToRewrite.size() +
5078 NewSelectsToRewrite.size());
5079 for (auto &&KV : llvm::make_range(
5080 x: std::make_move_iterator(i: NewSelectsToRewrite.begin()),
5081 y: std::make_move_iterator(i: NewSelectsToRewrite.end())))
5082 SelectsToRewrite.insert(KV: std::move(KV));
5083 Worklist.insert(X: NewAI);
5084 }
5085 } else {
5086 // Drop any post-promotion work items if promotion didn't happen.
5087 while (PostPromotionWorklist.size() > PPWOldSize)
5088 PostPromotionWorklist.pop_back();
5089
5090     // We couldn't promote and we didn't create a new partition, so nothing
5091 // happened.
5092 if (NewAI == &AI)
5093 return nullptr;
5094
5095 // If we can't promote the alloca, iterate on it to check for new
5096 // refinements exposed by splitting the current alloca. Don't iterate on an
5097 // alloca which didn't actually change and didn't get promoted.
5098 Worklist.insert(X: NewAI);
5099 }
5100
5101 return NewAI;
5102}
5103
5104// There isn't a shared interface to get the "address" parts out of a
5105// dbg.declare and dbg.assign, so provide some wrappers now for
5106// both debug intrinsics and records.
5107const Value *getAddress(const DbgVariableIntrinsic *DVI) {
5108 if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(Val: DVI))
5109 return DAI->getAddress();
5110 return cast<DbgDeclareInst>(Val: DVI)->getAddress();
5111}
5112
5113const Value *getAddress(const DbgVariableRecord *DVR) {
5114 return DVR->getAddress();
5115}
5116
5117bool isKillAddress(const DbgVariableIntrinsic *DVI) {
5118 if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(Val: DVI))
5119 return DAI->isKillAddress();
5120 return cast<DbgDeclareInst>(Val: DVI)->isKillLocation();
5121}
5122
5123bool isKillAddress(const DbgVariableRecord *DVR) {
5124 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5125 return DVR->isKillAddress();
5126 return DVR->isKillLocation();
5127}
5128
5129const DIExpression *getAddressExpression(const DbgVariableIntrinsic *DVI) {
5130 if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(Val: DVI))
5131 return DAI->getAddressExpression();
5132 return cast<DbgDeclareInst>(Val: DVI)->getExpression();
5133}
5134
5135const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5136 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5137 return DVR->getAddressExpression();
5138 return DVR->getExpression();
5139}
5140
5141/// Create or replace an existing fragment in a DIExpression with \p Frag.
5142/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5143/// operation, add \p BitExtractOffset to the offset part.
5144///
5145/// Returns the new expression, or nullptr if this fails (see details below).
5146///
5147/// This function is similar to DIExpression::createFragmentExpression except
5148/// for 3 important distinctions:
5149/// 1. The new fragment isn't relative to an existing fragment.
5150/// 2. It assumes the computed location is a memory location. This means we
5151/// don't need to perform checks that creating the fragment preserves the
5152/// expression semantics.
5153/// 3. Existing extract_bits are modified independently of fragment changes
5154/// using \p BitExtractOffset. A change to the fragment offset or size
5155/// may affect a bit extract. But a bit extract offset can change
5156/// independently of the fragment dimensions.
5157///
5158/// Returns the new expression, or nullptr if one couldn't be created.
5159/// Ideally this is only used to signal that a bit-extract has become
5160/// zero-sized (and thus the new debug record has no size and can be
5161/// dropped), however, it fails for other reasons too - see the FIXME below.
5162///
5163/// FIXME: To keep the change that introduces this function NFC it bails
5164 /// in some situations unnecessarily, e.g. when fragment and bit extract
5165/// sizes differ.
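///
/// A minimal sketch of the common case (no bit-extract present): given an
/// existing expression !DIExpression(DW_OP_LLVM_fragment, 0, 64) and a new
/// \p Frag with OffsetInBits 32 and SizeInBits 32, the old fragment op is
/// dropped and the result is !DIExpression(DW_OP_LLVM_fragment, 32, 32).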
5166static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5167 DIExpression::FragmentInfo Frag,
5168 int64_t BitExtractOffset) {
5169 SmallVector<uint64_t, 8> Ops;
5170 bool HasFragment = false;
5171 bool HasBitExtract = false;
5172
5173 for (auto &Op : Expr->expr_ops()) {
5174 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5175 HasFragment = true;
5176 continue;
5177 }
5178 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5179 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5180 HasBitExtract = true;
5181 int64_t ExtractOffsetInBits = Op.getArg(I: 0);
5182 int64_t ExtractSizeInBits = Op.getArg(I: 1);
5183
5184 // DIExpression::createFragmentExpression doesn't know how to handle
5185 // a fragment that is smaller than the extract. Copy the behaviour
5186 // (bail) to avoid non-NFC changes.
5187 // FIXME: Don't do this.
5188 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5189 return nullptr;
5190
5191 assert(BitExtractOffset <= 0);
5192 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5193
5194 // DIExpression::createFragmentExpression doesn't know what to do
5195 // if the new extract starts "outside" the existing one. Copy the
5196 // behaviour (bail) to avoid non-NFC changes.
5197 // FIXME: Don't do this.
5198 if (AdjustedOffset < 0)
5199 return nullptr;
5200
5201 Ops.push_back(Elt: Op.getOp());
5202 Ops.push_back(Elt: std::max<int64_t>(a: 0, b: AdjustedOffset));
5203 Ops.push_back(Elt: ExtractSizeInBits);
5204 continue;
5205 }
5206 Op.appendToVector(V&: Ops);
5207 }
5208
5209 // Unsupported by createFragmentExpression, so don't support it here yet to
5210 // preserve NFC-ness.
5211 if (HasFragment && HasBitExtract)
5212 return nullptr;
5213
5214 if (!HasBitExtract) {
5215 Ops.push_back(Elt: dwarf::DW_OP_LLVM_fragment);
5216 Ops.push_back(Elt: Frag.OffsetInBits);
5217 Ops.push_back(Elt: Frag.SizeInBits);
5218 }
5219 return DIExpression::get(Context&: Expr->getContext(), Elements: Ops);
5220}
5221
5222/// Insert a new dbg.declare.
5223/// \p Orig Original to copy debug loc and variable from.
5224/// \p NewAddr Location's new base address.
5225/// \p NewAddrExpr New expression to apply to address.
5226/// \p BeforeInst Insert position.
5227/// \p NewFragment New fragment (absolute, non-relative).
5228/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5229static void
5230insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig, AllocaInst *NewAddr,
5231 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5232 std::optional<DIExpression::FragmentInfo> NewFragment,
5233 int64_t BitExtractAdjustment) {
5234 if (NewFragment)
5235 NewAddrExpr = createOrReplaceFragment(Expr: NewAddrExpr, Frag: *NewFragment,
5236 BitExtractOffset: BitExtractAdjustment);
5237 if (!NewAddrExpr)
5238 return;
5239
5240 DIB.insertDeclare(Storage: NewAddr, VarInfo: Orig->getVariable(), Expr: NewAddrExpr,
5241 DL: Orig->getDebugLoc(), InsertPt: BeforeInst->getIterator());
5242}
5243
5244/// Insert a new dbg.assign.
5245/// \p Orig Original to copy debug loc, variable, value and value expression
5246/// from.
5247/// \p NewAddr Location's new base address.
5248/// \p NewAddrExpr New expression to apply to address.
5249/// \p BeforeInst Insert position.
5250/// \p NewFragment New fragment (absolute, non-relative).
5251/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5252static void
5253insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig, AllocaInst *NewAddr,
5254 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5255 std::optional<DIExpression::FragmentInfo> NewFragment,
5256 int64_t BitExtractAdjustment) {
5257 // DIBuilder::insertDbgAssign will insert the #dbg_assign after NewAddr.
5258 (void)BeforeInst;
5259
5260 // A dbg.assign puts fragment info in the value expression only. The address
5261 // expression has already been built: NewAddrExpr.
5262 DIExpression *NewFragmentExpr = Orig->getExpression();
5263 if (NewFragment)
5264 NewFragmentExpr = createOrReplaceFragment(Expr: NewFragmentExpr, Frag: *NewFragment,
5265 BitExtractOffset: BitExtractAdjustment);
5266 if (!NewFragmentExpr)
5267 return;
5268
5269 // Apply a DIAssignID to the store if it doesn't already have it.
5270 if (!NewAddr->hasMetadata(KindID: LLVMContext::MD_DIAssignID)) {
5271 NewAddr->setMetadata(KindID: LLVMContext::MD_DIAssignID,
5272 Node: DIAssignID::getDistinct(Context&: NewAddr->getContext()));
5273 }
5274
5275 Instruction *NewAssign = cast<Instruction *>(Val: DIB.insertDbgAssign(
5276 LinkedInstr: NewAddr, Val: Orig->getValue(), SrcVar: Orig->getVariable(), ValExpr: NewFragmentExpr, Addr: NewAddr,
5277 AddrExpr: NewAddrExpr, DL: Orig->getDebugLoc()));
5278 LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n");
5279 (void)NewAssign;
5280}
5281
5282/// Insert a new DbgRecord.
5283/// \p Orig Original to copy record type, debug loc and variable from, and
5284/// additionally value and value expression for dbg_assign records.
5285/// \p NewAddr Location's new base address.
5286/// \p NewAddrExpr New expression to apply to address.
5287/// \p BeforeInst Insert position.
5288/// \p NewFragment New fragment (absolute, non-relative).
5289/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5290static void
5291insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5292 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5293 std::optional<DIExpression::FragmentInfo> NewFragment,
5294 int64_t BitExtractAdjustment) {
5295 (void)DIB;
5296
5297 // A dbg_assign puts fragment info in the value expression only. The address
5298 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5299 // new fragment info into NewAddrExpr (as it only has one expression).
5300 DIExpression *NewFragmentExpr =
5301 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5302 if (NewFragment)
5303 NewFragmentExpr = createOrReplaceFragment(Expr: NewFragmentExpr, Frag: *NewFragment,
5304 BitExtractOffset: BitExtractAdjustment);
5305 if (!NewFragmentExpr)
5306 return;
5307
5308 if (Orig->isDbgDeclare()) {
5309 DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5310 Address: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
5311 BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
5312 Here: BeforeInst->getIterator());
5313 return;
5314 }
5315
5316 if (Orig->isDbgValue()) {
5317 DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5318 Location: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
5319 // Drop debug information if the expression doesn't start with a
5320 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5321     // describes the address of the alloca rather than the value inside it.
5322 if (!NewFragmentExpr->startsWithDeref())
5323 DVR->setKillAddress();
5324 BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
5325 Here: BeforeInst->getIterator());
5326 return;
5327 }
5328
5329 // Apply a DIAssignID to the store if it doesn't already have it.
5330 if (!NewAddr->hasMetadata(KindID: LLVMContext::MD_DIAssignID)) {
5331 NewAddr->setMetadata(KindID: LLVMContext::MD_DIAssignID,
5332 Node: DIAssignID::getDistinct(Context&: NewAddr->getContext()));
5333 }
5334
5335 DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5336 LinkedInstr: NewAddr, Val: Orig->getValue(), Variable: Orig->getVariable(), Expression: NewFragmentExpr, Address: NewAddr,
5337 AddressExpression: NewAddrExpr, DI: Orig->getDebugLoc());
5338 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5339 (void)NewAssign;
5340}
5341
5342 /// Walks the slices of an alloca and forms partitions based on them,
5343/// rewriting each of their uses.
5344bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5345 if (AS.begin() == AS.end())
5346 return false;
5347
5348 unsigned NumPartitions = 0;
5349 bool Changed = false;
5350 const DataLayout &DL = AI.getModule()->getDataLayout();
5351
5352 // First try to pre-split loads and stores.
5353 Changed |= presplitLoadsAndStores(AI, AS);
5354
5355 // Now that we have identified any pre-splitting opportunities,
5356 // mark loads and stores unsplittable except for the following case.
5357 // We leave a slice splittable if all other slices are disjoint or fully
5358 // included in the slice, such as whole-alloca loads and stores.
5359 // If we fail to split these during pre-splitting, we want to force them
5360 // to be rewritten into a partition.
5361 bool IsSorted = true;
5362
5363 uint64_t AllocaSize =
5364 DL.getTypeAllocSize(Ty: AI.getAllocatedType()).getFixedValue();
5365 const uint64_t MaxBitVectorSize = 1024;
5366 if (AllocaSize <= MaxBitVectorSize) {
5367 // If a byte boundary is included in any load or store, a slice starting or
5368 // ending at the boundary is not splittable.
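    //
    // For example (illustrative), an i32 access at bytes [0, 4) clears
    // SplittableOffset for offsets 1-3, so any splittable load or store that
    // begins or ends at one of those interior offsets is made unsplittable
    // below.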
5369 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5370 for (Slice &S : AS)
5371 for (unsigned O = S.beginOffset() + 1;
5372 O < S.endOffset() && O < AllocaSize; O++)
5373 SplittableOffset.reset(Idx: O);
5374
5375 for (Slice &S : AS) {
5376 if (!S.isSplittable())
5377 continue;
5378
5379 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5380 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5381 continue;
5382
5383 if (isa<LoadInst>(Val: S.getUse()->getUser()) ||
5384 isa<StoreInst>(Val: S.getUse()->getUser())) {
5385 S.makeUnsplittable();
5386 IsSorted = false;
5387 }
5388 }
5389 } else {
5390 // We only allow whole-alloca splittable loads and stores
5391     // for a large alloca to avoid creating too large a BitVector.
5392 for (Slice &S : AS) {
5393 if (!S.isSplittable())
5394 continue;
5395
5396 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5397 continue;
5398
5399 if (isa<LoadInst>(Val: S.getUse()->getUser()) ||
5400 isa<StoreInst>(Val: S.getUse()->getUser())) {
5401 S.makeUnsplittable();
5402 IsSorted = false;
5403 }
5404 }
5405 }
5406
5407 if (!IsSorted)
5408 llvm::stable_sort(Range&: AS);
5409
5410 /// Describes the allocas introduced by rewritePartition in order to migrate
5411 /// the debug info.
5412 struct Fragment {
5413 AllocaInst *Alloca;
5414 uint64_t Offset;
5415 uint64_t Size;
5416 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5417 : Alloca(AI), Offset(O), Size(S) {}
5418 };
5419 SmallVector<Fragment, 4> Fragments;
5420
5421 // Rewrite each partition.
5422 for (auto &P : AS.partitions()) {
5423 if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
5424 Changed = true;
5425 if (NewAI != &AI) {
5426 uint64_t SizeOfByte = 8;
5427 uint64_t AllocaSize =
5428 DL.getTypeSizeInBits(Ty: NewAI->getAllocatedType()).getFixedValue();
5429 // Don't include any padding.
5430 uint64_t Size = std::min(a: AllocaSize, b: P.size() * SizeOfByte);
5431 Fragments.push_back(
5432 Elt: Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5433 }
5434 }
5435 ++NumPartitions;
5436 }
5437
5438 NumAllocaPartitions += NumPartitions;
5439 MaxPartitionsPerAlloca.updateMax(V: NumPartitions);
5440
5441 // Migrate debug information from the old alloca to the new alloca(s)
5442 // and the individual partitions.
5443 auto MigrateOne = [&](auto *DbgVariable) {
5444 // Can't overlap with undef memory.
5445 if (isKillAddress(DbgVariable))
5446 return;
5447
5448 const Value *DbgPtr = getAddress(DbgVariable);
5449 DIExpression::FragmentInfo VarFrag =
5450 DbgVariable->getFragmentOrEntireVariable();
5451 // Get the address expression constant offset if one exists and the ops
5452 // that come after it.
5453 int64_t CurrentExprOffsetInBytes = 0;
5454 SmallVector<uint64_t> PostOffsetOps;
5455 if (!getAddressExpression(DbgVariable)
5456 ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5457 return; // Couldn't interpret this DIExpression - drop the var.
5458
5459 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5460 int64_t ExtractOffsetInBits = 0;
5461 for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5462 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5463 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5464 ExtractOffsetInBits = Op.getArg(0);
5465 break;
5466 }
5467 }
5468
5469 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5470 for (auto Fragment : Fragments) {
5471 int64_t OffsetFromLocationInBits;
5472 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5473 // Find the variable fragment that the new alloca slice covers.
5474 // Drop debug info for this variable fragment if we can't compute an
5475 // intersect between it and the alloca slice.
5476 if (!DIExpression::calculateFragmentIntersect(
5477 DL, SliceStart: &AI, SliceOffsetInBits: Fragment.Offset, SliceSizeInBits: Fragment.Size, DbgPtr,
5478 DbgPtrOffsetInBits: CurrentExprOffsetInBytes * 8, DbgExtractOffsetInBits: ExtractOffsetInBits, VarFrag,
5479 Result&: NewDbgFragment, OffsetFromLocationInBits))
5480 continue; // Do not migrate this fragment to this slice.
5481
5482       // A zero-sized fragment indicates there's no intersection between the
5483       // variable fragment and the alloca slice. Skip this slice for this
5484       // variable fragment.
5485 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5486 continue; // Do not migrate this fragment to this slice.
5487
5488 // No fragment indicates DbgVariable's variable or fragment exactly
5489 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5490 if (!NewDbgFragment)
5491 NewDbgFragment = DbgVariable->getFragment();
5492
5493 // Reduce the new expression offset by the bit-extract offset since
5494 // we'll be keeping that.
5495       int64_t OffsetFromNewAllocaInBits =
5496 OffsetFromLocationInBits - ExtractOffsetInBits;
5497 // We need to adjust an existing bit extract if the offset expression
5498 // can't eat the slack (i.e., if the new offset would be negative).
5499 int64_t BitExtractOffset =
5500           std::min<int64_t>(a: 0, b: OffsetFromNewAllocaInBits);
5501 // The magnitude of a negative value indicates the number of bits into
5502 // the existing variable fragment that the memory region begins. The new
5503 // variable fragment already excludes those bits - the new DbgPtr offset
5504 // only needs to be applied if it's positive.
5505       OffsetFromNewAllocaInBits =
5506           std::max(a: int64_t(0), b: OffsetFromNewAllocaInBits);
5507
5508 // Rebuild the expression:
5509       //     {Offset(OffsetFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5510 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5511 // address expression but the value expression instead.
5512 DIExpression *NewExpr = DIExpression::get(Context&: AI.getContext(), Elements: PostOffsetOps);
5513       if (OffsetFromNewAllocaInBits > 0) {
5514         int64_t OffsetInBytes = (OffsetFromNewAllocaInBits + 7) / 8;
5515 NewExpr = DIExpression::prepend(Expr: NewExpr, /*flags=*/Flags: 0, Offset: OffsetInBytes);
5516 }

      // Remove any existing intrinsics on the new alloca describing
      // the variable fragment.
      auto RemoveOne = [DbgVariable](auto *OldDII) {
        auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
          return LHS->getVariable() == RHS->getVariable() &&
                 LHS->getDebugLoc()->getInlinedAt() ==
                     RHS->getDebugLoc()->getInlinedAt();
        };
        if (SameVariableFragment(OldDII, DbgVariable))
          OldDII->eraseFromParent();
      };
      for_each(findDbgDeclares(Fragment.Alloca), RemoveOne);
      for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
      for_each(findDVRValues(Fragment.Alloca), RemoveOne);
      insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
                       NewDbgFragment, BitExtractOffset);
    }
  };

  // Migrate debug information from the old alloca to the new alloca(s)
  // and the individual partitions.
  for_each(findDbgDeclares(&AI), MigrateOne);
  for_each(findDVRDeclares(&AI), MigrateOne);
  for_each(findDVRValues(&AI), MigrateOne);
  for_each(at::getAssignmentMarkers(&AI), MigrateOne);
  for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);

  return Changed;
}

/// Clobber a use with poison, deleting the used value if it becomes dead.
void SROA::clobberUse(Use &U) {
  Value *OldV = U;
  // Replace the use with a poison value.
  U = PoisonValue::get(OldV->getType());

  // Check for this making an instruction dead. We have to garbage collect
  // all the dead instructions to ensure the uses of any alloca end up being
  // minimal.
  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
    if (isInstructionTriviallyDead(OldI)) {
      DeadInsts.push_back(OldI);
    }
}

/// A basic LoadAndStorePromoter that does not remove store nodes.
class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
public:
  BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
                            Type *ZeroType)
      : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
  bool shouldDelete(Instruction *I) const override {
    return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
  }

  Value *getValueToUseForAlloca(Instruction *I) const override {
    return UndefValue::get(ZeroType);
  }

private:
  Type *ZeroType;
};

bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
  // Look through each "partition", looking for slices with the same start/end
  // that do not overlap with any before them. The slices are sorted by
  // increasing beginOffset. We don't use AS.partitions(), as it will use a more
  // sophisticated algorithm that takes splittable slices into account.
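  // For example (offsets are illustrative): two slices covering [0, 8)
  // followed by one covering [8, 16) form two runs that can each be promoted
  // independently, whereas a slice covering [4, 12) overlaps the first run
  // without matching it exactly, so that run is marked invalid and only
  // EndOffset is extended.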
  LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
  bool AllSameAndValid = true;
  Type *PartitionType = nullptr;
  SmallVector<Instruction *> Insts;
  uint64_t BeginOffset = 0;
  uint64_t EndOffset = 0;

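  // Flush ends the current run of slices: if every slice in the run was a
  // simple load or store of one common type, hand the run to the promoter;
  // otherwise drop the run and leave the memory untouched.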
  auto Flush = [&]() {
    if (AllSameAndValid && !Insts.empty()) {
      LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
                        << EndOffset << ")\n");
      SmallVector<PHINode *, 4> NewPHIs;
      SSAUpdater SSA(&NewPHIs);
      Insts.push_back(&AI);
      BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
      Promoter.run(Insts);
    }
    AllSameAndValid = true;
    PartitionType = nullptr;
    Insts.clear();
  };

  for (Slice &S : AS) {
    auto *User = cast<Instruction>(S.getUse()->getUser());
    if (isAssumeLikeIntrinsic(User)) {
      LLVM_DEBUG({
        dbgs() << "Ignoring slice: ";
        AS.print(dbgs(), &S);
      });
      continue;
    }
    if (S.beginOffset() >= EndOffset) {
      Flush();
      BeginOffset = S.beginOffset();
      EndOffset = S.endOffset();
    } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
      if (AllSameAndValid) {
        LLVM_DEBUG({
          dbgs() << "Slice does not match range [" << BeginOffset << ", "
                 << EndOffset << ")";
          AS.print(dbgs(), &S);
        });
        AllSameAndValid = false;
      }
      EndOffset = std::max(EndOffset, S.endOffset());
      continue;
    }

    if (auto *LI = dyn_cast<LoadInst>(User)) {
      Type *UserTy = LI->getType();
      // LoadAndStorePromoter requires all the types to be the same.
      if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
        AllSameAndValid = false;
      PartitionType = UserTy;
      Insts.push_back(User);
    } else if (auto *SI = dyn_cast<StoreInst>(User)) {
      Type *UserTy = SI->getValueOperand()->getType();
      if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
        AllSameAndValid = false;
      PartitionType = UserTy;
      Insts.push_back(User);
    } else {
      AllSameAndValid = false;
    }
  }

  Flush();
  return true;
}

/// Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
std::pair<bool /*Changed*/, bool /*CFGChanged*/>
SROA::runOnAlloca(AllocaInst &AI) {
  bool Changed = false;
  bool CFGChanged = false;

  LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
  ++NumAllocasAnalyzed;

  // Special case dead allocas, as they're trivial.
  if (AI.use_empty()) {
    AI.eraseFromParent();
    Changed = true;
    return {Changed, CFGChanged};
  }
  const DataLayout &DL = AI.getDataLayout();

  // Skip alloca forms that this analysis can't handle.
  auto *AT = AI.getAllocatedType();
  TypeSize Size = DL.getTypeAllocSize(AT);
  if (AI.isArrayAllocation() || !AT->isSized() || Size.isScalable() ||
      Size.getFixedValue() == 0)
    return {Changed, CFGChanged};

  // First, split any FCA loads and stores touching this alloca to promote
  // better splitting and promotion opportunities.
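  // (For example, a store of a {i32, i64} first-class aggregate would be
  // rewritten as two scalar stores, one per element, which the slice builder
  // can then reason about individually.)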
  IRBuilderTy IRB(&AI);
  AggLoadStoreRewriter AggRewriter(DL, IRB);
  Changed |= AggRewriter.rewrite(AI);

  // Build the slices using a recursive instruction-visiting builder.
  AllocaSlices AS(DL, AI);
  LLVM_DEBUG(AS.print(dbgs()));
  if (AS.isEscaped())
    return {Changed, CFGChanged};

  if (AS.isEscapedReadOnly()) {
    Changed |= propagateStoredValuesToLoads(AI, AS);
    return {Changed, CFGChanged};
  }

  // Delete all the dead users of this alloca before splitting and rewriting it.
  for (Instruction *DeadUser : AS.getDeadUsers()) {
    // Free up everything used by this instruction.
    for (Use &DeadOp : DeadUser->operands())
      clobberUse(DeadOp);

    // Now replace the uses of this instruction.
    DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));

    // And mark it for deletion.
    DeadInsts.push_back(DeadUser);
    Changed = true;
  }
  for (Use *DeadOp : AS.getDeadOperands()) {
    clobberUse(*DeadOp);
    Changed = true;
  }

  // No slices to split. Leave the dead alloca for a later pass to clean up.
  if (AS.begin() == AS.end())
    return {Changed, CFGChanged};

  Changed |= splitAlloca(AI, AS);

  LLVM_DEBUG(dbgs() << "  Speculating PHIs\n");
  while (!SpeculatablePHIs.empty())
    speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());

  LLVM_DEBUG(dbgs() << "  Rewriting Selects\n");
  auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
  while (!RemainingSelectsToRewrite.empty()) {
    const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
    CFGChanged |=
        rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
  }

  return {Changed, CFGChanged};
}

/// Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
/// minimize the problems of invalidated instruction pointers as such pointers
/// are used heavily in the intermediate stages of the algorithm.
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
bool SROA::deleteDeadInstructions(
    SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
  bool Changed = false;
  while (!DeadInsts.empty()) {
    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
    if (!I)
      continue;
    LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");

    // If the instruction is an alloca, find the possible dbg.declare connected
    // to it, and remove it too. We must do this before calling RAUW or we will
    // not be able to find it.
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
      DeletedAllocas.insert(AI);
      for (DbgDeclareInst *OldDII : findDbgDeclares(AI))
        OldDII->eraseFromParent();
      for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
        OldDII->eraseFromParent();
    }

    at::deleteAssignmentMarkers(I);
    I->replaceAllUsesWith(UndefValue::get(I->getType()));

    for (Use &Operand : I->operands())
      if (Instruction *U = dyn_cast<Instruction>(Operand)) {
        // Zero out the operand and see if it becomes trivially dead.
        Operand = nullptr;
        if (isInstructionTriviallyDead(U))
          DeadInsts.push_back(U);
      }

    ++NumDeleted;
    I->eraseFromParent();
    Changed = true;
  }
  return Changed;
}

/// Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
/// This function returns whether any promotion occurred.
bool SROA::promoteAllocas() {
  if (PromotableAllocas.empty())
    return false;

  if (SROASkipMem2Reg) {
    LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
  } else {
    LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
    NumPromoted += PromotableAllocas.size();
    PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
  }

  PromotableAllocas.clear();
  return true;
}

std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
  LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");

  const DataLayout &DL = F.getDataLayout();
  BasicBlock &EntryBB = F.getEntryBlock();
  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
       I != E; ++I) {
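    // Allocas with a scalable allocated size can't be analyzed and split
    // slice-by-slice (runOnAlloca gives up on them), but if mem2reg can
    // already promote them wholesale, queue them directly for promotion;
    // everything else goes onto the main worklist.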
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
      if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() &&
          isAllocaPromotable(AI))
        PromotableAllocas.insert(AI);
      else
        Worklist.insert(AI);
    }
  }

  bool Changed = false;
  bool CFGChanged = false;
  // A set of deleted alloca instruction pointers which should be removed from
  // the list of promotable allocas.
  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;

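  // Iterate to a fixed point: splitting one alloca introduces new allocas for
  // its partitions, and rewriting can queue follow-up work through
  // PostPromotionWorklist, so keep going until the worklist stays empty after
  // promotion.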
  do {
    while (!Worklist.empty()) {
      auto [IterationChanged, IterationCFGChanged] =
          runOnAlloca(*Worklist.pop_back_val());
      Changed |= IterationChanged;
      CFGChanged |= IterationCFGChanged;

      Changed |= deleteDeadInstructions(DeletedAllocas);

      // Remove the deleted allocas from various lists so that we don't try to
      // continue processing them.
      if (!DeletedAllocas.empty()) {
        Worklist.set_subtract(DeletedAllocas);
        PostPromotionWorklist.set_subtract(DeletedAllocas);
        PromotableAllocas.set_subtract(DeletedAllocas);
        DeletedAllocas.clear();
      }
    }

    Changed |= promoteAllocas();

    Worklist = PostPromotionWorklist;
    PostPromotionWorklist.clear();
  } while (!Worklist.empty());

  assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
  assert((!CFGChanged || !PreserveCFG) &&
         "Should not have modified the CFG when told to preserve it.");

  if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F) {
      RemoveRedundantDbgInstrs(&BB);
    }
  }

  return {Changed, CFGChanged};
}

PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  auto [Changed, CFGChanged] =
      SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  if (!CFGChanged)
    PA.preserveSet<CFGAnalyses>();
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

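// Sketch of the resulting pipeline text, assuming the standard mapping of the
// class name to the pass name "sroa": either "sroa<preserve-cfg>" or
// "sroa<modify-cfg>".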
void SROAPass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
      OS, MapClassName2PassName);
  OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
                                                 : "<modify-cfg>");
}

SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}

namespace {

/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
class SROALegacyPass : public FunctionPass {
  SROAOptions PreserveCFG;

public:
  static char ID;

  SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
      : FunctionPass(ID), PreserveCFG(PreserveCFG) {
    initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    AssumptionCache &AC =
        getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
    auto [Changed, _] =
        SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

  StringRef getPassName() const override { return "SROA"; }
};

} // end anonymous namespace

char SROALegacyPass::ID = 0;

FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
  return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
                                        : SROAOptions::ModifyCFG);
}

INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
                      "Scalar Replacement Of Aggregates", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
                    false, false)