1//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This transformation implements the well known scalar replacement of
10/// aggregates transformation. It tries to identify promotable elements of an
11/// aggregate alloca, and promote them to registers. It will also try to
12/// convert uses of an element (or set of elements) of an alloca into a vector
13/// or bitfield-style integer scalar if appropriate.
14///
15/// It works to do this with minimal slicing of the alloca so that regions
16/// which are merely transferred in and out of external memory remain unchanged
17/// and are not decomposed to scalar code.
18///
19/// Because this also performs alloca promotion, it can be thought of as also
20/// serving the purpose of SSA formation. The algorithm iterates on the
21/// function until all opportunities for promotion have been realized.
22///
23//===----------------------------------------------------------------------===//
24
25#include "llvm/Transforms/Scalar/SROA.h"
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/ArrayRef.h"
28#include "llvm/ADT/DenseMap.h"
29#include "llvm/ADT/MapVector.h"
30#include "llvm/ADT/PointerIntPair.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/SetVector.h"
33#include "llvm/ADT/SmallBitVector.h"
34#include "llvm/ADT/SmallPtrSet.h"
35#include "llvm/ADT/SmallVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/Twine.h"
39#include "llvm/ADT/iterator.h"
40#include "llvm/ADT/iterator_range.h"
41#include "llvm/Analysis/AssumptionCache.h"
42#include "llvm/Analysis/DomTreeUpdater.h"
43#include "llvm/Analysis/GlobalsModRef.h"
44#include "llvm/Analysis/Loads.h"
45#include "llvm/Analysis/PtrUseVisitor.h"
46#include "llvm/Analysis/ValueTracking.h"
47#include "llvm/Analysis/VectorUtils.h"
48#include "llvm/IR/BasicBlock.h"
49#include "llvm/IR/Constant.h"
50#include "llvm/IR/ConstantFolder.h"
51#include "llvm/IR/Constants.h"
52#include "llvm/IR/DIBuilder.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
55#include "llvm/IR/DebugInfoMetadata.h"
56#include "llvm/IR/DerivedTypes.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/GlobalAlias.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstVisitor.h"
62#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Instructions.h"
64#include "llvm/IR/IntrinsicInst.h"
65#include "llvm/IR/LLVMContext.h"
66#include "llvm/IR/Metadata.h"
67#include "llvm/IR/Module.h"
68#include "llvm/IR/Operator.h"
69#include "llvm/IR/PassManager.h"
70#include "llvm/IR/Type.h"
71#include "llvm/IR/Use.h"
72#include "llvm/IR/User.h"
73#include "llvm/IR/Value.h"
74#include "llvm/IR/ValueHandle.h"
75#include "llvm/InitializePasses.h"
76#include "llvm/Pass.h"
77#include "llvm/Support/Casting.h"
78#include "llvm/Support/CommandLine.h"
79#include "llvm/Support/Compiler.h"
80#include "llvm/Support/Debug.h"
81#include "llvm/Support/ErrorHandling.h"
82#include "llvm/Support/raw_ostream.h"
83#include "llvm/Transforms/Scalar.h"
84#include "llvm/Transforms/Utils/BasicBlockUtils.h"
85#include "llvm/Transforms/Utils/Local.h"
86#include "llvm/Transforms/Utils/PromoteMemToReg.h"
87#include "llvm/Transforms/Utils/SSAUpdater.h"
88#include <algorithm>
89#include <cassert>
90#include <cstddef>
91#include <cstdint>
92#include <cstring>
93#include <iterator>
94#include <string>
95#include <tuple>
96#include <utility>
97#include <variant>
98#include <vector>
99
100using namespace llvm;
101
102#define DEBUG_TYPE "sroa"
103
104STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
105STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
106STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
107STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
108STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
109STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
110STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
111STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
112STATISTIC(NumLoadsPredicated,
113 "Number of loads rewritten into predicated loads to allow promotion");
114STATISTIC(
115 NumStoresPredicated,
116 "Number of stores rewritten into predicated loads to allow promotion");
117STATISTIC(NumDeleted, "Number of instructions deleted");
118STATISTIC(NumVectorized, "Number of vectorized aggregates");
119
120namespace llvm {
121/// Disable running mem2reg during SROA in order to test or debug SROA.
122static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(Val: false),
123 cl::Hidden);
124extern cl::opt<bool> ProfcheckDisableMetadataFixes;
125} // namespace llvm
126
127namespace {
128
129class AllocaSliceRewriter;
130class AllocaSlices;
131class Partition;
132
/// Compact record of which hands of a select (its true value and/or its false
/// value) have been marked speculatable.
///
/// The state lives in a single byte so that it can be carried as the integer
/// half of a PointerIntPair (see the conversion helpers at the bottom). The
/// query and update members are only declared here; their definitions are
/// out of line.
class SelectHandSpeculativity {
  unsigned char Storage = 0; // None are speculatable by default.
  using TrueVal = Bitfield::Element<bool, 0, 1>;  // Low 0'th bit.
  using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit.
public:
  SelectHandSpeculativity() = default;
  /// Mark one hand as speculatable; \p isTrueVal selects which hand. Returns
  /// a reference so calls can be chained.
  SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
  /// Query the hand selected by \p isTrueVal.
  bool isSpeculatable(bool isTrueVal) const;
  bool areAllSpeculatable() const;
  bool areAnySpeculatable() const;
  bool areNoneSpeculatable() const;
  // For interop as int half of PointerIntPair.
  explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
  explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
};
// The class must stay exactly one byte: it carries no state besides Storage.
static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));
149
/// A load together with two bits of SelectHandSpeculativity state packed into
/// the same pointer-sized value.
using PossiblySpeculatableLoad =
    PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
/// A store; unlike loads it carries no speculativity state.
using UnspeculatableStore = StoreInst *;
/// Either of the two memory-operation kinds above.
using RewriteableMemOp =
    std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
/// A short list of such operations, with inline storage for two entries.
using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;
156
/// An optimization pass providing Scalar Replacement of Aggregates.
///
/// This pass takes allocations which can be completely analyzed (that is, they
/// don't escape) and tries to turn them into scalar SSA values. There are
/// a few steps to this process.
///
/// 1) It takes allocations of aggregates and analyzes the ways in which they
///    are used to try to split them into smaller allocations, ideally of
///    a single scalar data type. It will split up memcpy and memset accesses
///    as necessary and try to isolate individual scalar accesses.
/// 2) It will transform accesses into forms which are suitable for SSA value
///    promotion. This can be replacing a memset with a scalar store of an
///    integer value, or it can involve speculating operations on a PHI or
///    select to be a PHI or select of the results.
/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
///    onto insert and extract operations on a vector value, and convert them to
///    this form. By doing so, it will enable promotion of vector aggregates to
///    SSA vector values.
class SROA {
  // Collaborators supplied to the constructor. The pointers themselves are
  // const and therefore never reseated after construction.
  LLVMContext *const C;
  DomTreeUpdater *const DTU;
  AssumptionCache *const AC;
  /// True when the options passed to the constructor selected
  /// SROAOptions::PreserveCFG.
  const bool PreserveCFG;
  /// Copy of Options.AggregateToVector taken at construction.
  const bool AggregateToVector;

  /// Worklist of alloca instructions to simplify.
  ///
  /// Each alloca in the function is added to this. Each new alloca formed gets
  /// added to it as well to recursively simplify unless that alloca can be
  /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if
  /// not already present to ensure it is re-visited.
  SmallSetVector<AllocaInst *, 16> Worklist;

  /// A collection of instructions to delete.
  /// We try to batch deletions to simplify code and make things a bit more
  /// efficient. We also make sure there are no dangling pointers.
  SmallVector<WeakVH, 8> DeadInsts;

  /// Post-promotion worklist.
  ///
  /// Sometimes we discover an alloca which has a high probability of becoming
  /// viable for SROA after a round of promotion takes place. In those cases,
  /// the alloca is enqueued here for re-processing.
  ///
  /// Note that we have to be very careful to clear allocas out of this list in
  /// the event they are deleted.
  SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;

  /// A collection of alloca instructions we can directly promote.
  SetVector<AllocaInst *, SmallVector<AllocaInst *>,
            SmallPtrSet<AllocaInst *, 16>, 16>
      PromotableAllocas;

  /// A worklist of PHIs to speculate prior to promoting allocas.
  ///
  /// All of these PHIs have been checked for the safety of speculation and by
  /// being speculated will allow promoting allocas currently in the promotable
  /// queue.
  SmallSetVector<PHINode *, 8> SpeculatablePHIs;

  /// A worklist of select instructions to rewrite prior to promoting
  /// allocas.
  SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;

  /// Select instructions that use an alloca and are subsequently loaded can be
  /// rewritten to load both input pointers and then select between the result,
  /// allowing the load of the alloca to be promoted.
  /// From this:
  ///   %P2 = select i1 %cond, ptr %Alloca, ptr %Other
  ///   %V = load <type>, ptr %P2
  /// to:
  ///   %V1 = load <type>, ptr %Alloca      -> will be mem2reg'd
  ///   %V2 = load <type>, ptr %Other
  ///   %V = select i1 %cond, <type> %V1, <type> %V2
  ///
  /// We can do this to a select if its only uses are loads
  /// and if either the operand to the select can be loaded unconditionally,
  /// or if we are allowed to perform CFG modifications.
  /// If an intervening bitcast with a single use of the load is found,
  /// allow the promotion.
  ///
  /// \returns the memory operations to rewrite, or std::nullopt when the
  /// select is not safe to handle (NOTE(review): inferred from the return
  /// type; confirm against the out-of-line definition).
  static std::optional<RewriteableMemOps>
  isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);

public:
  /// Store the collaborators and decode \p Options into the two boolean
  /// flags; no analysis or rewriting happens here.
  SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
       SROAOptions Options)
      : C(C), DTU(DTU), AC(AC),
        PreserveCFG(Options.CFG == SROAOptions::PreserveCFG),
        AggregateToVector(Options.AggregateToVector) {}

  /// Main run method used by both the SROAPass and by the legacy pass.
  ///
  /// \returns a pair: whether anything in \p F changed, and whether the CFG
  /// changed.
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);

private:
  friend class AllocaSliceRewriter;

  // The helpers below are only declared here; their definitions are out of
  // line. NOTE(review): their roles are best read from the definitions, not
  // inferred from the names.
  bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
  std::pair<AllocaInst *, uint64_t>
  rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
  bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
  bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
  void clobberUse(Use &U);
  bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
  bool promoteAllocas();
};
264
265} // end anonymous namespace
266
267/// Calculate the fragment of a variable to use when slicing a store
268/// based on the slice dimensions, existing fragment, and base storage
269/// fragment.
270/// Results:
271/// UseFrag - Use Target as the new fragment.
272/// UseNoFrag - The new slice already covers the whole variable.
273/// Skip - The new alloca slice doesn't include this variable.
274/// FIXME: Can we use calculateFragmentIntersect instead?
275namespace {
276enum FragCalcResult { UseFrag, UseNoFrag, Skip };
277}
278static FragCalcResult
279calculateFragment(DILocalVariable *Variable,
280 uint64_t NewStorageSliceOffsetInBits,
281 uint64_t NewStorageSliceSizeInBits,
282 std::optional<DIExpression::FragmentInfo> StorageFragment,
283 std::optional<DIExpression::FragmentInfo> CurrentFragment,
284 DIExpression::FragmentInfo &Target) {
285 // If the base storage describes part of the variable apply the offset and
286 // the size constraint.
287 if (StorageFragment) {
288 Target.SizeInBits =
289 std::min(a: NewStorageSliceSizeInBits, b: StorageFragment->SizeInBits);
290 Target.OffsetInBits =
291 NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
292 } else {
293 Target.SizeInBits = NewStorageSliceSizeInBits;
294 Target.OffsetInBits = NewStorageSliceOffsetInBits;
295 }
296
297 // If this slice extracts the entirety of an independent variable from a
298 // larger alloca, do not produce a fragment expression, as the variable is
299 // not fragmented.
300 if (!CurrentFragment) {
301 if (auto Size = Variable->getSizeInBits()) {
302 // Treat the current fragment as covering the whole variable.
303 CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
304 if (Target == CurrentFragment)
305 return UseNoFrag;
306 }
307 }
308
309 // No additional work to do if there isn't a fragment already, or there is
310 // but it already exactly describes the new assignment.
311 if (!CurrentFragment || *CurrentFragment == Target)
312 return UseFrag;
313
314 // Reject the target fragment if it doesn't fit wholly within the current
315 // fragment. TODO: We could instead chop up the target to fit in the case of
316 // a partial overlap.
317 if (Target.startInBits() < CurrentFragment->startInBits() ||
318 Target.endInBits() > CurrentFragment->endInBits())
319 return Skip;
320
321 // Target fits within the current fragment, return it.
322 return UseFrag;
323}
324
325static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
326 return DebugVariable(DVR->getVariable(), std::nullopt,
327 DVR->getDebugLoc().getInlinedAt());
328}
329
330/// Find linked dbg.assign and generate a new one with the correct
331/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
332/// value component is copied from the old dbg.assign to the new.
333/// \param OldAlloca Alloca for the variable before splitting.
334/// \param IsSplit True if the store (not necessarily alloca)
335/// is being split.
336/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
337/// \param SliceSizeInBits New number of bits being written to.
338/// \param OldInst Instruction that is being split.
339/// \param Inst New instruction performing this part of the
340/// split store.
341/// \param Dest Store destination.
342/// \param Value Stored value.
343/// \param DL Datalayout.
344static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
345 uint64_t OldAllocaOffsetInBits,
346 uint64_t SliceSizeInBits, Instruction *OldInst,
347 Instruction *Inst, Value *Dest, Value *Value,
348 const DataLayout &DL) {
349 // If we want allocas to be migrated using this helper then we need to ensure
350 // that the BaseFragments map code still works. A simple solution would be
351 // to choose to always clone alloca dbg_assigns (rather than sometimes
352 // "stealing" them).
353 assert(!isa<AllocaInst>(Inst) && "Unexpected alloca");
354
355 auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(Inst: OldInst);
356 // Nothing to do if OldInst has no linked dbg.assign intrinsics.
357 if (DVRAssignMarkerRange.empty())
358 return;
359
360 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n");
361 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n");
362 LLVM_DEBUG(dbgs() << " IsSplit: " << IsSplit << "\n");
363 LLVM_DEBUG(dbgs() << " OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
364 << "\n");
365 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n");
366 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n");
367 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n");
368 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n");
369 if (Value)
370 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n");
371
372 /// Map of aggregate variables to their fragment associated with OldAlloca.
373 DenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
374 BaseFragments;
375 for (auto *DVR : at::getDVRAssignmentMarkers(Inst: OldAlloca))
376 BaseFragments[getAggregateVariable(DVR)] =
377 DVR->getExpression()->getFragmentInfo();
378
379 // The new inst needs a DIAssignID unique metadata tag (if OldInst has
380 // one). It shouldn't already have one: assert this assumption.
381 assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
382 DIAssignID *NewID = nullptr;
383 auto &Ctx = Inst->getContext();
384 DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
385 assert(OldAlloca->isStaticAlloca());
386
387 auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) {
388 LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
389 << "\n");
390 auto *Expr = DbgAssign->getExpression();
391 bool SetKillLocation = false;
392
393 if (IsSplit) {
394 std::optional<DIExpression::FragmentInfo> BaseFragment;
395 {
396 auto R = BaseFragments.find(Val: getAggregateVariable(DVR: DbgAssign));
397 if (R == BaseFragments.end())
398 return;
399 BaseFragment = R->second;
400 }
401 std::optional<DIExpression::FragmentInfo> CurrentFragment =
402 Expr->getFragmentInfo();
403 DIExpression::FragmentInfo NewFragment;
404 FragCalcResult Result = calculateFragment(
405 Variable: DbgAssign->getVariable(), NewStorageSliceOffsetInBits: OldAllocaOffsetInBits, NewStorageSliceSizeInBits: SliceSizeInBits,
406 StorageFragment: BaseFragment, CurrentFragment, Target&: NewFragment);
407
408 if (Result == Skip)
409 return;
410 if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
411 if (CurrentFragment) {
412 // Rewrite NewFragment to be relative to the existing one (this is
413 // what createFragmentExpression wants). CalculateFragment has
414 // already resolved the size for us. FIXME: Should it return the
415 // relative fragment too?
416 NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
417 }
418 // Add the new fragment info to the existing expression if possible.
419 if (auto E = DIExpression::createFragmentExpression(
420 Expr, OffsetInBits: NewFragment.OffsetInBits, SizeInBits: NewFragment.SizeInBits)) {
421 Expr = *E;
422 } else {
423 // Otherwise, add the new fragment info to an empty expression and
424 // discard the value component of this dbg.assign as the value cannot
425 // be computed with the new fragment.
426 Expr = *DIExpression::createFragmentExpression(
427 Expr: DIExpression::get(Context&: Expr->getContext(), Elements: {}),
428 OffsetInBits: NewFragment.OffsetInBits, SizeInBits: NewFragment.SizeInBits);
429 SetKillLocation = true;
430 }
431 }
432 }
433
434 // If we haven't created a DIAssignID ID do that now and attach it to Inst.
435 if (!NewID) {
436 NewID = DIAssignID::getDistinct(Context&: Ctx);
437 Inst->setMetadata(KindID: LLVMContext::MD_DIAssignID, Node: NewID);
438 }
439
440 DbgVariableRecord *NewAssign;
441 if (IsSplit) {
442 ::Value *NewValue = Value ? Value : DbgAssign->getValue();
443 NewAssign = cast<DbgVariableRecord>(Val: cast<DbgRecord *>(
444 Val: DIB.insertDbgAssign(LinkedInstr: Inst, Val: NewValue, SrcVar: DbgAssign->getVariable(), ValExpr: Expr,
445 Addr: Dest, AddrExpr: DIExpression::get(Context&: Expr->getContext(), Elements: {}),
446 DL: DbgAssign->getDebugLoc())));
447 } else {
448 // The store is not split, simply steal the existing dbg_assign.
449 NewAssign = DbgAssign;
450 NewAssign->setAssignId(NewID); // FIXME: Can we avoid generating new IDs?
451 NewAssign->setAddress(Dest);
452 if (Value)
453 NewAssign->replaceVariableLocationOp(OpIdx: 0u, NewValue: Value);
454 assert(Expr == NewAssign->getExpression());
455 }
456
457 // If we've updated the value but the original dbg.assign has an arglist
458 // then kill it now - we can't use the requested new value.
459 // We can't replace the DIArgList with the new value as it'd leave
460 // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
461 // an arglist). And we can't keep the DIArgList in case the linked store
462 // is being split - in which case the DIArgList + expression may no longer
463 // be computing the correct value.
464 // This should be a very rare situation as it requires the value being
465 // stored to differ from the dbg.assign (i.e., the value has been
466 // represented differently in the debug intrinsic for some reason).
467 SetKillLocation |=
468 Value && (DbgAssign->hasArgList() ||
469 !DbgAssign->getExpression()->isSingleLocationExpression());
470 if (SetKillLocation)
471 NewAssign->setKillLocation();
472
473 // We could use more precision here at the cost of some additional (code)
474 // complexity - if the original dbg.assign was adjacent to its store, we
475 // could position this new dbg.assign adjacent to its store rather than the
476 // old dbg.assgn. That would result in interleaved dbg.assigns rather than
477 // what we get now:
478 // split store !1
479 // split store !2
480 // dbg.assign !1
481 // dbg.assign !2
482 // This (current behaviour) results results in debug assignments being
483 // noted as slightly offset (in code) from the store. In practice this
484 // should have little effect on the debugging experience due to the fact
485 // that all the split stores should get the same line number.
486 if (NewAssign != DbgAssign) {
487 NewAssign->moveBefore(MoveBefore: DbgAssign->getIterator());
488 NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
489 }
490 LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
491 };
492
493 for_each(Range&: DVRAssignMarkerRange, F: MigrateDbgAssign);
494}
495
496namespace {
497
498/// A custom IRBuilder inserter which prefixes all names, but only in
499/// Assert builds.
500class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
501 std::string Prefix;
502
503 Twine getNameWithPrefix(const Twine &Name) const {
504 return Name.isTriviallyEmpty() ? Name : Prefix + Name;
505 }
506
507public:
508 void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
509
510 void InsertHelper(Instruction *I, const Twine &Name,
511 BasicBlock::iterator InsertPt) const override {
512 IRBuilderDefaultInserter::InsertHelper(I, Name: getNameWithPrefix(Name),
513 InsertPt);
514 }
515};
516
517/// Provide a type for IRBuilder that drops names in release builds.
518using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
519
520/// A used slice of an alloca.
521///
522/// This structure represents a slice of an alloca used by some instruction. It
523/// stores both the begin and end offsets of this use, a pointer to the use
524/// itself, and a flag indicating whether we can classify the use as splittable
525/// or not when forming partitions of the alloca.
526class Slice {
527 /// The beginning offset of the range.
528 uint64_t BeginOffset = 0;
529
530 /// The ending offset, not included in the range.
531 uint64_t EndOffset = 0;
532
533 /// Storage for both the use of this slice and whether it can be
534 /// split.
535 PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
536
537public:
538 Slice() = default;
539
540 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
541 : BeginOffset(BeginOffset), EndOffset(EndOffset),
542 UseAndIsSplittable(U, IsSplittable) {}
543
544 uint64_t beginOffset() const { return BeginOffset; }
545 uint64_t endOffset() const { return EndOffset; }
546
547 bool isSplittable() const { return UseAndIsSplittable.getInt(); }
548 void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
549
550 Use *getUse() const { return UseAndIsSplittable.getPointer(); }
551
552 bool isDead() const { return getUse() == nullptr; }
553 void kill() { UseAndIsSplittable.setPointer(nullptr); }
554
555 /// Support for ordering ranges.
556 ///
557 /// This provides an ordering over ranges such that start offsets are
558 /// always increasing, and within equal start offsets, the end offsets are
559 /// decreasing. Thus the spanning range comes first in a cluster with the
560 /// same start position.
561 bool operator<(const Slice &RHS) const {
562 if (beginOffset() < RHS.beginOffset())
563 return true;
564 if (beginOffset() > RHS.beginOffset())
565 return false;
566 if (isSplittable() != RHS.isSplittable())
567 return !isSplittable();
568 if (endOffset() > RHS.endOffset())
569 return true;
570 return false;
571 }
572
573 /// Support comparison with a single offset to allow binary searches.
574 [[maybe_unused]] friend bool operator<(const Slice &LHS, uint64_t RHSOffset) {
575 return LHS.beginOffset() < RHSOffset;
576 }
577 [[maybe_unused]] friend bool operator<(uint64_t LHSOffset, const Slice &RHS) {
578 return LHSOffset < RHS.beginOffset();
579 }
580
581 bool operator==(const Slice &RHS) const {
582 return isSplittable() == RHS.isSplittable() &&
583 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
584 }
585 bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
586};
587
588/// Representation of the alloca slices.
589///
590/// This class represents the slices of an alloca which are formed by its
591/// various uses. If a pointer escapes, we can't fully build a representation
592/// for the slices used and we reflect that in this structure. The uses are
593/// stored, sorted by increasing beginning offset and with unsplittable slices
594/// starting at a particular offset before splittable slices.
595class AllocaSlices {
596public:
597 /// Construct the slices of a particular alloca.
598 AllocaSlices(const DataLayout &DL, AllocaInst &AI);
599
600 /// Test whether a pointer to the allocation escapes our analysis.
601 ///
602 /// If this is true, the slices are never fully built and should be
603 /// ignored.
604 bool isEscaped() const { return PointerEscapingInstr; }
605 bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }
606
607 /// Support for iterating over the slices.
608 /// @{
609 using iterator = SmallVectorImpl<Slice>::iterator;
610 using range = iterator_range<iterator>;
611
612 iterator begin() { return Slices.begin(); }
613 iterator end() { return Slices.end(); }
614
615 using const_iterator = SmallVectorImpl<Slice>::const_iterator;
616 using const_range = iterator_range<const_iterator>;
617
618 const_iterator begin() const { return Slices.begin(); }
619 const_iterator end() const { return Slices.end(); }
620 /// @}
621
622 /// Erase a range of slices.
623 void erase(iterator Start, iterator Stop) { Slices.erase(CS: Start, CE: Stop); }
624
625 /// Insert new slices for this alloca.
626 ///
627 /// This moves the slices into the alloca's slices collection, and re-sorts
628 /// everything so that the usual ordering properties of the alloca's slices
629 /// hold.
630 void insert(ArrayRef<Slice> NewSlices) {
631 int OldSize = Slices.size();
632 Slices.append(in_start: NewSlices.begin(), in_end: NewSlices.end());
633 auto SliceI = Slices.begin() + OldSize;
634 std::stable_sort(first: SliceI, last: Slices.end());
635 std::inplace_merge(first: Slices.begin(), middle: SliceI, last: Slices.end());
636 }
637
638 // Forward declare the iterator and range accessor for walking the
639 // partitions.
640 class partition_iterator;
641 iterator_range<partition_iterator> partitions();
642
643 /// Access the dead users for this alloca.
644 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
645
646 /// Access Uses that should be dropped if the alloca is promotable.
647 ArrayRef<Use *> getDeadUsesIfPromotable() const {
648 return DeadUseIfPromotable;
649 }
650
651 /// Access the dead operands referring to this alloca.
652 ///
653 /// These are operands which have cannot actually be used to refer to the
654 /// alloca as they are outside its range and the user doesn't correct for
655 /// that. These mostly consist of PHI node inputs and the like which we just
656 /// need to replace with undef.
657 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
658
659#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
660 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
661 void printSlice(raw_ostream &OS, const_iterator I,
662 StringRef Indent = " ") const;
663 void printUse(raw_ostream &OS, const_iterator I,
664 StringRef Indent = " ") const;
665 void print(raw_ostream &OS) const;
666 void dump(const_iterator I) const;
667 void dump() const;
668#endif
669
670private:
671 template <typename DerivedT, typename RetT = void> class BuilderBase;
672 class SliceBuilder;
673
674 friend class AllocaSlices::SliceBuilder;
675
676#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
677 /// Handle to alloca instruction to simplify method interfaces.
678 AllocaInst &AI;
679#endif
680
681 /// The instruction responsible for this alloca not having a known set
682 /// of slices.
683 ///
684 /// When an instruction (potentially) escapes the pointer to the alloca, we
685 /// store a pointer to that here and abort trying to form slices of the
686 /// alloca. This will be null if the alloca slices are analyzed successfully.
687 Instruction *PointerEscapingInstr;
688 Instruction *PointerEscapingInstrReadOnly;
689
690 /// The slices of the alloca.
691 ///
692 /// We store a vector of the slices formed by uses of the alloca here. This
693 /// vector is sorted by increasing begin offset, and then the unsplittable
694 /// slices before the splittable ones. See the Slice inner class for more
695 /// details.
696 SmallVector<Slice, 8> Slices;
697
698 /// Instructions which will become dead if we rewrite the alloca.
699 ///
700 /// Note that these are not separated by slice. This is because we expect an
701 /// alloca to be completely rewritten or not rewritten at all. If rewritten,
702 /// all these instructions can simply be removed and replaced with poison as
703 /// they come from outside of the allocated space.
704 SmallVector<Instruction *, 8> DeadUsers;
705
706 /// Uses which will become dead if can promote the alloca.
707 SmallVector<Use *, 8> DeadUseIfPromotable;
708
709 /// Operands which will become dead if we rewrite the alloca.
710 ///
711 /// These are operands that in their particular use can be replaced with
712 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
713 /// to PHI nodes and the like. They aren't entirely dead (there might be
714 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
715 /// want to swap this particular input for poison to simplify the use lists of
716 /// the alloca.
717 SmallVector<Use *, 8> DeadOperands;
718};
719
720/// A partition of the slices.
721///
722/// An ephemeral representation for a range of slices which can be viewed as
723/// a partition of the alloca. This range represents a span of the alloca's
724/// memory which cannot be split, and provides access to all of the slices
725/// overlapping some part of the partition.
726///
727/// Objects of this type are produced by traversing the alloca's slices, but
728/// are only ephemeral and not persistent.
729class Partition {
730private:
731 friend class AllocaSlices;
732 friend class AllocaSlices::partition_iterator;
733
734 using iterator = AllocaSlices::iterator;
735
736 /// The beginning and ending offsets of the alloca for this
737 /// partition.
738 uint64_t BeginOffset = 0, EndOffset = 0;
739
740 /// The start and end iterators of this partition.
741 iterator SI, SJ;
742
743 /// A collection of split slice tails overlapping the partition.
744 SmallVector<Slice *, 4> SplitTails;
745
746 /// Raw constructor builds an empty partition starting and ending at
747 /// the given iterator.
748 Partition(iterator SI) : SI(SI), SJ(SI) {}
749
750public:
751 /// The start offset of this partition.
752 ///
753 /// All of the contained slices start at or after this offset.
754 uint64_t beginOffset() const { return BeginOffset; }
755
756 /// The end offset of this partition.
757 ///
758 /// All of the contained slices end at or before this offset.
759 uint64_t endOffset() const { return EndOffset; }
760
761 /// The size of the partition.
762 ///
763 /// Note that this can never be zero.
764 uint64_t size() const {
765 assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
766 return EndOffset - BeginOffset;
767 }
768
769 /// Test whether this partition contains no slices, and merely spans
770 /// a region occupied by split slices.
771 bool empty() const { return SI == SJ; }
772
773 /// \name Iterate slices that start within the partition.
774 /// These may be splittable or unsplittable. They have a begin offset >= the
775 /// partition begin offset.
776 /// @{
777 // FIXME: We should probably define a "concat_iterator" helper and use that
778 // to stitch together pointee_iterators over the split tails and the
779 // contiguous iterators of the partition. That would give a much nicer
780 // interface here. We could then additionally expose filtered iterators for
781 // split, unsplit, and unsplittable splices based on the usage patterns.
782 iterator begin() const { return SI; }
783 iterator end() const { return SJ; }
784 /// @}
785
786 /// Get the sequence of split slice tails.
787 ///
788 /// These tails are of slices which start before this partition but are
789 /// split and overlap into the partition. We accumulate these while forming
790 /// partitions.
791 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
792};
793
794} // end anonymous namespace
795
796/// An iterator over partitions of the alloca's slices.
797///
798/// This iterator implements the core algorithm for partitioning the alloca's
799/// slices. It is a forward iterator as we don't support backtracking for
800/// efficiency reasons, and re-use a single storage area to maintain the
801/// current set of split slices.
802///
/// It operates directly on `AllocaSlices::iterator`; the class itself is not
/// a template.
805class AllocaSlices::partition_iterator
806 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
807 Partition> {
808 friend class AllocaSlices;
809
810 /// Most of the state for walking the partitions is held in a class
811 /// with a nice interface for examining them.
812 Partition P;
813
814 /// We need to keep the end of the slices to know when to stop.
815 AllocaSlices::iterator SE;
816
817 /// We also need to keep track of the maximum split end offset seen.
818 /// FIXME: Do we really?
819 uint64_t MaxSplitSliceEndOffset = 0;
820
821 /// Sets the partition to be empty at given iterator, and sets the
822 /// end iterator.
823 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
824 : P(SI), SE(SE) {
825 // If not already at the end, advance our state to form the initial
826 // partition.
827 if (SI != SE)
828 advance();
829 }
830
831 /// Advance the iterator to the next partition.
832 ///
833 /// Requires that the iterator not be at the end of the slices.
834 void advance() {
835 assert((P.SI != SE || !P.SplitTails.empty()) &&
836 "Cannot advance past the end of the slices!");
837
838 // Clear out any split uses which have ended.
839 if (!P.SplitTails.empty()) {
840 if (P.EndOffset >= MaxSplitSliceEndOffset) {
841 // If we've finished all splits, this is easy.
842 P.SplitTails.clear();
843 MaxSplitSliceEndOffset = 0;
844 } else {
845 // Remove the uses which have ended in the prior partition. This
846 // cannot change the max split slice end because we just checked that
847 // the prior partition ended prior to that max.
848 llvm::erase_if(C&: P.SplitTails,
849 P: [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
850 assert(llvm::any_of(P.SplitTails,
851 [&](Slice *S) {
852 return S->endOffset() == MaxSplitSliceEndOffset;
853 }) &&
854 "Could not find the current max split slice offset!");
855 assert(llvm::all_of(P.SplitTails,
856 [&](Slice *S) {
857 return S->endOffset() <= MaxSplitSliceEndOffset;
858 }) &&
859 "Max split slice end offset is not actually the max!");
860 }
861 }
862
863 // If P.SI is already at the end, then we've cleared the split tail and
864 // now have an end iterator.
865 if (P.SI == SE) {
866 assert(P.SplitTails.empty() && "Failed to clear the split slices!");
867 return;
868 }
869
870 // If we had a non-empty partition previously, set up the state for
871 // subsequent partitions.
872 if (P.SI != P.SJ) {
873 // Accumulate all the splittable slices which started in the old
874 // partition into the split list.
875 for (Slice &S : P)
876 if (S.isSplittable() && S.endOffset() > P.EndOffset) {
877 P.SplitTails.push_back(Elt: &S);
878 MaxSplitSliceEndOffset =
879 std::max(a: S.endOffset(), b: MaxSplitSliceEndOffset);
880 }
881
882 // Start from the end of the previous partition.
883 P.SI = P.SJ;
884
885 // If P.SI is now at the end, we at most have a tail of split slices.
886 if (P.SI == SE) {
887 P.BeginOffset = P.EndOffset;
888 P.EndOffset = MaxSplitSliceEndOffset;
889 return;
890 }
891
892 // If the we have split slices and the next slice is after a gap and is
893 // not splittable immediately form an empty partition for the split
894 // slices up until the next slice begins.
895 if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
896 !P.SI->isSplittable()) {
897 P.BeginOffset = P.EndOffset;
898 P.EndOffset = P.SI->beginOffset();
899 return;
900 }
901 }
902
903 // OK, we need to consume new slices. Set the end offset based on the
904 // current slice, and step SJ past it. The beginning offset of the
905 // partition is the beginning offset of the next slice unless we have
906 // pre-existing split slices that are continuing, in which case we begin
907 // at the prior end offset.
908 P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
909 P.EndOffset = P.SI->endOffset();
910 ++P.SJ;
911
912 // There are two strategies to form a partition based on whether the
913 // partition starts with an unsplittable slice or a splittable slice.
914 if (!P.SI->isSplittable()) {
915 // When we're forming an unsplittable region, it must always start at
916 // the first slice and will extend through its end.
917 assert(P.BeginOffset == P.SI->beginOffset());
918
919 // Form a partition including all of the overlapping slices with this
920 // unsplittable slice.
921 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
922 if (!P.SJ->isSplittable())
923 P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
924 ++P.SJ;
925 }
926
927 // We have a partition across a set of overlapping unsplittable
928 // partitions.
929 return;
930 }
931
932 // If we're starting with a splittable slice, then we need to form
933 // a synthetic partition spanning it and any other overlapping splittable
934 // splices.
935 assert(P.SI->isSplittable() && "Forming a splittable partition!");
936
937 // Collect all of the overlapping splittable slices.
938 while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
939 P.SJ->isSplittable()) {
940 P.EndOffset = std::max(a: P.EndOffset, b: P.SJ->endOffset());
941 ++P.SJ;
942 }
943
944 // Back upiP.EndOffset if we ended the span early when encountering an
945 // unsplittable slice. This synthesizes the early end offset of
946 // a partition spanning only splittable slices.
947 if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
948 assert(!P.SJ->isSplittable());
949 P.EndOffset = P.SJ->beginOffset();
950 }
951 }
952
953public:
954 bool operator==(const partition_iterator &RHS) const {
955 assert(SE == RHS.SE &&
956 "End iterators don't match between compared partition iterators!");
957
958 // The observed positions of partitions is marked by the P.SI iterator and
959 // the emptiness of the split slices. The latter is only relevant when
960 // P.SI == SE, as the end iterator will additionally have an empty split
961 // slices list, but the prior may have the same P.SI and a tail of split
962 // slices.
963 if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
964 assert(P.SJ == RHS.P.SJ &&
965 "Same set of slices formed two different sized partitions!");
966 assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
967 "Same slice position with differently sized non-empty split "
968 "slice tails!");
969 return true;
970 }
971 return false;
972 }
973
974 partition_iterator &operator++() {
975 advance();
976 return *this;
977 }
978
979 Partition &operator*() { return P; }
980};
981
982/// A forward range over the partitions of the alloca's slices.
983///
984/// This accesses an iterator range over the partitions of the alloca's
985/// slices. It computes these partitions on the fly based on the overlapping
986/// offsets of the slices and the ability to split them. It will visit "empty"
987/// partitions to cover regions of the alloca only accessed via split
988/// slices.
989iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
990 return make_range(x: partition_iterator(begin(), end()),
991 y: partition_iterator(end(), end()));
992}
993
994static Value *foldSelectInst(SelectInst &SI) {
995 // If the condition being selected on is a constant or the same value is
996 // being selected between, fold the select. Yes this does (rarely) happen
997 // early on.
998 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: SI.getCondition()))
999 return SI.getOperand(i_nocapture: 1 + CI->isZero());
1000 if (SI.getOperand(i_nocapture: 1) == SI.getOperand(i_nocapture: 2))
1001 return SI.getOperand(i_nocapture: 1);
1002
1003 return nullptr;
1004}
1005
1006/// A helper that folds a PHI node or a select.
1007static Value *foldPHINodeOrSelectInst(Instruction &I) {
1008 if (PHINode *PN = dyn_cast<PHINode>(Val: &I)) {
1009 // If PN merges together the same value, return that value.
1010 return PN->hasConstantValue();
1011 }
1012 return foldSelectInst(SI&: cast<SelectInst>(Val&: I));
1013}
1014
1015/// Builder for the alloca slices.
1016///
1017/// This class builds a set of alloca slices by recursively visiting the uses
1018/// of an alloca and making a slice for each load and store at each offset.
1019class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
1020 friend class PtrUseVisitor<SliceBuilder>;
1021 friend class InstVisitor<SliceBuilder>;
1022
1023 using Base = PtrUseVisitor<SliceBuilder>;
1024
1025 const uint64_t AllocSize;
1026 AllocaSlices &AS;
1027
1028 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
1029 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
1030
1031 /// Set to de-duplicate dead instructions found in the use walk.
1032 SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
1033
1034public:
1035 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
1036 : PtrUseVisitor<SliceBuilder>(DL),
1037 AllocSize(AI.getAllocationSize(DL)->getFixedValue()), AS(AS) {}
1038
1039private:
1040 void markAsDead(Instruction &I) {
1041 if (VisitedDeadInsts.insert(Ptr: &I).second)
1042 AS.DeadUsers.push_back(Elt: &I);
1043 }
1044
1045 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
1046 bool IsSplittable = false) {
1047 // Completely skip uses which have a zero size or start either before or
1048 // past the end of the allocation.
1049 if (Size == 0 || Offset.uge(RHS: AllocSize)) {
1050 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
1051 << Offset
1052 << " which has zero size or starts outside of the "
1053 << AllocSize << " byte alloca:\n"
1054 << " alloca: " << AS.AI << "\n"
1055 << " use: " << I << "\n");
1056 return markAsDead(I);
1057 }
1058
1059 uint64_t BeginOffset = Offset.getZExtValue();
1060 uint64_t EndOffset = BeginOffset + Size;
1061
1062 // Clamp the end offset to the end of the allocation. Note that this is
1063 // formulated to handle even the case where "BeginOffset + Size" overflows.
1064 // This may appear superficially to be something we could ignore entirely,
1065 // but that is not so! There may be widened loads or PHI-node uses where
1066 // some instructions are dead but not others. We can't completely ignore
1067 // them, and so have to record at least the information here.
1068 assert(AllocSize >= BeginOffset); // Established above.
1069 if (Size > AllocSize - BeginOffset) {
1070 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
1071 << Offset << " to remain within the " << AllocSize
1072 << " byte alloca:\n"
1073 << " alloca: " << AS.AI << "\n"
1074 << " use: " << I << "\n");
1075 EndOffset = AllocSize;
1076 }
1077
1078 AS.Slices.push_back(Elt: Slice(BeginOffset, EndOffset, U, IsSplittable));
1079 }
1080
1081 void visitBitCastInst(BitCastInst &BC) {
1082 if (BC.use_empty())
1083 return markAsDead(I&: BC);
1084
1085 return Base::visitBitCastInst(BC);
1086 }
1087
1088 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1089 if (ASC.use_empty())
1090 return markAsDead(I&: ASC);
1091
1092 return Base::visitAddrSpaceCastInst(ASC);
1093 }
1094
1095 void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1096 if (GEPI.use_empty())
1097 return markAsDead(I&: GEPI);
1098
1099 return Base::visitGetElementPtrInst(GEPI);
1100 }
1101
1102 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
1103 uint64_t Size, bool IsVolatile) {
1104 // We allow splitting of non-volatile loads and stores where the type is an
1105 // integer type. These may be used to implement 'memcpy' or other "transfer
1106 // of bits" patterns.
1107 bool IsSplittable =
1108 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);
1109
1110 insertUse(I, Offset, Size, IsSplittable);
1111 }
1112
1113 void visitLoadInst(LoadInst &LI) {
1114 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
1115 "All simple FCA loads should have been pre-split");
1116
1117 // If there is a load with an unknown offset, we can still perform store
1118 // to load forwarding for other known-offset loads.
1119 if (!IsOffsetKnown)
1120 return PI.setEscapedReadOnly(&LI);
1121
1122 TypeSize Size = DL.getTypeStoreSize(Ty: LI.getType());
1123 if (Size.isScalable()) {
1124 unsigned VScale = LI.getFunction()->getVScaleValue();
1125 if (!VScale)
1126 return PI.setAborted(&LI);
1127
1128 Size = TypeSize::getFixed(ExactSize: Size.getKnownMinValue() * VScale);
1129 }
1130
1131 return handleLoadOrStore(Ty: LI.getType(), I&: LI, Offset, Size: Size.getFixedValue(),
1132 IsVolatile: LI.isVolatile());
1133 }
1134
1135 void visitStoreInst(StoreInst &SI) {
1136 Value *ValOp = SI.getValueOperand();
1137 if (ValOp == *U)
1138 return PI.setEscapedAndAborted(&SI);
1139 if (!IsOffsetKnown)
1140 return PI.setAborted(&SI);
1141
1142 TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValOp->getType());
1143 if (StoreSize.isScalable()) {
1144 unsigned VScale = SI.getFunction()->getVScaleValue();
1145 if (!VScale)
1146 return PI.setAborted(&SI);
1147
1148 StoreSize = TypeSize::getFixed(ExactSize: StoreSize.getKnownMinValue() * VScale);
1149 }
1150
1151 uint64_t Size = StoreSize.getFixedValue();
1152
1153 // If this memory access can be shown to *statically* extend outside the
1154 // bounds of the allocation, it's behavior is undefined, so simply
1155 // ignore it. Note that this is more strict than the generic clamping
1156 // behavior of insertUse. We also try to handle cases which might run the
1157 // risk of overflow.
1158 // FIXME: We should instead consider the pointer to have escaped if this
1159 // function is being instrumented for addressing bugs or race conditions.
1160 if (Size > AllocSize || Offset.ugt(RHS: AllocSize - Size)) {
1161 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
1162 << Offset << " which extends past the end of the "
1163 << AllocSize << " byte alloca:\n"
1164 << " alloca: " << AS.AI << "\n"
1165 << " use: " << SI << "\n");
1166 return markAsDead(I&: SI);
1167 }
1168
1169 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
1170 "All simple FCA stores should have been pre-split");
1171 handleLoadOrStore(Ty: ValOp->getType(), I&: SI, Offset, Size, IsVolatile: SI.isVolatile());
1172 }
1173
1174 void visitMemSetInst(MemSetInst &II) {
1175 assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1176 ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1177 if ((Length && Length->getValue() == 0) ||
1178 (IsOffsetKnown && Offset.uge(RHS: AllocSize)))
1179 // Zero-length mem transfer intrinsics can be ignored entirely.
1180 return markAsDead(I&: II);
1181
1182 if (!IsOffsetKnown)
1183 return PI.setAborted(&II);
1184
1185 insertUse(I&: II, Offset,
1186 Size: Length ? Length->getLimitedValue()
1187 : AllocSize - Offset.getLimitedValue(),
1188 IsSplittable: (bool)Length);
1189 }
1190
1191 void visitMemTransferInst(MemTransferInst &II) {
1192 ConstantInt *Length = dyn_cast<ConstantInt>(Val: II.getLength());
1193 if (Length && Length->getValue() == 0)
1194 // Zero-length mem transfer intrinsics can be ignored entirely.
1195 return markAsDead(I&: II);
1196
1197 // Because we can visit these intrinsics twice, also check to see if the
1198 // first time marked this instruction as dead. If so, skip it.
1199 if (VisitedDeadInsts.count(Ptr: &II))
1200 return;
1201
1202 if (!IsOffsetKnown)
1203 return PI.setAborted(&II);
1204
1205 // This side of the transfer is completely out-of-bounds, and so we can
1206 // nuke the entire transfer. However, we also need to nuke the other side
1207 // if already added to our partitions.
1208 // FIXME: Yet another place we really should bypass this when
1209 // instrumenting for ASan.
1210 if (Offset.uge(RHS: AllocSize)) {
1211 auto MTPI = MemTransferSliceMap.find(Val: &II);
1212 if (MTPI != MemTransferSliceMap.end())
1213 AS.Slices[MTPI->second].kill();
1214 return markAsDead(I&: II);
1215 }
1216
1217 uint64_t RawOffset = Offset.getLimitedValue();
1218 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1219
1220 // Check for the special case where the same exact value is used for both
1221 // source and dest.
1222 if (*U == II.getRawDest() && *U == II.getRawSource()) {
1223 // For non-volatile transfers this is a no-op.
1224 if (!II.isVolatile())
1225 return markAsDead(I&: II);
1226
1227 return insertUse(I&: II, Offset, Size, /*IsSplittable=*/false);
1228 }
1229
1230 // If we have seen both source and destination for a mem transfer, then
1231 // they both point to the same alloca.
1232 bool Inserted;
1233 SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1234 std::tie(args&: MTPI, args&: Inserted) =
1235 MemTransferSliceMap.insert(KV: std::make_pair(x: &II, y: AS.Slices.size()));
1236 unsigned PrevIdx = MTPI->second;
1237 if (!Inserted) {
1238 Slice &PrevP = AS.Slices[PrevIdx];
1239
1240 // Check if the begin offsets match and this is a non-volatile transfer.
1241 // In that case, we can completely elide the transfer.
1242 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1243 PrevP.kill();
1244 return markAsDead(I&: II);
1245 }
1246
1247 // Otherwise we have an offset transfer within the same alloca. We can't
1248 // split those.
1249 PrevP.makeUnsplittable();
1250 }
1251
1252 // Insert the use now that we've fixed up the splittable nature.
1253 insertUse(I&: II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1254
1255 // Check that we ended up with a valid index in the map.
1256 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1257 "Map index doesn't point back to a slice with this user.");
1258 }
1259
1260 // Disable SRoA for any intrinsics except for lifetime invariants.
1261 // FIXME: What about debug intrinsics? This matches old behavior, but
1262 // doesn't make sense.
1263 void visitIntrinsicInst(IntrinsicInst &II) {
1264 if (II.isDroppable()) {
1265 AS.DeadUseIfPromotable.push_back(Elt: U);
1266 return;
1267 }
1268
1269 if (!IsOffsetKnown)
1270 return PI.setAborted(&II);
1271
1272 if (II.isLifetimeStartOrEnd()) {
1273 insertUse(I&: II, Offset, Size: AllocSize, IsSplittable: true);
1274 return;
1275 }
1276
1277 Base::visitIntrinsicInst(II);
1278 }
1279
1280 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1281 // We consider any PHI or select that results in a direct load or store of
1282 // the same offset to be a viable use for slicing purposes. These uses
1283 // are considered unsplittable and the size is the maximum loaded or stored
1284 // size.
1285 SmallPtrSet<Instruction *, 4> Visited;
1286 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1287 Visited.insert(Ptr: Root);
1288 Uses.push_back(Elt: std::make_pair(x: cast<Instruction>(Val&: *U), y&: Root));
1289 const DataLayout &DL = Root->getDataLayout();
1290 // If there are no loads or stores, the access is dead. We mark that as
1291 // a size zero access.
1292 Size = 0;
1293 do {
1294 Instruction *I, *UsedI;
1295 std::tie(args&: UsedI, args&: I) = Uses.pop_back_val();
1296
1297 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
1298 TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
1299 if (LoadSize.isScalable()) {
1300 PI.setAborted(LI);
1301 return nullptr;
1302 }
1303 Size = std::max(a: Size, b: LoadSize.getFixedValue());
1304 continue;
1305 }
1306 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
1307 Value *Op = SI->getOperand(i_nocapture: 0);
1308 if (Op == UsedI)
1309 return SI;
1310 TypeSize StoreSize = DL.getTypeStoreSize(Ty: Op->getType());
1311 if (StoreSize.isScalable()) {
1312 PI.setAborted(SI);
1313 return nullptr;
1314 }
1315 Size = std::max(a: Size, b: StoreSize.getFixedValue());
1316 continue;
1317 }
1318
1319 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: I)) {
1320 if (!GEP->hasAllZeroIndices())
1321 return GEP;
1322 } else if (!isa<BitCastInst>(Val: I) && !isa<PHINode>(Val: I) &&
1323 !isa<SelectInst>(Val: I) && !isa<AddrSpaceCastInst>(Val: I)) {
1324 return I;
1325 }
1326
1327 for (User *U : I->users())
1328 if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
1329 Uses.push_back(Elt: std::make_pair(x&: I, y: cast<Instruction>(Val: U)));
1330 } while (!Uses.empty());
1331
1332 return nullptr;
1333 }
1334
1335 void visitPHINodeOrSelectInst(Instruction &I) {
1336 assert(isa<PHINode>(I) || isa<SelectInst>(I));
1337 if (I.use_empty())
1338 return markAsDead(I);
1339
1340 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
1341 // instructions in this BB, which may be required during rewriting. Bail out
1342 // on these cases.
1343 if (isa<PHINode>(Val: I) && !I.getParent()->hasInsertionPt())
1344 return PI.setAborted(&I);
1345
1346 // TODO: We could use simplifyInstruction here to fold PHINodes and
1347 // SelectInsts. However, doing so requires to change the current
1348 // dead-operand-tracking mechanism. For instance, suppose neither loading
1349 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1350 // trap either. However, if we simply replace %U with undef using the
1351 // current dead-operand-tracking mechanism, "load (select undef, undef,
1352 // %other)" may trap because the select may return the first operand
1353 // "undef".
1354 if (Value *Result = foldPHINodeOrSelectInst(I)) {
1355 if (Result == *U)
1356 // If the result of the constant fold will be the pointer, recurse
1357 // through the PHI/select as if we had RAUW'ed it.
1358 enqueueUsers(I);
1359 else
1360 // Otherwise the operand to the PHI/select is dead, and we can replace
1361 // it with poison.
1362 AS.DeadOperands.push_back(Elt: U);
1363
1364 return;
1365 }
1366
1367 if (!IsOffsetKnown)
1368 return PI.setAborted(&I);
1369
1370 // See if we already have computed info on this node.
1371 uint64_t &Size = PHIOrSelectSizes[&I];
1372 if (!Size) {
1373 // This is a new PHI/Select, check for an unsafe use of it.
1374 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(Root: &I, Size))
1375 return PI.setAborted(UnsafeI);
1376 }
1377
1378 // For PHI and select operands outside the alloca, we can't nuke the entire
1379 // phi or select -- the other side might still be relevant, so we special
1380 // case them here and use a separate structure to track the operands
1381 // themselves which should be replaced with poison.
1382 // FIXME: This should instead be escaped in the event we're instrumenting
1383 // for address sanitization.
1384 if (Offset.uge(RHS: AllocSize)) {
1385 AS.DeadOperands.push_back(Elt: U);
1386 return;
1387 }
1388
1389 insertUse(I, Offset, Size);
1390 }
1391
1392 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(I&: PN); }
1393
1394 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(I&: SI); }
1395
1396 /// Disable SROA entirely if there are unhandled users of the alloca.
1397 void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1398
1399 void visitCallBase(CallBase &CB) {
1400 // If the call operand is read-only and only does a read-only or address
1401 // capture, then we mark it as EscapedReadOnly.
1402 if (CB.isDataOperand(U) &&
1403 !capturesFullProvenance(CC: CB.getCaptureInfo(OpNo: U->getOperandNo())) &&
1404 CB.onlyReadsMemory(OpNo: U->getOperandNo())) {
1405 PI.setEscapedReadOnly(&CB);
1406 return;
1407 }
1408
1409 Base::visitCallBase(CB);
1410 }
1411};
1412
1413AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1414 :
1415#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1416 AI(AI),
1417#endif
1418 PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1419 SliceBuilder PB(DL, AI, *this);
1420 SliceBuilder::PtrInfo PtrI = PB.visitPtr(I&: AI);
1421 if (PtrI.isEscaped() || PtrI.isAborted()) {
1422 // FIXME: We should sink the escape vs. abort info into the caller nicely,
1423 // possibly by just storing the PtrInfo in the AllocaSlices.
1424 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1425 : PtrI.getAbortingInst();
1426 assert(PointerEscapingInstr && "Did not track a bad instruction");
1427 return;
1428 }
1429 PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1430
1431 llvm::erase_if(C&: Slices, P: [](const Slice &S) { return S.isDead(); });
1432
1433 // Sort the uses. This arranges for the offsets to be in ascending order,
1434 // and the sizes to be in descending order.
1435 llvm::stable_sort(Range&: Slices);
1436}
1437
1438#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1439
1440void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1441 StringRef Indent) const {
1442 printSlice(OS, I, Indent);
1443 OS << "\n";
1444 printUse(OS, I, Indent);
1445}
1446
1447void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1448 StringRef Indent) const {
1449 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1450 << " slice #" << (I - begin())
1451 << (I->isSplittable() ? " (splittable)" : "");
1452}
1453
1454void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1455 StringRef Indent) const {
1456 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
1457}
1458
1459void AllocaSlices::print(raw_ostream &OS) const {
1460 if (PointerEscapingInstr) {
1461 OS << "Can't analyze slices for alloca: " << AI << "\n"
1462 << " A pointer to this alloca escaped by:\n"
1463 << " " << *PointerEscapingInstr << "\n";
1464 return;
1465 }
1466
1467 if (PointerEscapingInstrReadOnly)
1468 OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1469
1470 OS << "Slices of alloca: " << AI << "\n";
1471 for (const_iterator I = begin(), E = end(); I != E; ++I)
1472 print(OS, I);
1473}
1474
1475LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1476 print(dbgs(), I);
1477}
1478LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1479
1480#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1481
1482/// Walk the range of a partitioning looking for a common type to cover this
1483/// sequence of slices.
1484static std::pair<Type *, IntegerType *>
1485findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1486 uint64_t EndOffset) {
1487 Type *Ty = nullptr;
1488 bool TyIsCommon = true;
1489 IntegerType *ITy = nullptr;
1490
1491 // Note that we need to look at *every* alloca slice's Use to ensure we
1492 // always get consistent results regardless of the order of slices.
1493 for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1494 Use *U = I->getUse();
1495 if (isa<IntrinsicInst>(Val: *U->getUser()))
1496 continue;
1497 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1498 continue;
1499
1500 Type *UserTy = nullptr;
1501 if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
1502 UserTy = LI->getType();
1503 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
1504 UserTy = SI->getValueOperand()->getType();
1505 }
1506
1507 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(Val: UserTy)) {
1508 // If the type is larger than the partition, skip it. We only encounter
1509 // this for split integer operations where we want to use the type of the
1510 // entity causing the split. Also skip if the type is not a byte width
1511 // multiple.
1512 if (UserITy->getBitWidth() % 8 != 0 ||
1513 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1514 continue;
1515
1516 // Track the largest bitwidth integer type used in this way in case there
1517 // is no common type.
1518 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1519 ITy = UserITy;
1520 }
1521
1522 // To avoid depending on the order of slices, Ty and TyIsCommon must not
1523 // depend on types skipped above.
1524 if (!UserTy || (Ty && Ty != UserTy))
1525 TyIsCommon = false; // Give up on anything but an iN type.
1526 else
1527 Ty = UserTy;
1528 }
1529
1530 return {TyIsCommon ? Ty : nullptr, ITy};
1531}
1532
1533/// PHI instructions that use an alloca and are subsequently loaded can be
1534/// rewritten to load both input pointers in the pred blocks and then PHI the
1535/// results, allowing the load of the alloca to be promoted.
1536/// From this:
1537/// %P2 = phi [i32* %Alloca, i32* %Other]
1538/// %V = load i32* %P2
1539/// to:
1540/// %V1 = load i32* %Alloca -> will be mem2reg'd
1541/// ...
1542/// %V2 = load i32* %Other
1543/// ...
1544/// %V = phi [i32 %V1, i32 %V2]
1545///
1546/// We can do this to a select if its only uses are loads and if the operands
1547/// to the select can be loaded unconditionally.
1548///
1549/// FIXME: This should be hoisted into a generic utility, likely in
1550/// Transforms/Util/Local.h
1551static bool isSafePHIToSpeculate(PHINode &PN) {
1552 const DataLayout &DL = PN.getDataLayout();
1553
1554 // For now, we can only do this promotion if the load is in the same block
1555 // as the PHI, and if there are no stores between the phi and load.
1556 // TODO: Allow recursive phi users.
1557 // TODO: Allow stores.
1558 BasicBlock *BB = PN.getParent();
1559 Align MaxAlign;
1560 uint64_t APWidth = DL.getIndexTypeSizeInBits(Ty: PN.getType());
1561 Type *LoadType = nullptr;
1562 for (User *U : PN.users()) {
1563 LoadInst *LI = dyn_cast<LoadInst>(Val: U);
1564 if (!LI || !LI->isSimple())
1565 return false;
1566
1567 // For now we only allow loads in the same block as the PHI. This is
1568 // a common case that happens when instcombine merges two loads through
1569 // a PHI.
1570 if (LI->getParent() != BB)
1571 return false;
1572
1573 if (LoadType) {
1574 if (LoadType != LI->getType())
1575 return false;
1576 } else {
1577 LoadType = LI->getType();
1578 }
1579
1580 // Ensure that there are no instructions between the PHI and the load that
1581 // could store.
1582 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1583 if (BBI->mayWriteToMemory())
1584 return false;
1585
1586 MaxAlign = std::max(a: MaxAlign, b: LI->getAlign());
1587 }
1588
1589 if (!LoadType)
1590 return false;
1591
1592 APInt LoadSize =
1593 APInt(APWidth, DL.getTypeStoreSize(Ty: LoadType).getFixedValue());
1594
1595 // We can only transform this if it is safe to push the loads into the
1596 // predecessor blocks. The only thing to watch out for is that we can't put
1597 // a possibly trapping load in the predecessor if it is a critical edge.
1598 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1599 Instruction *TI = PN.getIncomingBlock(i: Idx)->getTerminator();
1600 Value *InVal = PN.getIncomingValue(i: Idx);
1601
1602 // If the value is produced by the terminator of the predecessor (an
1603 // invoke) or it has side-effects, there is no valid place to put a load
1604 // in the predecessor.
1605 if (TI == InVal || TI->mayHaveSideEffects())
1606 return false;
1607
1608 // If the predecessor has a single successor, then the edge isn't
1609 // critical.
1610 if (TI->getNumSuccessors() == 1)
1611 continue;
1612
1613 // If this pointer is always safe to load, or if we can prove that there
1614 // is already a load in the block, then we can move the load to the pred
1615 // block.
1616 if (isSafeToLoadUnconditionally(V: InVal, Alignment: MaxAlign, Size: LoadSize, DL, ScanFrom: TI))
1617 continue;
1618
1619 return false;
1620 }
1621
1622 return true;
1623}
1624
1625static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1626 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
1627
1628 LoadInst *SomeLoad = cast<LoadInst>(Val: PN.user_back());
1629 Type *LoadTy = SomeLoad->getType();
1630 IRB.SetInsertPoint(&PN);
1631 PHINode *NewPN = IRB.CreatePHI(Ty: LoadTy, NumReservedValues: PN.getNumIncomingValues(),
1632 Name: PN.getName() + ".sroa.speculated");
1633
1634 // Get the AA tags and alignment to use from one of the loads. It does not
1635 // matter which one we get and if any differ.
1636 AAMDNodes AATags = SomeLoad->getAAMetadata();
1637 Align Alignment = SomeLoad->getAlign();
1638
1639 // Rewrite all loads of the PN to use the new PHI.
1640 while (!PN.use_empty()) {
1641 LoadInst *LI = cast<LoadInst>(Val: PN.user_back());
1642 LI->replaceAllUsesWith(V: NewPN);
1643 LI->eraseFromParent();
1644 }
1645
1646 // Inject loads into all of the pred blocks.
1647 DenseMap<BasicBlock *, Value *> InjectedLoads;
1648 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1649 BasicBlock *Pred = PN.getIncomingBlock(i: Idx);
1650 Value *InVal = PN.getIncomingValue(i: Idx);
1651
1652 // A PHI node is allowed to have multiple (duplicated) entries for the same
1653 // basic block, as long as the value is the same. So if we already injected
1654 // a load in the predecessor, then we should reuse the same load for all
1655 // duplicated entries.
1656 if (Value *V = InjectedLoads.lookup(Val: Pred)) {
1657 NewPN->addIncoming(V, BB: Pred);
1658 continue;
1659 }
1660
1661 Instruction *TI = Pred->getTerminator();
1662 IRB.SetInsertPoint(TI);
1663
1664 LoadInst *Load = IRB.CreateAlignedLoad(
1665 Ty: LoadTy, Ptr: InVal, Align: Alignment,
1666 Name: (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1667 ++NumLoadsSpeculated;
1668 if (AATags)
1669 Load->setAAMetadata(AATags);
1670 NewPN->addIncoming(V: Load, BB: Pred);
1671 InjectedLoads[Pred] = Load;
1672 }
1673
1674 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
1675 PN.eraseFromParent();
1676}
1677
1678SelectHandSpeculativity &
1679SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1680 if (isTrueVal)
1681 Bitfield::set<SelectHandSpeculativity::TrueVal>(Packed&: Storage, Value: true);
1682 else
1683 Bitfield::set<SelectHandSpeculativity::FalseVal>(Packed&: Storage, Value: true);
1684 return *this;
1685}
1686
1687bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1688 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Packed: Storage)
1689 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Packed: Storage);
1690}
1691
1692bool SelectHandSpeculativity::areAllSpeculatable() const {
1693 return isSpeculatable(/*isTrueVal=*/true) &&
1694 isSpeculatable(/*isTrueVal=*/false);
1695}
1696
1697bool SelectHandSpeculativity::areAnySpeculatable() const {
1698 return isSpeculatable(/*isTrueVal=*/true) ||
1699 isSpeculatable(/*isTrueVal=*/false);
1700}
1701bool SelectHandSpeculativity::areNoneSpeculatable() const {
1702 return !areAnySpeculatable();
1703}
1704
1705static SelectHandSpeculativity
1706isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1707 assert(LI.isSimple() && "Only for simple loads");
1708 SelectHandSpeculativity Spec;
1709
1710 const DataLayout &DL = SI.getDataLayout();
1711 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1712 if (isSafeToLoadUnconditionally(V: Value, Ty: LI.getType(), Alignment: LI.getAlign(), DL,
1713 ScanFrom: &LI))
1714 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1715 else if (PreserveCFG)
1716 return Spec;
1717
1718 return Spec;
1719}
1720
1721std::optional<RewriteableMemOps>
1722SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1723 RewriteableMemOps Ops;
1724
1725 for (User *U : SI.users()) {
1726 if (auto *BC = dyn_cast<BitCastInst>(Val: U); BC && BC->hasOneUse())
1727 U = *BC->user_begin();
1728
1729 if (auto *Store = dyn_cast<StoreInst>(Val: U)) {
1730 // Note that atomic stores can be transformed; atomic semantics do not
1731 // have any meaning for a local alloca. Stores are not speculatable,
1732 // however, so if we can't turn it into a predicated store, we are done.
1733 if (Store->isVolatile() || PreserveCFG)
1734 return {}; // Give up on this `select`.
1735 Ops.emplace_back(Args&: Store);
1736 continue;
1737 }
1738
1739 auto *LI = dyn_cast<LoadInst>(Val: U);
1740
1741 // Note that atomic loads can be transformed;
1742 // atomic semantics do not have any meaning for a local alloca.
1743 if (!LI || LI->isVolatile())
1744 return {}; // Give up on this `select`.
1745
1746 PossiblySpeculatableLoad Load(LI);
1747 if (!LI->isSimple()) {
1748 // If the `load` is not simple, we can't speculatively execute it,
1749 // but we could handle this via a CFG modification. But can we?
1750 if (PreserveCFG)
1751 return {}; // Give up on this `select`.
1752 Ops.emplace_back(Args&: Load);
1753 continue;
1754 }
1755
1756 SelectHandSpeculativity Spec =
1757 isSafeLoadOfSelectToSpeculate(LI&: *LI, SI, PreserveCFG);
1758 if (PreserveCFG && !Spec.areAllSpeculatable())
1759 return {}; // Give up on this `select`.
1760
1761 Load.setInt(Spec);
1762 Ops.emplace_back(Args&: Load);
1763 }
1764
1765 return Ops;
1766}
1767
1768static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1769 IRBuilderTy &IRB) {
1770 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n");
1771
1772 Value *TV = SI.getTrueValue();
1773 Value *FV = SI.getFalseValue();
1774 // Replace the given load of the select with a select of two loads.
1775
1776 assert(LI.isSimple() && "We only speculate simple loads");
1777
1778 IRB.SetInsertPoint(&LI);
1779
1780 LoadInst *TL =
1781 IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: TV, Align: LI.getAlign(),
1782 Name: LI.getName() + ".sroa.speculate.load.true");
1783 LoadInst *FL =
1784 IRB.CreateAlignedLoad(Ty: LI.getType(), Ptr: FV, Align: LI.getAlign(),
1785 Name: LI.getName() + ".sroa.speculate.load.false");
1786 NumLoadsSpeculated += 2;
1787
1788 // Transfer alignment and AA info if present.
1789 TL->setAlignment(LI.getAlign());
1790 FL->setAlignment(LI.getAlign());
1791
1792 AAMDNodes Tags = LI.getAAMetadata();
1793 if (Tags) {
1794 TL->setAAMetadata(Tags);
1795 FL->setAAMetadata(Tags);
1796 }
1797
1798 Value *V = IRB.CreateSelect(C: SI.getCondition(), True: TL, False: FL,
1799 Name: LI.getName() + ".sroa.speculated",
1800 MDFrom: ProfcheckDisableMetadataFixes ? nullptr : &SI);
1801
1802 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
1803 LI.replaceAllUsesWith(V);
1804}
1805
1806template <typename T>
1807static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1808 SelectHandSpeculativity Spec,
1809 DomTreeUpdater &DTU) {
1810 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1811 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n");
1812 BasicBlock *Head = I.getParent();
1813 Instruction *ThenTerm = nullptr;
1814 Instruction *ElseTerm = nullptr;
1815 if (Spec.areNoneSpeculatable())
1816 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1817 SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU);
1818 else {
1819 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1820 SI.getMetadata(KindID: LLVMContext::MD_prof), &DTU,
1821 /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1822 if (Spec.isSpeculatable(/*isTrueVal=*/true))
1823 cast<CondBrInst>(Val: Head->getTerminator())->swapSuccessors();
1824 }
1825 auto *HeadBI = cast<CondBrInst>(Val: Head->getTerminator());
1826 Spec = {}; // Do not use `Spec` beyond this point.
1827 BasicBlock *Tail = I.getParent();
1828 Tail->setName(Head->getName() + ".cont");
1829 PHINode *PN;
1830 if (isa<LoadInst>(I))
1831 PN = PHINode::Create(Ty: I.getType(), NumReservedValues: 2, NameStr: "", InsertBefore: I.getIterator());
1832 for (BasicBlock *SuccBB : successors(BB: Head)) {
1833 bool IsThen = SuccBB == HeadBI->getSuccessor(i: 0);
1834 int SuccIdx = IsThen ? 0 : 1;
1835 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1836 auto &CondMemOp = cast<T>(*I.clone());
1837 if (NewMemOpBB != Head) {
1838 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1839 if (isa<LoadInst>(I))
1840 ++NumLoadsPredicated;
1841 else
1842 ++NumStoresPredicated;
1843 } else {
1844 CondMemOp.dropUBImplyingAttrsAndMetadata();
1845 ++NumLoadsSpeculated;
1846 }
1847 CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1848 Value *Ptr = SI.getOperand(i_nocapture: 1 + SuccIdx);
1849 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1850 if (isa<LoadInst>(I)) {
1851 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1852 PN->addIncoming(V: &CondMemOp, BB: NewMemOpBB);
1853 } else
1854 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n");
1855 }
1856 if (isa<LoadInst>(I)) {
1857 PN->takeName(V: &I);
1858 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n");
1859 I.replaceAllUsesWith(PN);
1860 }
1861}
1862
1863static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1864 SelectHandSpeculativity Spec,
1865 DomTreeUpdater &DTU) {
1866 if (auto *LI = dyn_cast<LoadInst>(Val: &I))
1867 rewriteMemOpOfSelect(SI&: SelInst, I&: *LI, Spec, DTU);
1868 else if (auto *SI = dyn_cast<StoreInst>(Val: &I))
1869 rewriteMemOpOfSelect(SI&: SelInst, I&: *SI, Spec, DTU);
1870 else
1871 llvm_unreachable_internal(msg: "Only for load and store.");
1872}
1873
1874static bool rewriteSelectInstMemOps(SelectInst &SI,
1875 const RewriteableMemOps &Ops,
1876 IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1877 bool CFGChanged = false;
1878 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n");
1879
1880 for (const RewriteableMemOp &Op : Ops) {
1881 SelectHandSpeculativity Spec;
1882 Instruction *I;
1883 if (auto *const *US = std::get_if<UnspeculatableStore>(ptr: &Op)) {
1884 I = *US;
1885 } else {
1886 auto PSL = std::get<PossiblySpeculatableLoad>(v: Op);
1887 I = PSL.getPointer();
1888 Spec = PSL.getInt();
1889 }
1890 if (Spec.areAllSpeculatable()) {
1891 speculateSelectInstLoads(SI, LI&: cast<LoadInst>(Val&: *I), IRB);
1892 } else {
1893 assert(DTU && "Should not get here when not allowed to modify the CFG!");
1894 rewriteMemOpOfSelect(SelInst&: SI, I&: *I, Spec, DTU&: *DTU);
1895 CFGChanged = true;
1896 }
1897 I->eraseFromParent();
1898 }
1899
1900 for (User *U : make_early_inc_range(Range: SI.users()))
1901 cast<BitCastInst>(Val: U)->eraseFromParent();
1902 SI.eraseFromParent();
1903 return CFGChanged;
1904}
1905
1906/// Compute an adjusted pointer from Ptr by Offset bytes where the
1907/// resulting pointer has PointerTy.
1908static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1909 APInt Offset, Type *PointerTy,
1910 const Twine &NamePrefix) {
1911 if (Offset != 0)
1912 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, Offset: IRB.getInt(AI: Offset),
1913 Name: NamePrefix + "sroa_idx");
1914 return IRB.CreatePointerBitCastOrAddrSpaceCast(V: Ptr, DestTy: PointerTy,
1915 Name: NamePrefix + "sroa_cast");
1916}
1917
1918/// Compute the adjusted alignment for a load or store from an offset.
1919static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1920 return commonAlignment(A: getLoadStoreAlignment(I), Offset);
1921}
1922
1923/// Test whether we can convert a value from the old to the new type.
1924///
1925/// This predicate should be used to guard calls to convertValue in order to
1926/// ensure that we only try to convert viable values. The strategy is that we
1927/// will peel off single element struct and array wrappings to get to an
1928/// underlying value, and convert that value.
1929static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1930 unsigned VScale = 0) {
1931 if (OldTy == NewTy)
1932 return true;
1933
1934 // For integer types, we can't handle any bit-width differences. This would
1935 // break both vector conversions with extension and introduce endianness
1936 // issues when in conjunction with loads and stores.
1937 if (isa<IntegerType>(Val: OldTy) && isa<IntegerType>(Val: NewTy)) {
1938 assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1939 cast<IntegerType>(NewTy)->getBitWidth() &&
1940 "We can't have the same bitwidth for different int types");
1941 return false;
1942 }
1943
1944 TypeSize NewSize = DL.getTypeSizeInBits(Ty: NewTy);
1945 TypeSize OldSize = DL.getTypeSizeInBits(Ty: OldTy);
1946
1947 if ((isa<ScalableVectorType>(Val: NewTy) && isa<FixedVectorType>(Val: OldTy)) ||
1948 (isa<ScalableVectorType>(Val: OldTy) && isa<FixedVectorType>(Val: NewTy))) {
1949 // Conversion is only possible when the size of scalable vectors is known.
1950 if (!VScale)
1951 return false;
1952
1953 // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1954 // a single domain (either fixed or scalable). Any additional conversion
1955 // between fixed and scalable types is handled through integer types.
1956 auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1957 auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1958
1959 if (isa<ScalableVectorType>(Val: NewTy)) {
1960 if (!VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: NewVTy), EltTy: OldVTy))
1961 return false;
1962
1963 NewSize = TypeSize::getFixed(ExactSize: NewSize.getKnownMinValue() * VScale);
1964 } else {
1965 if (!VectorType::getWithSizeAndScalar(SizeTy: cast<VectorType>(Val: OldVTy), EltTy: NewVTy))
1966 return false;
1967
1968 OldSize = TypeSize::getFixed(ExactSize: OldSize.getKnownMinValue() * VScale);
1969 }
1970 }
1971
1972 if (NewSize != OldSize)
1973 return false;
1974 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1975 return false;
1976
1977 // We can convert pointers to integers and vice-versa. Same for vectors
1978 // of pointers and integers.
1979 OldTy = OldTy->getScalarType();
1980 NewTy = NewTy->getScalarType();
1981 if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1982 if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1983 unsigned OldAS = OldTy->getPointerAddressSpace();
1984 unsigned NewAS = NewTy->getPointerAddressSpace();
1985 // Convert pointers if they are pointers from the same address space or
1986 // different integral (not non-integral) address spaces with the same
1987 // pointer size.
1988 return OldAS == NewAS ||
1989 (!DL.isNonIntegralAddressSpace(AddrSpace: OldAS) &&
1990 !DL.isNonIntegralAddressSpace(AddrSpace: NewAS) &&
1991 DL.getPointerSize(AS: OldAS) == DL.getPointerSize(AS: NewAS));
1992 }
1993
1994 // We can convert integers to integral pointers, but not to non-integral
1995 // pointers.
1996 if (OldTy->isIntegerTy())
1997 return !DL.isNonIntegralPointerType(Ty: NewTy);
1998
1999 // We can convert integral pointers to integers, but non-integral pointers
2000 // need to remain pointers.
2001 if (!DL.isNonIntegralPointerType(Ty: OldTy))
2002 return NewTy->isIntegerTy();
2003
2004 return false;
2005 }
2006
2007 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
2008 return false;
2009
2010 return true;
2011}
2012
2013/// Test whether the given slice use can be promoted to a vector.
2014///
2015/// This function is called to test each entry in a partition which is slated
2016/// for a single slice.
2017static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2018 VectorType *Ty,
2019 uint64_t ElementSize,
2020 const DataLayout &DL,
2021 unsigned VScale) {
2022 // First validate the slice offsets.
2023 uint64_t BeginOffset =
2024 std::max(a: S.beginOffset(), b: P.beginOffset()) - P.beginOffset();
2025 uint64_t BeginIndex = BeginOffset / ElementSize;
2026 if (BeginIndex * ElementSize != BeginOffset ||
2027 BeginIndex >= cast<FixedVectorType>(Val: Ty)->getNumElements())
2028 return false;
2029 uint64_t EndOffset = std::min(a: S.endOffset(), b: P.endOffset()) - P.beginOffset();
2030 uint64_t EndIndex = EndOffset / ElementSize;
2031 if (EndIndex * ElementSize != EndOffset ||
2032 EndIndex > cast<FixedVectorType>(Val: Ty)->getNumElements())
2033 return false;
2034
2035 assert(EndIndex > BeginIndex && "Empty vector!");
2036 uint64_t NumElements = EndIndex - BeginIndex;
2037 Type *SliceTy = (NumElements == 1)
2038 ? Ty->getElementType()
2039 : FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumElements);
2040
2041 Type *SplitIntTy =
2042 Type::getIntNTy(C&: Ty->getContext(), N: NumElements * ElementSize * 8);
2043
2044 Use *U = S.getUse();
2045
2046 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
2047 if (MI->isVolatile())
2048 return false;
2049 if (!S.isSplittable())
2050 return false; // Skip any unsplittable intrinsics.
2051 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
2052 if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2053 return false;
2054 } else if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
2055 if (LI->isVolatile())
2056 return false;
2057 Type *LTy = LI->getType();
2058 // Disable vector promotion when there are loads or stores of an FCA.
2059 if (LTy->isStructTy())
2060 return false;
2061 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2062 assert(LTy->isIntegerTy());
2063 LTy = SplitIntTy;
2064 }
2065 if (!canConvertValue(DL, OldTy: SliceTy, NewTy: LTy, VScale))
2066 return false;
2067 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
2068 if (SI->isVolatile())
2069 return false;
2070 Type *STy = SI->getValueOperand()->getType();
2071 // Disable vector promotion when there are loads or stores of an FCA.
2072 if (STy->isStructTy())
2073 return false;
2074 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2075 assert(STy->isIntegerTy());
2076 STy = SplitIntTy;
2077 }
2078 if (!canConvertValue(DL, OldTy: STy, NewTy: SliceTy, VScale))
2079 return false;
2080 } else {
2081 return false;
2082 }
2083
2084 return true;
2085}
2086
2087/// Test whether any vector type in \p CandidateTys is viable for promotion.
2088///
2089/// This implements the necessary checking for \c isVectorPromotionViable over
2090/// all slices of the alloca for the given VectorType.
2091static VectorType *
2092checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2093 SmallVectorImpl<VectorType *> &CandidateTys,
2094 bool HaveCommonEltTy, Type *CommonEltTy,
2095 bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2096 VectorType *CommonVecPtrTy, unsigned VScale) {
2097 // If we didn't find a vector type, nothing to do here.
2098 if (CandidateTys.empty())
2099 return nullptr;
2100
2101 // Pointer-ness is sticky, if we had a vector-of-pointers candidate type,
2102 // then we should choose it, not some other alternative.
2103 // But, we can't perform a no-op pointer address space change via bitcast,
2104 // so if we didn't have a common pointer element type, bail.
2105 if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2106 return nullptr;
2107
2108 // Try to pick the "best" element type out of the choices.
2109 if (!HaveCommonEltTy && HaveVecPtrTy) {
2110 // If there was a pointer element type, there's really only one choice.
2111 CandidateTys.clear();
2112 CandidateTys.push_back(Elt: CommonVecPtrTy);
2113 } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2114 // Integer-ify vector types.
2115 for (VectorType *&VTy : CandidateTys) {
2116 if (!VTy->getElementType()->isIntegerTy())
2117 VTy = cast<VectorType>(Val: VTy->getWithNewType(EltTy: IntegerType::getIntNTy(
2118 C&: VTy->getContext(), N: VTy->getScalarSizeInBits())));
2119 }
2120
2121 // Rank the remaining candidate vector types. This is easy because we know
2122 // they're all integer vectors. We sort by ascending number of elements.
2123 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2124 (void)DL;
2125 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2126 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2127 "Cannot have vector types of different sizes!");
2128 assert(RHSTy->getElementType()->isIntegerTy() &&
2129 "All non-integer types eliminated!");
2130 assert(LHSTy->getElementType()->isIntegerTy() &&
2131 "All non-integer types eliminated!");
2132 return cast<FixedVectorType>(Val: RHSTy)->getNumElements() <
2133 cast<FixedVectorType>(Val: LHSTy)->getNumElements();
2134 };
2135 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2136 (void)DL;
2137 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2138 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2139 "Cannot have vector types of different sizes!");
2140 assert(RHSTy->getElementType()->isIntegerTy() &&
2141 "All non-integer types eliminated!");
2142 assert(LHSTy->getElementType()->isIntegerTy() &&
2143 "All non-integer types eliminated!");
2144 return cast<FixedVectorType>(Val: RHSTy)->getNumElements() ==
2145 cast<FixedVectorType>(Val: LHSTy)->getNumElements();
2146 };
2147 llvm::sort(C&: CandidateTys, Comp: RankVectorTypesComp);
2148 CandidateTys.erase(CS: llvm::unique(R&: CandidateTys, P: RankVectorTypesEq),
2149 CE: CandidateTys.end());
2150 } else {
2151// The only way to have the same element type in every vector type is to
2152// have the same vector type. Check that and remove all but one.
2153#ifndef NDEBUG
2154 for (VectorType *VTy : CandidateTys) {
2155 assert(VTy->getElementType() == CommonEltTy &&
2156 "Unaccounted for element type!");
2157 assert(VTy == CandidateTys[0] &&
2158 "Different vector types with the same element type!");
2159 }
2160#endif
2161 CandidateTys.resize(N: 1);
2162 }
2163
2164 // FIXME: hack. Do we have a named constant for this?
2165 // SDAG SDNode can't have more than 65535 operands.
2166 llvm::erase_if(C&: CandidateTys, P: [](VectorType *VTy) {
2167 return cast<FixedVectorType>(Val: VTy)->getNumElements() >
2168 std::numeric_limits<unsigned short>::max();
2169 });
2170
2171 // Find a vector type viable for promotion by iterating over all slices.
2172 auto *VTy = llvm::find_if(Range&: CandidateTys, P: [&](VectorType *VTy) -> bool {
2173 uint64_t ElementSize =
2174 DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
2175
2176 // While the definition of LLVM vectors is bitpacked, we don't support sizes
2177 // that aren't byte sized.
2178 if (ElementSize % 8)
2179 return false;
2180 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2181 "vector size not a multiple of element size?");
2182 ElementSize /= 8;
2183
2184 for (const Slice &S : P)
2185 if (!isVectorPromotionViableForSlice(P, S, Ty: VTy, ElementSize, DL, VScale))
2186 return false;
2187
2188 for (const Slice *S : P.splitSliceTails())
2189 if (!isVectorPromotionViableForSlice(P, S: *S, Ty: VTy, ElementSize, DL, VScale))
2190 return false;
2191
2192 return true;
2193 });
2194 return VTy != CandidateTys.end() ? *VTy : nullptr;
2195}
2196
2197static VectorType *createAndCheckVectorTypesForPromotion(
2198 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2199 function_ref<void(Type *)> CheckCandidateType, Partition &P,
2200 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2201 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2202 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2203 [[maybe_unused]] VectorType *OriginalElt =
2204 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2205 // Consider additional vector types where the element type size is a
2206 // multiple of load/store element size.
2207 for (Type *Ty : OtherTys) {
2208 if (!VectorType::isValidElementType(ElemTy: Ty))
2209 continue;
2210 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2211 // Make a copy of CandidateTys and iterate through it, because we
2212 // might append to CandidateTys in the loop.
2213 for (VectorType *const VTy : CandidateTysCopy) {
2214 // The elements in the copy should remain invariant throughout the loop
2215 assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2216 unsigned VectorSize = DL.getTypeSizeInBits(Ty: VTy).getFixedValue();
2217 unsigned ElementSize =
2218 DL.getTypeSizeInBits(Ty: VTy->getElementType()).getFixedValue();
2219 if (TypeSize != VectorSize && TypeSize != ElementSize &&
2220 VectorSize % TypeSize == 0) {
2221 VectorType *NewVTy = VectorType::get(ElementType: Ty, NumElements: VectorSize / TypeSize, Scalable: false);
2222 CheckCandidateType(NewVTy);
2223 }
2224 }
2225 }
2226
2227 return checkVectorTypesForPromotion(
2228 P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2229 HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2230}
2231
2232/// Test whether the given alloca partitioning and range of slices can be
2233/// promoted to a vector.
2234///
2235/// This is a quick test to check whether we can rewrite a particular alloca
2236/// partition (and its newly formed alloca) into a vector alloca with only
2237/// whole-vector loads and stores such that it could be promoted to a vector
2238/// SSA value. We only can ensure this for a limited set of operations, and we
2239/// don't want to do the rewrites unless we are confident that the result will
2240/// be promotable, so we have an early test here.
2241static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2242 unsigned VScale) {
2243 // Collect the candidate types for vector-based promotion. Also track whether
2244 // we have different element types.
2245 SmallVector<VectorType *, 4> CandidateTys;
2246 SetVector<Type *> LoadStoreTys;
2247 SetVector<Type *> DeferredTys;
2248 Type *CommonEltTy = nullptr;
2249 VectorType *CommonVecPtrTy = nullptr;
2250 bool HaveVecPtrTy = false;
2251 bool HaveCommonEltTy = true;
2252 bool HaveCommonVecPtrTy = true;
2253 auto CheckCandidateType = [&](Type *Ty) {
2254 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
2255 // Return if bitcast to vectors is different for total size in bits.
2256 if (!CandidateTys.empty()) {
2257 VectorType *V = CandidateTys[0];
2258 if (DL.getTypeSizeInBits(Ty: VTy).getFixedValue() !=
2259 DL.getTypeSizeInBits(Ty: V).getFixedValue()) {
2260 CandidateTys.clear();
2261 return;
2262 }
2263 }
2264 CandidateTys.push_back(Elt: VTy);
2265 Type *EltTy = VTy->getElementType();
2266
2267 if (!CommonEltTy)
2268 CommonEltTy = EltTy;
2269 else if (CommonEltTy != EltTy)
2270 HaveCommonEltTy = false;
2271
2272 if (EltTy->isPointerTy()) {
2273 HaveVecPtrTy = true;
2274 if (!CommonVecPtrTy)
2275 CommonVecPtrTy = VTy;
2276 else if (CommonVecPtrTy != VTy)
2277 HaveCommonVecPtrTy = false;
2278 }
2279 }
2280 };
2281
2282 // Put load and store types into a set for de-duplication.
2283 for (const Slice &S : P) {
2284 Type *Ty;
2285 if (auto *LI = dyn_cast<LoadInst>(Val: S.getUse()->getUser()))
2286 Ty = LI->getType();
2287 else if (auto *SI = dyn_cast<StoreInst>(Val: S.getUse()->getUser()))
2288 Ty = SI->getValueOperand()->getType();
2289 else
2290 continue;
2291
2292 auto CandTy = Ty->getScalarType();
2293 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2294 S.endOffset() != P.endOffset())) {
2295 DeferredTys.insert(X: Ty);
2296 continue;
2297 }
2298
2299 LoadStoreTys.insert(X: Ty);
2300 // Consider any loads or stores that are the exact size of the slice.
2301 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2302 CheckCandidateType(Ty);
2303 }
2304
2305 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2306 if (auto *VTy = createAndCheckVectorTypesForPromotion(
2307 OtherTys&: LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2308 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2309 HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2310 return VTy;
2311
2312 CandidateTys.clear();
2313 return createAndCheckVectorTypesForPromotion(
2314 OtherTys&: DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2315 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2316 CommonVecPtrTy, VScale);
2317}
2318
2319/// Test whether a slice of an alloca is valid for integer widening.
2320///
2321/// This implements the necessary checking for the \c isIntegerWideningViable
2322/// test below on a single slice of the alloca.
2323static bool isIntegerWideningViableForSlice(const Slice &S,
2324 uint64_t AllocBeginOffset,
2325 Type *AllocaTy,
2326 const DataLayout &DL,
2327 bool &WholeAllocaOp) {
2328 uint64_t Size = DL.getTypeStoreSize(Ty: AllocaTy).getFixedValue();
2329
2330 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2331 uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2332
2333 Use *U = S.getUse();
2334
2335 // Lifetime intrinsics operate over the whole alloca whose sizes are usually
2336 // larger than other load/store slices (RelEnd > Size). But lifetime are
2337 // always promotable and should not impact other slices' promotability of the
2338 // partition.
2339 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: U->getUser())) {
2340 if (II->isLifetimeStartOrEnd() || II->isDroppable())
2341 return true;
2342 }
2343
2344 // We can't reasonably handle cases where the load or store extends past
2345 // the end of the alloca's type and into its padding.
2346 if (RelEnd > Size)
2347 return false;
2348
2349 if (LoadInst *LI = dyn_cast<LoadInst>(Val: U->getUser())) {
2350 if (LI->isVolatile())
2351 return false;
2352 // We can't handle loads that extend past the allocated memory.
2353 TypeSize LoadSize = DL.getTypeStoreSize(Ty: LI->getType());
2354 if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2355 return false;
2356 // So far, AllocaSliceRewriter does not support widening split slice tails
2357 // in rewriteIntegerLoad.
2358 if (S.beginOffset() < AllocBeginOffset)
2359 return false;
2360 // Note that we don't count vector loads or stores as whole-alloca
2361 // operations which enable integer widening because we would prefer to use
2362 // vector widening instead.
2363 if (!isa<VectorType>(Val: LI->getType()) && RelBegin == 0 && RelEnd == Size)
2364 WholeAllocaOp = true;
2365 if (IntegerType *ITy = dyn_cast<IntegerType>(Val: LI->getType())) {
2366 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
2367 return false;
2368 } else if (RelBegin != 0 || RelEnd != Size ||
2369 !canConvertValue(DL, OldTy: AllocaTy, NewTy: LI->getType())) {
2370 // Non-integer loads need to be convertible from the alloca type so that
2371 // they are promotable.
2372 return false;
2373 }
2374 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: U->getUser())) {
2375 Type *ValueTy = SI->getValueOperand()->getType();
2376 if (SI->isVolatile())
2377 return false;
2378 // We can't handle stores that extend past the allocated memory.
2379 TypeSize StoreSize = DL.getTypeStoreSize(Ty: ValueTy);
2380 if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2381 return false;
2382 // So far, AllocaSliceRewriter does not support widening split slice tails
2383 // in rewriteIntegerStore.
2384 if (S.beginOffset() < AllocBeginOffset)
2385 return false;
2386 // Note that we don't count vector loads or stores as whole-alloca
2387 // operations which enable integer widening because we would prefer to use
2388 // vector widening instead.
2389 if (!isa<VectorType>(Val: ValueTy) && RelBegin == 0 && RelEnd == Size)
2390 WholeAllocaOp = true;
2391 if (IntegerType *ITy = dyn_cast<IntegerType>(Val: ValueTy)) {
2392 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(Ty: ITy).getFixedValue())
2393 return false;
2394 } else if (RelBegin != 0 || RelEnd != Size ||
2395 !canConvertValue(DL, OldTy: ValueTy, NewTy: AllocaTy)) {
2396 // Non-integer stores need to be convertible to the alloca type so that
2397 // they are promotable.
2398 return false;
2399 }
2400 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: U->getUser())) {
2401 if (MI->isVolatile() || !isa<Constant>(Val: MI->getLength()))
2402 return false;
2403 if (!S.isSplittable())
2404 return false; // Skip any unsplittable intrinsics.
2405 } else {
2406 return false;
2407 }
2408
2409 return true;
2410}
2411
2412/// Test whether the given alloca partition's integer operations can be
2413/// widened to promotable ones.
2414///
2415/// This is a quick test to check whether we can rewrite the integer loads and
2416/// stores to a particular alloca into wider loads and stores and be able to
2417/// promote the resulting alloca.
2418static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2419 const DataLayout &DL) {
2420 uint64_t SizeInBits = DL.getTypeSizeInBits(Ty: AllocaTy).getFixedValue();
2421 // Don't create integer types larger than the maximum bitwidth.
2422 if (SizeInBits > IntegerType::MAX_INT_BITS)
2423 return false;
2424
2425 // Don't try to handle allocas with bit-padding.
2426 if (SizeInBits != DL.getTypeStoreSizeInBits(Ty: AllocaTy).getFixedValue())
2427 return false;
2428
2429 // We need to ensure that an integer type with the appropriate bitwidth can
2430 // be converted to the alloca type, whatever that is. We don't want to force
2431 // the alloca itself to have an integer type if there is a more suitable one.
2432 Type *IntTy = Type::getIntNTy(C&: AllocaTy->getContext(), N: SizeInBits);
2433 if (!canConvertValue(DL, OldTy: AllocaTy, NewTy: IntTy) ||
2434 !canConvertValue(DL, OldTy: IntTy, NewTy: AllocaTy))
2435 return false;
2436
2437 // While examining uses, we ensure that the alloca has a covering load or
2438 // store. We don't want to widen the integer operations only to fail to
2439 // promote due to some other unsplittable entry (which we may make splittable
2440 // later). However, if there are only splittable uses, go ahead and assume
2441 // that we cover the alloca.
2442 // FIXME: We shouldn't consider split slices that happen to start in the
2443 // partition here...
2444 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(Width: SizeInBits);
2445
2446 for (const Slice &S : P)
2447 if (!isIntegerWideningViableForSlice(S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2448 WholeAllocaOp))
2449 return false;
2450
2451 for (const Slice *S : P.splitSliceTails())
2452 if (!isIntegerWideningViableForSlice(S: *S, AllocBeginOffset: P.beginOffset(), AllocaTy, DL,
2453 WholeAllocaOp))
2454 return false;
2455
2456 return WholeAllocaOp;
2457}
2458
2459static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2460 IntegerType *Ty, uint64_t Offset,
2461 const Twine &Name) {
2462 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2463 IntegerType *IntTy = cast<IntegerType>(Val: V->getType());
2464 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2465 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2466 "Element extends past full value");
2467 uint64_t ShAmt = 8 * Offset;
2468 if (DL.isBigEndian())
2469 ShAmt = 8 * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2470 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2471 if (ShAmt) {
2472 V = IRB.CreateLShr(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2473 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2474 }
2475 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2476 "Cannot extract to a larger integer!");
2477 if (Ty != IntTy) {
2478 V = IRB.CreateTrunc(V, DestTy: Ty, Name: Name + ".trunc");
2479 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
2480 }
2481 return V;
2482}
2483
2484static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2485 Value *V, uint64_t Offset, const Twine &Name) {
2486 IntegerType *IntTy = cast<IntegerType>(Val: Old->getType());
2487 IntegerType *Ty = cast<IntegerType>(Val: V->getType());
2488 assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2489 "Cannot insert a larger integer!");
2490 LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
2491 if (Ty != IntTy) {
2492 V = IRB.CreateZExt(V, DestTy: IntTy, Name: Name + ".ext");
2493 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
2494 }
2495 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2496 DL.getTypeStoreSize(IntTy).getFixedValue() &&
2497 "Element store outside of alloca store");
2498 uint64_t ShAmt = 8 * Offset;
2499 if (DL.isBigEndian())
2500 ShAmt = 8 * (DL.getTypeStoreSize(Ty: IntTy).getFixedValue() -
2501 DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2502 if (ShAmt) {
2503 V = IRB.CreateShl(LHS: V, RHS: ShAmt, Name: Name + ".shift");
2504 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
2505 }
2506
2507 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2508 APInt Mask = ~Ty->getMask().zext(width: IntTy->getBitWidth()).shl(shiftAmt: ShAmt);
2509 Old = IRB.CreateAnd(LHS: Old, RHS: Mask, Name: Name + ".mask");
2510 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
2511 V = IRB.CreateOr(LHS: Old, RHS: V, Name: Name + ".insert");
2512 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
2513 }
2514 return V;
2515}
2516
2517static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2518 unsigned EndIndex, const Twine &Name) {
2519 auto *VecTy = cast<FixedVectorType>(Val: V->getType());
2520 unsigned NumElements = EndIndex - BeginIndex;
2521 assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2522
2523 if (NumElements == VecTy->getNumElements())
2524 return V;
2525
2526 if (NumElements == 1) {
2527 V = IRB.CreateExtractElement(Vec: V, Idx: IRB.getInt32(C: BeginIndex),
2528 Name: Name + ".extract");
2529 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
2530 return V;
2531 }
2532
2533 auto Mask = llvm::to_vector<8>(Range: llvm::seq<int>(Begin: BeginIndex, End: EndIndex));
2534 V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".extract");
2535 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2536 return V;
2537}
2538
2539static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2540 unsigned BeginIndex, const Twine &Name) {
2541 VectorType *VecTy = cast<VectorType>(Val: Old->getType());
2542 assert(VecTy && "Can only insert a vector into a vector");
2543
2544 VectorType *Ty = dyn_cast<VectorType>(Val: V->getType());
2545 if (!Ty) {
2546 // Single element to insert.
2547 V = IRB.CreateInsertElement(Vec: Old, NewElt: V, Idx: IRB.getInt32(C: BeginIndex),
2548 Name: Name + ".insert");
2549 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2550 return V;
2551 }
2552
2553 unsigned NumSubElements = cast<FixedVectorType>(Val: Ty)->getNumElements();
2554 unsigned NumElements = cast<FixedVectorType>(Val: VecTy)->getNumElements();
2555
2556 assert(NumSubElements <= NumElements && "Too many elements!");
2557 if (NumSubElements == NumElements) {
2558 assert(V->getType() == VecTy && "Vector type mismatch");
2559 return V;
2560 }
2561 unsigned EndIndex = BeginIndex + NumSubElements;
2562
2563 // When inserting a smaller vector into the larger to store, we first
2564 // use a shuffle vector to widen it with undef elements, and then
2565 // a second shuffle vector to select between the loaded vector and the
2566 // incoming vector.
2567 SmallVector<int, 8> Mask;
2568 Mask.reserve(N: NumElements);
2569 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2570 if (Idx >= BeginIndex && Idx < EndIndex)
2571 Mask.push_back(Elt: Idx - BeginIndex);
2572 else
2573 Mask.push_back(Elt: -1);
2574 V = IRB.CreateShuffleVector(V, Mask, Name: Name + ".expand");
2575 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2576
2577 Mask.clear();
2578 for (unsigned Idx = 0; Idx != NumElements; ++Idx)
2579 if (Idx >= BeginIndex && Idx < EndIndex)
2580 Mask.push_back(Elt: Idx);
2581 else
2582 Mask.push_back(Elt: Idx + NumElements);
2583 V = IRB.CreateShuffleVector(V1: V, V2: Old, Mask, Name: Name + "blend");
2584 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2585 return V;
2586}
2587
2588/// This function takes two vector values and combines them into a single vector
2589/// by concatenating their elements. The function handles:
2590///
2591/// 1. Element type mismatch: If either vector's element type differs from
2592/// NewAIEltType, the function bitcasts the vector to use NewAIEltType while
2593/// preserving the total bit width (adjusting the number of elements
2594/// accordingly).
2595///
2596/// 2. Size mismatch: After transforming the vectors to have the desired element
2597/// type, if the two vectors have different numbers of elements, the smaller
2598/// vector is extended with poison values to match the size of the larger
2599/// vector before concatenation.
2600///
2601/// 3. Concatenation: The vectors are merged using a shuffle operation that
2602/// places all elements of V0 first, followed by all elements of V1.
2603///
2604/// \param V0 The first vector to merge (must be a vector type)
2605/// \param V1 The second vector to merge (must be a vector type)
2606/// \param DL The data layout for size calculations
2607/// \param NewAIEltTy The desired element type for the result vector
2608/// \param Builder IRBuilder for creating new instructions
2609/// \return A new vector containing all elements from V0 followed by all
2610/// elements from V1
2611static Value *mergeTwoVectors(Value *V0, Value *V1, const DataLayout &DL,
2612 Type *NewAIEltTy, IRBuilder<> &Builder) {
2613 // V0 and V1 are vectors
2614 // Create a new vector type with combined elements
2615 // Use ShuffleVector to concatenate the vectors
2616 auto *VecType0 = cast<FixedVectorType>(Val: V0->getType());
2617 auto *VecType1 = cast<FixedVectorType>(Val: V1->getType());
2618
2619 // If V0/V1 element types are different from NewAllocaElementType,
2620 // we need to introduce bitcasts before merging them
2621 auto BitcastIfNeeded = [&](Value *&V, FixedVectorType *&VecType,
2622 const char *DebugName) {
2623 Type *EltType = VecType->getElementType();
2624 if (EltType != NewAIEltTy) {
2625 // Calculate new number of elements to maintain same bit width
2626 unsigned TotalBits =
2627 VecType->getNumElements() * DL.getTypeSizeInBits(Ty: EltType);
2628 unsigned NewNumElts = TotalBits / DL.getTypeSizeInBits(Ty: NewAIEltTy);
2629
2630 auto *NewVecType = FixedVectorType::get(ElementType: NewAIEltTy, NumElts: NewNumElts);
2631 V = Builder.CreateBitCast(V, DestTy: NewVecType);
2632 VecType = NewVecType;
2633 LLVM_DEBUG(dbgs() << " bitcast " << DebugName << ": " << *V << "\n");
2634 }
2635 };
2636
2637 BitcastIfNeeded(V0, VecType0, "V0");
2638 BitcastIfNeeded(V1, VecType1, "V1");
2639
2640 unsigned NumElts0 = VecType0->getNumElements();
2641 unsigned NumElts1 = VecType1->getNumElements();
2642
2643 SmallVector<int, 16> ShuffleMask;
2644
2645 if (NumElts0 == NumElts1) {
2646 for (unsigned i = 0; i < NumElts0 + NumElts1; ++i)
2647 ShuffleMask.push_back(Elt: i);
2648 } else {
2649 // If two vectors have different sizes, we need to extend
2650 // the smaller vector to the size of the larger vector.
2651 unsigned SmallSize = std::min(a: NumElts0, b: NumElts1);
2652 unsigned LargeSize = std::max(a: NumElts0, b: NumElts1);
2653 bool IsV0Smaller = NumElts0 < NumElts1;
2654 Value *&ExtendedVec = IsV0Smaller ? V0 : V1;
2655 SmallVector<int, 16> ExtendMask;
2656 for (unsigned i = 0; i < SmallSize; ++i)
2657 ExtendMask.push_back(Elt: i);
2658 for (unsigned i = SmallSize; i < LargeSize; ++i)
2659 ExtendMask.push_back(Elt: PoisonMaskElem);
2660 ExtendedVec = Builder.CreateShuffleVector(
2661 V1: ExtendedVec, V2: PoisonValue::get(T: ExtendedVec->getType()), Mask: ExtendMask);
2662 LLVM_DEBUG(dbgs() << " shufflevector: " << *ExtendedVec << "\n");
2663 for (unsigned i = 0; i < NumElts0; ++i)
2664 ShuffleMask.push_back(Elt: i);
2665 for (unsigned i = 0; i < NumElts1; ++i)
2666 ShuffleMask.push_back(Elt: LargeSize + i);
2667 }
2668
2669 return Builder.CreateShuffleVector(V1: V0, V2: V1, Mask: ShuffleMask);
2670}
2671
2672namespace {
2673
/// Visitor to rewrite instructions using a particular slice of an alloca
2675/// to use a new alloca.
2676///
2677/// Also implements the rewriting to vector-based accesses when the partition
2678/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2679/// lives here.
2680class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2681 // Befriend the base class so it can delegate to private visit methods.
2682 friend class InstVisitor<AllocaSliceRewriter, bool>;
2683
2684 using Base = InstVisitor<AllocaSliceRewriter, bool>;
2685
2686 const DataLayout &DL;
2687 AllocaSlices &AS;
2688 SROA &Pass;
2689 AllocaInst &OldAI, &NewAI;
2690 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2691 Type *NewAllocaTy;
2692
2693 // This is a convenience and flag variable that will be null unless the new
2694 // alloca's integer operations should be widened to this integer type due to
2695 // passing isIntegerWideningViable above. If it is non-null, the desired
2696 // integer type will be stored here for easy access during rewriting.
2697 IntegerType *IntTy;
2698
2699 // If we are rewriting an alloca partition which can be written as pure
2700 // vector operations, we stash extra information here. When VecTy is
2701 // non-null, we have some strict guarantees about the rewritten alloca:
2702 // - The new alloca is exactly the size of the vector type here.
2703 // - The accesses all either map to the entire vector or to a single
2704 // element.
2705 // - The set of accessing instructions is only one of those handled above
2706 // in isVectorPromotionViable. Generally these are the same access kinds
2707 // which are promotable via mem2reg.
2708 VectorType *VecTy;
2709 Type *ElementTy;
2710 uint64_t ElementSize;
2711
2712 // The original offset of the slice currently being rewritten relative to
2713 // the original alloca.
2714 uint64_t BeginOffset = 0;
2715 uint64_t EndOffset = 0;
2716
2717 // The new offsets of the slice currently being rewritten relative to the
2718 // original alloca.
2719 uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2720
2721 uint64_t SliceSize = 0;
2722 bool IsSplittable = false;
2723 bool IsSplit = false;
2724 Use *OldUse = nullptr;
2725 Instruction *OldPtr = nullptr;
2726
2727 // Track post-rewrite users which are PHI nodes and Selects.
2728 SmallSetVector<PHINode *, 8> &PHIUsers;
2729 SmallSetVector<SelectInst *, 8> &SelectUsers;
2730
2731 // Utility IR builder, whose name prefix is setup for each visited use, and
2732 // the insertion point is set to point to the user.
2733 IRBuilderTy IRB;
2734
2735 // Return the new alloca, addrspacecasted if required to avoid changing the
2736 // addrspace of a volatile access.
2737 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2738 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2739 return &NewAI;
2740
2741 Type *AccessTy = IRB.getPtrTy(AddrSpace);
2742 return IRB.CreateAddrSpaceCast(V: &NewAI, DestTy: AccessTy);
2743 }
2744
2745public:
2746 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2747 AllocaInst &OldAI, AllocaInst &NewAI, Type *NewAllocaTy,
2748 uint64_t NewAllocaBeginOffset,
2749 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2750 VectorType *PromotableVecTy,
2751 SmallSetVector<PHINode *, 8> &PHIUsers,
2752 SmallSetVector<SelectInst *, 8> &SelectUsers)
2753 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2754 NewAllocaBeginOffset(NewAllocaBeginOffset),
2755 NewAllocaEndOffset(NewAllocaEndOffset), NewAllocaTy(NewAllocaTy),
2756 IntTy(IsIntegerPromotable
2757 ? Type::getIntNTy(
2758 C&: NewAI.getContext(),
2759 N: DL.getTypeSizeInBits(Ty: NewAllocaTy).getFixedValue())
2760 : nullptr),
2761 VecTy(PromotableVecTy),
2762 ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2763 ElementSize(VecTy ? DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / 8
2764 : 0),
2765 PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2766 IRB(NewAI.getContext(), ConstantFolder()) {
2767 if (VecTy) {
2768 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2769 "Only multiple-of-8 sized vector elements are viable");
2770 ++NumVectorized;
2771 }
2772 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2773 }
2774
2775 bool visit(AllocaSlices::const_iterator I) {
2776 bool CanSROA = true;
2777 BeginOffset = I->beginOffset();
2778 EndOffset = I->endOffset();
2779 IsSplittable = I->isSplittable();
2780 IsSplit =
2781 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2782 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
2783 LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2784 LLVM_DEBUG(dbgs() << "\n");
2785
2786 // Compute the intersecting offset range.
2787 assert(BeginOffset < NewAllocaEndOffset);
2788 assert(EndOffset > NewAllocaBeginOffset);
2789 NewBeginOffset = std::max(a: BeginOffset, b: NewAllocaBeginOffset);
2790 NewEndOffset = std::min(a: EndOffset, b: NewAllocaEndOffset);
2791
2792 SliceSize = NewEndOffset - NewBeginOffset;
2793 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2794 << ") NewBegin:(" << NewBeginOffset << ", "
2795 << NewEndOffset << ") NewAllocaBegin:("
2796 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2797 << ")\n");
2798 assert(IsSplit || NewBeginOffset == BeginOffset);
2799 OldUse = I->getUse();
2800 OldPtr = cast<Instruction>(Val: OldUse->get());
2801
2802 Instruction *OldUserI = cast<Instruction>(Val: OldUse->getUser());
2803 IRB.SetInsertPoint(OldUserI);
2804 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2805 // Avoid materializing the name prefix when it is discarded anyway.
2806 if (!IRB.getContext().shouldDiscardValueNames())
2807 IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2808 Twine(BeginOffset) + ".");
2809
2810 CanSROA &= visit(I: cast<Instruction>(Val: OldUse->getUser()));
2811 if (VecTy || IntTy)
2812 assert(CanSROA);
2813 return CanSROA;
2814 }
2815
2816 /// Attempts to rewrite a partition using tree-structured merge optimization.
2817 ///
2818 /// This function handles two patterns. Both produce an O(log n) tree of
2819 /// shufflevectors in place of the linear expand+blend chain that SROA would
2820 /// otherwise emit for each partial store.
2821 ///
2822 /// Pattern 1 (stores-only):
2823 /// Multiple non-overlapping partial stores completely fill the alloca
2824 /// and there is exactly one full-width load coming after the stores.
2825 /// The stores are tree-merged into a single vector and stored once.
2826 ///
2827 /// Example transformation:
2828 /// Before: (stores do not have to be in order)
2829 /// %alloca = alloca <8 x float>
2830 /// store <2 x float> %val0, ptr %alloca ; offset 0-1
2831 /// store <2 x float> %val2, ptr %alloca+16 ; offset 4-5
2832 /// store <2 x float> %val1, ptr %alloca+8 ; offset 2-3
2833 /// store <2 x float> %val3, ptr %alloca+24 ; offset 6-7
2834 /// %r = load <8 x float>, ptr %alloca
2835 ///
2836 /// After: tree of shufflevectors producing <8 x float> directly.
2837 ///
2838 /// Pattern 2 (init + RMW, possibly multi-round):
2839 /// A single full-width init store, followed by partial loads and
2840 /// partial stores that read-modify-write the alloca one or more
2841 /// times, optionally followed by a full-width load. The only
2842 /// structural requirement is that the distinct [begin, end) ranges
2843 /// touched by the partial loads and stores, taken together, tile
2844 /// the alloca disjointly.
2845 ///
2846 /// We keep a map from each slice range to the SSA value that
2847 /// currently lives there, `SliceValues[r] -> Value*`:
2848 /// - initialize each entry to the corresponding piece of the
2849 /// init store's value (via a shufflevector picking the
2850 /// range's elements out of the init value),
2851 /// - walk partial loads and stores in block order,
2852 /// - for a partial load at range r: RAUW with `SliceValues[r]`,
2853 /// - for a partial store at range r: update `SliceValues[r]` to
2854 /// the stored value and drop the store.
2855 /// At the end, the final `SliceValues[r]` entries are tree-merged
2856 /// (in range order) into a single store to the alloca, and the
2857 /// optional full-width load is replaced by a load of the alloca.
2858 ///
2859 /// Because the ranges are disjoint by construction, a store at one
2860 /// range cannot affect another range's tracked value, so a single
2861 /// block-order walk correctly tracks the memory state at each
2862 /// range. The algorithm handles multi-round RMW, partial loads
2863 /// and stores interleaved in any order, read-only slices (the
2864 /// tracked value stays at the init extract), and write-only
2865 /// slices (the tracked value never flows into a load).
2866 ///
2867 /// \param P The partition to analyze and potentially rewrite
2868 /// \return An optional vector of values that were deleted during the
2869 /// rewrite, or std::nullopt if the partition cannot be optimized.
2870 std::optional<SmallVector<Value *, 4>>
2871 rewriteTreeStructuredMerge(Partition &P) {
2872 // No tail slices that overlap with the partition
2873 if (P.splitSliceTails().size() > 0)
2874 return std::nullopt;
2875
2876 // Structure to hold store information
2877 struct StoreInfo {
2878 StoreInst *Store;
2879 uint64_t BeginOffset;
2880 uint64_t EndOffset;
2881 Value *StoredValue;
2882 StoreInfo(StoreInst *SI, uint64_t Begin, uint64_t End, Value *Val)
2883 : Store(SI), BeginOffset(Begin), EndOffset(End), StoredValue(Val) {}
2884 };
2885 struct LoadInfo {
2886 LoadInst *Load;
2887 uint64_t BeginOffset;
2888 uint64_t EndOffset;
2889 };
2890
2891 SmallVector<StoreInfo, 4> StoreInfos; // partial stores only
2892 SmallVector<LoadInfo, 4> LoadInfos; // partial loads only
2893 LoadInst *FullLoad = nullptr; // optional full-width load
2894 StoreInst *InitStore = nullptr; // optional full-width init store
2895
2896 // If the new alloca is a fixed vector type, we use its element type as the
2897 // allocated element type, otherwise we use i8 as the allocated element
2898 Type *AllocatedEltTy =
2899 isa<FixedVectorType>(Val: NewAllocaTy)
2900 ? cast<FixedVectorType>(Val: NewAllocaTy)->getElementType()
2901 : Type::getInt8Ty(C&: NewAI.getContext());
2902 unsigned AllocatedEltTySize = DL.getTypeSizeInBits(Ty: AllocatedEltTy);
2903
2904 // Helper to check if a type is
2905 // 1. A fixed vector type
2906 // 2. The element type is not a pointer
2907 // 3. The element type size is byte-aligned
2908 // We only handle the cases that the ld/st meet these conditions
2909 auto IsTypeValidForTreeStructuredMerge = [&](Type *Ty) -> bool {
2910 auto *FixedVecTy = dyn_cast<FixedVectorType>(Val: Ty);
2911 return FixedVecTy &&
2912 DL.getTypeSizeInBits(Ty: FixedVecTy->getElementType()) % 8 == 0 &&
2913 !FixedVecTy->getElementType()->isPointerTy();
2914 };
2915
2916 for (Slice &S : P) {
2917 auto *User = cast<Instruction>(Val: S.getUse()->getUser());
2918 // A "full-width" slice spans the entire alloca; it's either the single
2919 // init store (Pattern 2) or the single final load (both patterns).
2920 bool IsFullWidth = (S.beginOffset() == NewAllocaBeginOffset &&
2921 S.endOffset() == NewAllocaEndOffset);
2922 if (auto *LI = dyn_cast<LoadInst>(Val: User)) {
2923 // Only handle simple (non-volatile, non-atomic) loads.
2924 if (!LI->isSimple() ||
2925 !IsTypeValidForTreeStructuredMerge(LI->getType()))
2926 return std::nullopt;
2927 if (IsFullWidth) {
2928 // We accept at most one full-width load (the "final" load, after
2929 // all the partial stores).
2930 if (FullLoad)
2931 return std::nullopt;
2932 FullLoad = LI;
2933 } else {
2934 // Partial load (RMW pattern only).
2935 LoadInfos.push_back(Elt: {.Load: LI, .BeginOffset: S.beginOffset(), .EndOffset: S.endOffset()});
2936 }
2937 } else if (auto *SI = dyn_cast<StoreInst>(Val: User)) {
2938 // Do not handle the case if
2939 // 1. The store does not meet the conditions in the helper function
2940 // 2. The store is not simple — we drop stores as part of the
2941 // rewrite, so volatile stores (which must be kept) and atomic
2942 // stores (which carry memory-ordering semantics) are unsound
2943 // to replace with SSA bookkeeping.
2944 // 3. The total store size is not a multiple of the allocated
2945 // element type size (required so the tree merge can produce a
2946 // vector whose element type matches the alloca).
2947 if (!SI->isSimple() || !IsTypeValidForTreeStructuredMerge(
2948 SI->getValueOperand()->getType()))
2949 return std::nullopt;
2950 auto *StVecTy = cast<FixedVectorType>(Val: SI->getValueOperand()->getType());
2951 unsigned NumElts = StVecTy->getNumElements();
2952 unsigned EltSize = DL.getTypeSizeInBits(Ty: StVecTy->getElementType());
2953 if (NumElts * EltSize % AllocatedEltTySize != 0)
2954 return std::nullopt;
2955 if (IsFullWidth) {
2956 // At most one full-width store is allowed — it's the init store
2957 // for the RMW pattern.
2958 if (InitStore)
2959 return std::nullopt;
2960 InitStore = SI;
2961 } else {
2962 StoreInfos.emplace_back(Args&: SI, Args: S.beginOffset(), Args: S.endOffset(),
2963 Args: SI->getValueOperand());
2964 }
2965 } else {
2966 // If we have instructions other than load and store, we cannot do
2967 // the tree structured merge.
2968 return std::nullopt;
2969 }
2970 }
2971
2972 // Need at least two partial stores to benefit from tree-merging; a
2973 // single store is already optimal as-is. This applies to both patterns
2974 // below, so check it before classifying.
2975 if (StoreInfos.size() < 2)
2976 return std::nullopt;
2977
2978 // Classify the pattern by looking at what we collected:
2979 // Pattern 1 (stores-only): only partial stores + exactly one full load.
2980 // Pattern 2 (RMW): one full init store + partial loads + partial stores
2981 // (+ optional full final load). RMW also needs VecTy to be set
2982 // because we use getIndex() to convert byte offsets to element
2983 // indices, which requires a promoted vector alloca.
2984 bool IsRMWPattern = InitStore && VecTy && !LoadInfos.empty();
2985 bool IsStoresOnlyPattern = !InitStore && FullLoad && LoadInfos.empty();
2986 if (!IsRMWPattern && !IsStoresOnlyPattern)
2987 return std::nullopt;
2988
2989 // All partial stores must live in the same basic block — the tree merge
2990 // is built in a single BB using block-order ordering (comesBefore).
2991 BasicBlock *StoreBB = StoreInfos[0].Store->getParent();
2992 for (auto &Info : StoreInfos)
2993 if (Info.Store->getParent() != StoreBB)
2994 return std::nullopt;
2995
2996 SmallVector<Value *, 4> DeletedValues;
2997
2998 // Helper: pairwise tree-merge a list of vectors into a single vector.
2999 // At each iteration we merge each adjacent pair via mergeTwoVectors,
3000 // collect the merged values into Next, and (if Vals had odd length)
3001 // carry the trailing element through unchanged. Loop until one value
3002 // remains — the fully-merged vector.
3003 auto TreeMerge = [&](SmallVectorImpl<Value *> &Vals,
3004 IRBuilder<> &B) -> Value * {
3005 LLVM_DEBUG(dbgs() << " Rewrite stores into shufflevectors:\n");
3006 while (Vals.size() > 1) {
3007 SmallVector<Value *, 8> Next;
3008 for (unsigned I = 0, E = Vals.size(); I + 1 < E; I += 2) {
3009 Value *M =
3010 mergeTwoVectors(V0: Vals[I], V1: Vals[I + 1], DL, NewAIEltTy: AllocatedEltTy, Builder&: B);
3011 LLVM_DEBUG(dbgs() << " shufflevector: " << *M << "\n");
3012 Next.push_back(Elt: M);
3013 }
3014 if (Vals.size() % 2 == 1)
3015 Next.push_back(Elt: Vals.back());
3016 Vals = std::move(Next);
3017 }
3018 return Vals[0];
3019 };
3020
3021 // Replace a full-width load with a load of the freshly-merged alloca.
3022 // The merge stored a value of type Merged->getType() into NewAI; we load
3023 // that same type back so every access to NewAI stays consistently typed
3024 // (otherwise the alloca is no longer promotable).
3025 auto ReplaceFullLoad = [&](LoadInst *LoadToReplace, Value *Merged) {
3026 IRBuilder<> LoadBuilder(LoadToReplace);
3027 Value *NewLoad = LoadBuilder.CreateAlignedLoad(
3028 Ty: Merged->getType(), Ptr: &NewAI, Align: getSliceAlign(),
3029 isVolatile: LoadToReplace->isVolatile(),
3030 Name: LoadToReplace->getName() + ".sroa.new.load");
3031 if (NewLoad->getType() != LoadToReplace->getType())
3032 NewLoad = LoadBuilder.CreateBitCast(V: NewLoad, DestTy: LoadToReplace->getType());
3033 LoadToReplace->replaceAllUsesWith(V: NewLoad);
3034 DeletedValues.push_back(Elt: LoadToReplace);
3035 };
3036
3037 if (IsStoresOnlyPattern) {
3038 // Stores should not overlap and should cover the whole alloca.
3039 // Sort by begin offset to verify this with a single linear scan.
3040 llvm::sort(C&: StoreInfos, Comp: [](const StoreInfo &A, const StoreInfo &B) {
3041 return A.BeginOffset < B.BeginOffset;
3042 });
3043 // Check for gap or overlap: each begin offset must equal the previous
3044 // end offset, i.e. the store ranges must tile [NewAllocaBeginOffset,
3045 // NewAllocaEndOffset) exactly.
3046 uint64_t Expected = NewAllocaBeginOffset;
3047 for (auto &Info : StoreInfos) {
3048 if (Info.BeginOffset != Expected)
3049 return std::nullopt;
3050 Expected = Info.EndOffset;
3051 }
3052 // Stores cover the entire alloca (no trailing gap either).
3053 if (Expected != NewAllocaEndOffset)
3054 return std::nullopt;
3055
3056 // The load should not be in the middle of the stores.
3057 // Note:
3058 // If the load is in a different basic block from the stores, we can
3059 // still do the tree-structured merge. We don't have store->load
3060 // forwarding here — the merged vector is stored back to NewAI and
3061 // the new load loads from NewAI. The forwarding will be handled
3062 // later when NewAI is promoted.
3063 BasicBlock *LoadBB = FullLoad->getParent();
3064 if (LoadBB == StoreBB) {
3065 for (auto &Info : StoreInfos)
3066 if (!Info.Store->comesBefore(Other: FullLoad))
3067 return std::nullopt;
3068 }
3069
3070 LLVM_DEBUG({
3071 dbgs() << "Tree structured merge rewrite (stores-only):\n";
3072 dbgs() << " Load: " << *FullLoad << "\n Ordered stores:\n";
3073 for (auto [I, Info] : enumerate(StoreInfos)) {
3074 dbgs() << " [" << I << "] Range[" << Info.BeginOffset << ", "
3075 << Info.EndOffset << ") \tStore: " << *Info.Store
3076 << "\tValue: " << *Info.StoredValue << "\n";
3077 }
3078 });
3079
3080 // StoreInfos is sorted by offset, not by block order. Anchoring to
3081 // StoreInfos.back().Store (last by offset) can place shuffles before
3082 // operands that appear later in the block (invalid SSA). Insert before
3083 // FullLoad when it shares the store block (after all stores, before
3084 // any later IR in that block). Otherwise insert before the store
3085 // block's terminator so the merge runs after every store and any
3086 // trailing instructions in that block.
3087 IRBuilder<> Builder(LoadBB == StoreBB ? cast<Instruction>(Val: FullLoad)
3088 : StoreBB->getTerminator());
3089 SmallVector<Value *, 8> Vals;
3090 for (const auto &Info : StoreInfos) {
3091 DeletedValues.push_back(Elt: Info.Store);
3092 Vals.push_back(Elt: Info.StoredValue);
3093 }
3094 // Merge all stored values and store the merged value into the alloca.
3095 Value *Merged = TreeMerge(Vals, Builder);
3096 Builder.CreateAlignedStore(Val: Merged, Ptr: &NewAI, Align: getSliceAlign());
3097
3098 // Replace the original load with a load of the newly-merged alloca.
3099 ReplaceFullLoad(FullLoad, Merged);
3100 return DeletedValues;
3101 }
3102
3103 // RMW pattern handling starts from here.
3104 // Like StoreBB above: keep the init store, all partial loads and all
3105 // partial stores in one basic block so we can reason about ordering
3106 // with comesBefore and build SSA without PHIs.
3107 if (InitStore->getParent() != StoreBB)
3108 return std::nullopt;
3109 if (any_of(Range&: LoadInfos, P: [&](const LoadInfo &I) {
3110 return I.Load->getParent() != StoreBB;
3111 }))
3112 return std::nullopt;
3113 // FullLoad (if any) is allowed to live in a different basic block. See
3114 // the note on the stores-only path: we don't do store->load forwarding
3115 // directly — the merged vector is stored to NewAI and the new load
3116 // loads from NewAI, so cross-BB ordering is resolved later when NewAI
3117 // is promoted.
3118
3119 // Collect the combined partial-load/partial-store accesses sorted
3120 // by block order. Used both for ordering checks and for the rewrite
3121 // walk below.
3122 struct Access {
3123 Instruction *Inst;
3124 uint64_t BeginOffset, EndOffset;
3125 bool IsStore;
3126 };
3127 SmallVector<Access, 16> Accesses;
3128 Accesses.reserve(N: LoadInfos.size() + StoreInfos.size());
3129 for (const auto &L : LoadInfos)
3130 Accesses.push_back(Elt: {.Inst: L.Load, .BeginOffset: L.BeginOffset, .EndOffset: L.EndOffset, .IsStore: false});
3131 for (const auto &S : StoreInfos)
3132 Accesses.push_back(Elt: {.Inst: S.Store, .BeginOffset: S.BeginOffset, .EndOffset: S.EndOffset, .IsStore: true});
3133 llvm::sort(C&: Accesses, Comp: [](const Access &A, const Access &B) {
3134 return A.Inst->comesBefore(Other: B.Inst);
3135 });
3136
3137 // Ordering constraint 1: InitStore must come before every partial
3138 // access — they read/write the RMW state initialised by InitStore.
3139 // Accesses is sorted by block order, so the first element is the
3140 // earliest; checking it is enough.
3141 if (!InitStore->comesBefore(Other: Accesses.front().Inst))
3142 return std::nullopt;
3143 // Ordering constraint 2: when FullLoad shares the block with the
3144 // partial accesses, it must come after every one of them — otherwise
3145 // it could read a stale value. Accesses is sorted, so the last
3146 // element is the latest; checking it is enough. If FullLoad is in
3147 // another block, mem2reg forwards the merged store to it.
3148 if (FullLoad && FullLoad->getParent() == StoreBB &&
3149 !Accesses.back().Inst->comesBefore(Other: FullLoad))
3150 return std::nullopt;
3151
3152 // Coverage check: the distinct [begin, end) ranges touched by the
3153 // partial loads and stores must tile the alloca disjointly. That is
3154 // the only precondition the per-range SliceValues tracking below
3155 // needs — a disjoint tile guarantees the entries don't alias each
3156 // other. We don't check per-range load/store counts: a range with
3157 // only loads ends with SliceValues[r] = the init extract
3158 // (contributed to the final tree-merge), and a range with only
3159 // stores ends with SliceValues[r] = its last stored value. Both are
3160 // correct.
3161 using SliceRange = std::pair<uint64_t, uint64_t>;
3162 SmallVector<SliceRange, 8> SortedRanges;
3163 SortedRanges.reserve(N: Accesses.size());
3164 for (auto &Acc : Accesses)
3165 SortedRanges.emplace_back(Args&: Acc.BeginOffset, Args&: Acc.EndOffset);
3166 llvm::sort(C&: SortedRanges);
3167 SortedRanges.erase(CS: llvm::unique(R&: SortedRanges), CE: SortedRanges.end());
3168 // Disjoint + contiguous tile of the whole alloca.
3169 uint64_t Expected = NewAllocaBeginOffset;
3170 for (auto &Range : SortedRanges) {
3171 if (Range.first != Expected)
3172 return std::nullopt;
3173 Expected = Range.second;
3174 }
3175 if (Expected != NewAllocaEndOffset)
3176 return std::nullopt;
3177
3178 LLVM_DEBUG({
3179 dbgs() << "Tree structured merge rewrite (RMW):\n";
3180 dbgs() << " Init store: " << *InitStore << "\n";
3181 if (FullLoad)
3182 dbgs() << " Final load: " << *FullLoad << "\n";
3183 dbgs() << " Slice ranges (" << SortedRanges.size() << "):\n";
3184 for (auto &Range : SortedRanges)
3185 dbgs() << " [" << Range.first << ", " << Range.second << ")\n";
3186 });
3187
3188 // Initialize SliceValues: one SSA value per slice range, tracking
3189 // the value the alloca currently holds at that range. Each entry
3190 // starts at the corresponding piece of the init store, obtained by
3191 // bitcasting the init value to the alloca's vector type (if needed)
3192 // and extracting the slice's sub-range.
3193 IRB.SetInsertPoint(InitStore->getNextNode());
3194 Value *InitVec = InitStore->getValueOperand();
3195 if (InitVec->getType() != NewAllocaTy)
3196 InitVec = IRB.CreateBitCast(V: InitVec, DestTy: NewAllocaTy, Name: "init.cast");
3197 DenseMap<SliceRange, Value *> SliceValues;
3198 for (auto &Range : SortedRanges) {
3199 unsigned BeginIdx = getIndex(Offset: Range.first);
3200 unsigned EndIdx = getIndex(Offset: Range.second);
3201 SliceValues[Range] = IRB.CreateShuffleVector(
3202 V: InitVec, Mask: createSequentialMask(Start: BeginIdx, NumInts: EndIdx - BeginIdx, NumUndefs: 0),
3203 Name: "init.extract");
3204 }
3205 // The init store itself becomes dead — its value is consumed via the
3206 // extracts above.
3207 DeletedValues.push_back(Elt: InitStore);
3208
3209 // Walk accesses in block order:
3210 // - partial load at range r: replace with SliceValues[r] (bitcast
3211 // if the load's type differs from the current tracked value's
3212 // type, e.g. because a previous store wrote a vector with a
3213 // different element type);
3214 // - partial store at range r: update SliceValues[r] to the stored
3215 // value and drop the store.
3216 for (auto &Acc : Accesses) {
3217 SliceRange Range{Acc.BeginOffset, Acc.EndOffset};
3218 if (!Acc.IsStore) {
3219 Value *V = SliceValues[Range];
3220 if (V->getType() != Acc.Inst->getType()) {
3221 IRB.SetInsertPoint(cast<LoadInst>(Val: Acc.Inst));
3222 V = IRB.CreateBitCast(V, DestTy: Acc.Inst->getType());
3223 }
3224 Acc.Inst->replaceAllUsesWith(V);
3225 } else {
3226 SliceValues[Range] = cast<StoreInst>(Val: Acc.Inst)->getValueOperand();
3227 }
3228 DeletedValues.push_back(Elt: Acc.Inst);
3229 }
3230
3231 // Tree-merge the final per-range values (in range order) into the
3232 // alloca's final vector value. Anchor the IRBuilder to FullLoad (when it
3233 // shares the partial-access block) or otherwise to the block's
3234 // terminator — never to a partial access, since those are queued for
3235 // deletion. Both anchors are guaranteed to dominate every SliceValues
3236 // entry: each one is either an init extract (before any access) or a
3237 // stored value defined before its (now-deleted) store.
3238 IRBuilder<> Builder(FullLoad && FullLoad->getParent() == StoreBB
3239 ? cast<Instruction>(Val: FullLoad)
3240 : StoreBB->getTerminator());
3241 SmallVector<Value *, 8> Vals;
3242 for (auto &Range : SortedRanges)
3243 Vals.push_back(Elt: SliceValues[Range]);
3244 Value *Merged = TreeMerge(Vals, Builder);
3245 Builder.CreateAlignedStore(Val: Merged, Ptr: &NewAI, Align: getSliceAlign());
3246
3247 // Replace the optional final full-width load with a load of the newly
3248 // merged alloca. Later promotion will forward the store above to it.
3249 if (FullLoad)
3250 ReplaceFullLoad(FullLoad, Merged);
3251
3252 return DeletedValues;
3253 }
3254
private:
  // Make sure the other visit overloads are visible: this using-declaration
  // re-exports every `visit` overload inherited from Base, which a `visit`
  // member declared in this class would otherwise hide.
  using Base::visit;
3258
// Every instruction which can end up as a user must have a rewrite rule.
//
// This overload takes a plain Instruction, so it is presumably the visitor's
// catch-all for instruction kinds without a dedicated visit method (confirm
// against Base). It prints the offending instruction under LLVM_DEBUG and
// then reports an unreachable state; it never returns a value.
bool visitInstruction(Instruction &I) {
  LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
  llvm_unreachable("No rewrite rule for this instruction!");
}
3264
3265 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
3266 // Note that the offset computation can use BeginOffset or NewBeginOffset
3267 // interchangeably for unsplit slices.
3268 assert(IsSplit || BeginOffset == NewBeginOffset);
3269 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3270
3271 StringRef OldName = OldPtr->getName();
3272 // Skip through the last '.sroa.' component of the name.
3273 size_t LastSROAPrefix = OldName.rfind(Str: ".sroa.");
3274 if (LastSROAPrefix != StringRef::npos) {
3275 OldName = OldName.substr(Start: LastSROAPrefix + strlen(s: ".sroa."));
3276 // Look for an SROA slice index.
3277 size_t IndexEnd = OldName.find_first_not_of(Chars: "0123456789");
3278 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
3279 // Strip the index and look for the offset.
3280 OldName = OldName.substr(Start: IndexEnd + 1);
3281 size_t OffsetEnd = OldName.find_first_not_of(Chars: "0123456789");
3282 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
3283 // Strip the offset.
3284 OldName = OldName.substr(Start: OffsetEnd + 1);
3285 }
3286 }
3287 // Strip any SROA suffixes as well.
3288 OldName = OldName.substr(Start: 0, N: OldName.find(Str: ".sroa_"));
3289
3290 return getAdjustedPtr(IRB, DL, Ptr: &NewAI,
3291 Offset: APInt(DL.getIndexTypeSizeInBits(Ty: PointerTy), Offset),
3292 PointerTy, NamePrefix: Twine(OldName) + ".");
3293 }
3294
3295 /// Compute suitable alignment to access this slice of the *new*
3296 /// alloca.
3297 ///
3298 /// You can optionally pass a type to this routine and if that type's ABI
3299 /// alignment is itself suitable, this will return zero.
3300 Align getSliceAlign() {
3301 return commonAlignment(A: NewAI.getAlign(),
3302 Offset: NewBeginOffset - NewAllocaBeginOffset);
3303 }
3304
3305 unsigned getIndex(uint64_t Offset) {
3306 assert(VecTy && "Can only call getIndex when rewriting a vector");
3307 uint64_t RelOffset = Offset - NewAllocaBeginOffset;
3308 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
3309 uint32_t Index = RelOffset / ElementSize;
3310 assert(Index * ElementSize == RelOffset);
3311 return Index;
3312 }
3313
3314 void deleteIfTriviallyDead(Value *V) {
3315 Instruction *I = cast<Instruction>(Val: V);
3316 if (isInstructionTriviallyDead(I))
3317 Pass.DeadInsts.push_back(Elt: I);
3318 }
3319
3320 Value *rewriteVectorizedLoadInst(LoadInst &LI) {
3321 unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3322 unsigned EndIndex = getIndex(Offset: NewEndOffset);
3323 assert(EndIndex > BeginIndex && "Empty vector!");
3324
3325 LoadInst *Load =
3326 IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3327
3328 Load->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3329 LLVMContext::MD_access_group});
3330 return extractVector(IRB, V: Load, BeginIndex, EndIndex, Name: "vec");
3331 }
3332
3333 Value *rewriteIntegerLoad(LoadInst &LI) {
3334 assert(IntTy && "We cannot insert an integer to the alloca");
3335 assert(!LI.isVolatile());
3336 Value *V =
3337 IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3338 V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: IntTy);
3339 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3340 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3341 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
3342 IntegerType *ExtractTy = Type::getIntNTy(C&: LI.getContext(), N: SliceSize * 8);
3343 V = extractInteger(DL, IRB, V, Ty: ExtractTy, Offset, Name: "extract");
3344 }
3345 // It is possible that the extracted type is not the load type. This
3346 // happens if there is a load past the end of the alloca, and as
3347 // a consequence the slice is narrower but still a candidate for integer
3348 // lowering. To handle this case, we just zero extend the extracted
3349 // integer.
3350 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
3351 "Can only handle an extract for an overly wide load");
3352 if (cast<IntegerType>(Val: LI.getType())->getBitWidth() > SliceSize * 8)
3353 V = IRB.CreateZExt(V, DestTy: LI.getType());
3354 return V;
3355 }
3356
/// Rewrite a load through the old alloca pointer into a load of the new
/// alloca.
///
/// One of four strategies produces the loaded value:
///  - a vector alloca (VecTy set) goes through rewriteVectorizedLoadInst;
///  - an integer load of an integer-widened alloca (IntTy set) goes through
///    rewriteIntegerLoad;
///  - a slice spanning the whole new alloca whose type is convertible (or
///    a non-volatile integer load wider than the slice) loads the alloca
///    directly;
///  - anything else loads TargetTy through an offset-adjusted slice pointer.
/// The value is then converted to TargetTy. For a split load it is inserted
/// into a placeholder of the original load's type so the original load's
/// users see a full-width value. \p LI is queued on Pass.DeadInsts.
///
/// \returns true when \p LI is not volatile and the adjusted-pointer
///          strategy was not used.
bool visitLoadInst(LoadInst &LI) {
  LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
  Value *OldOp = LI.getOperand(i_nocapture: 0);
  assert(OldOp == OldPtr);

  AAMDNodes AATags = LI.getAAMetadata();

  unsigned AS = LI.getPointerAddressSpace();

  // A split load only covers SliceSize bytes of the original access, so it
  // is rewritten as an integer of exactly that many bits.
  Type *TargetTy = IsSplit ? Type::getIntNTy(C&: LI.getContext(), N: SliceSize * 8)
                           : LI.getType();
  bool IsPtrAdjusted = false;
  Value *V;
  if (VecTy) {
    V = rewriteVectorizedLoadInst(LI);
  } else if (IntTy && LI.getType()->isIntegerTy()) {
    V = rewriteIntegerLoad(LI);
  } else if (NewBeginOffset == NewAllocaBeginOffset &&
             NewEndOffset == NewAllocaEndOffset &&
             (canConvertValue(DL, OldTy: NewAllocaTy, NewTy: TargetTy) ||
              (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
               DL.getTypeStoreSize(Ty: TargetTy).getFixedValue() > SliceSize &&
               !LI.isVolatile()))) {
    // The slice is the whole new alloca: load the alloca's own type.
    Value *NewPtr =
        getPtrToNewAI(AddrSpace: LI.getPointerAddressSpace(), IsVolatile: LI.isVolatile());
    LoadInst *NewLI = IRB.CreateAlignedLoad(
        Ty: NewAllocaTy, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: LI.isVolatile(), Name: LI.getName());
    // Atomic ordering is only carried over for volatile loads here.
    if (LI.isVolatile())
      NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
    // An atomic load keeps the original load's alignment.
    if (NewLI->isAtomic())
      NewLI->setAlignment(LI.getAlign());

    // Copy any metadata that is valid for the new load. This may require
    // conversion to a different kind of metadata, e.g. !nonnull might change
    // to !range or vice versa.
    copyMetadataForLoad(Dest&: *NewLI, Source: LI);

    // Do this after copyMetadataForLoad() to preserve the TBAA shift.
    if (AATags)
      NewLI->setAAMetadata(AATags.adjustForAccess(
          Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));

    // From here on the new load is the value being converted.
    V = NewLI;

    // If this is an integer load past the end of the slice (which means the
    // bytes outside the slice are undef or this load is dead) just forcibly
    // fix the integer size with correct handling of endianness.
    if (auto *AITy = dyn_cast<IntegerType>(Val: NewAllocaTy))
      if (auto *TITy = dyn_cast<IntegerType>(Val: TargetTy))
        if (AITy->getBitWidth() < TITy->getBitWidth()) {
          V = IRB.CreateZExt(V, DestTy: TITy, Name: "load.ext");
          // On big-endian targets the zero-extended value is shifted left by
          // the width difference.
          if (DL.isBigEndian())
            V = IRB.CreateShl(LHS: V, RHS: TITy->getBitWidth() - AITy->getBitWidth(),
                              Name: "endian_shift");
        }
  } else {
    // Generic path: load TargetTy through a pointer adjusted to the slice's
    // offset inside the new alloca.
    Type *LTy = IRB.getPtrTy(AddrSpace: AS);
    LoadInst *NewLI =
        IRB.CreateAlignedLoad(Ty: TargetTy, Ptr: getNewAllocaSlicePtr(IRB, PointerTy: LTy),
                              Align: getSliceAlign(), isVolatile: LI.isVolatile(), Name: LI.getName());

    if (AATags)
      NewLI->setAAMetadata(AATags.adjustForAccess(
          Offset: NewBeginOffset - BeginOffset, AccessTy: NewLI->getType(), DL));

    if (LI.isVolatile())
      NewLI->setAtomic(Ordering: LI.getOrdering(), SSID: LI.getSyncScopeID());
    NewLI->copyMetadata(SrcInst: LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
                               LLVMContext::MD_access_group});

    V = NewLI;
    IsPtrAdjusted = true;
  }
  V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: TargetTy);

  if (IsSplit) {
    assert(!LI.isVolatile());
    assert(LI.getType()->isIntegerTy() &&
           "Only integer type loads and stores are split");
    assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
           "Split load isn't smaller than original load");
    assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
           "Non-byte-multiple bit width");
    // Move the insertion point just past the load so that we can refer to it.
    BasicBlock::iterator LIIt = std::next(x: LI.getIterator());
    // Ensure the insertion point comes before any debug-info immediately
    // after the load, so that variable values referring to the load are
    // dominated by it.
    LIIt.setHeadBit(true);
    IRB.SetInsertPoint(TheBB: LI.getParent(), IP: LIIt);
    // Create a placeholder value with the same type as LI to use as the
    // basis for the new value. This allows us to replace the uses of LI with
    // the computed value, and then replace the placeholder with LI, leaving
    // LI only used for this computation.
    Value *Placeholder =
        new LoadInst(LI.getType(), PoisonValue::get(T: IRB.getPtrTy(AddrSpace: AS)), "",
                     false, Align(1));
    V = insertInteger(DL, IRB, Old: Placeholder, V, Offset: NewBeginOffset - BeginOffset,
                      Name: "insert");
    LI.replaceAllUsesWith(V);
    Placeholder->replaceAllUsesWith(V: &LI);
    // The placeholder was never inserted into a block, so it is destroyed
    // directly.
    Placeholder->deleteValue();
  } else {
    LI.replaceAllUsesWith(V);
  }

  Pass.DeadInsts.push_back(Elt: &LI);
  deleteIfTriviallyDead(V: OldOp);
  LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
  return !LI.isVolatile() && !IsPtrAdjusted;
}
3469
3470 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3471 AAMDNodes AATags) {
3472 // Capture V for the purpose of debug-info accounting once it's converted
3473 // to a vector store.
3474 Value *OrigV = V;
3475 if (V->getType() != VecTy) {
3476 unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
3477 unsigned EndIndex = getIndex(Offset: NewEndOffset);
3478 assert(EndIndex > BeginIndex && "Empty vector!");
3479 unsigned NumElements = EndIndex - BeginIndex;
3480 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3481 "Too many elements!");
3482 Type *SliceTy = (NumElements == 1)
3483 ? ElementTy
3484 : FixedVectorType::get(ElementType: ElementTy, NumElts: NumElements);
3485 if (V->getType() != SliceTy)
3486 V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: SliceTy);
3487
3488 // Mix in the existing elements.
3489 Value *Old =
3490 IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3491 V = insertVector(IRB, Old, V, BeginIndex, Name: "vec");
3492 }
3493 StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
3494 Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3495 LLVMContext::MD_access_group});
3496 if (AATags)
3497 Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3498 AccessTy: V->getType(), DL));
3499 Pass.DeadInsts.push_back(Elt: &SI);
3500
3501 // NOTE: Careful to use OrigV rather than V.
3502 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
3503 Inst: Store, Dest: Store->getPointerOperand(), Value: OrigV, DL);
3504 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3505 return true;
3506 }
3507
3508 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3509 assert(IntTy && "We cannot extract an integer from the alloca");
3510 assert(!SI.isVolatile());
3511 if (DL.getTypeSizeInBits(Ty: V->getType()).getFixedValue() !=
3512 IntTy->getBitWidth()) {
3513 Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
3514 Name: "oldload");
3515 Old = IRB.CreateBitPreservingCastChain(DL, V: Old, NewTy: IntTy);
3516 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3517 uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3518 V = insertInteger(DL, IRB, Old, V: SI.getValueOperand(), Offset, Name: "insert");
3519 }
3520 V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
3521 StoreInst *Store = IRB.CreateAlignedStore(Val: V, Ptr: &NewAI, Align: NewAI.getAlign());
3522 Store->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
3523 LLVMContext::MD_access_group});
3524 if (AATags)
3525 Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3526 AccessTy: V->getType(), DL));
3527
3528 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
3529 Inst: Store, Dest: Store->getPointerOperand(),
3530 Value: Store->getValueOperand(), DL);
3531
3532 Pass.DeadInsts.push_back(Elt: &SI);
3533 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
3534 return true;
3535 }
3536
/// Rewrite a store through the old alloca pointer into a store to the new
/// alloca.
///
/// Steps, in order:
///  - if the stored value is a pointer rooted at an alloca, that alloca is
///    added to Pass.PostPromotionWorklist;
///  - if the slice is smaller than the stored value's store size, the
///    slice's bytes are extracted from the value as a narrower integer;
///  - vector and integer-widened allocas are handed to
///    rewriteVectorizedStoreInst / rewriteIntegerStore;
///  - otherwise the value is stored either to the whole new alloca (when the
///    slice spans it and the value is convertible) or through an
///    offset-adjusted slice pointer.
/// \p SI is queued on Pass.DeadInsts.
///
/// \returns On the generic path, true when the new store writes directly to
///          the new alloca with a value of the alloca's type and \p SI is
///          not volatile. The vector/integer helpers supply the result on
///          their paths.
bool visitStoreInst(StoreInst &SI) {
  LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
  Value *OldOp = SI.getOperand(i_nocapture: 1);
  assert(OldOp == OldPtr);

  AAMDNodes AATags = SI.getAAMetadata();
  Value *V = SI.getValueOperand();

  // Strip all inbounds GEPs and pointer casts to try to dig out any root
  // alloca that should be re-examined after promoting this alloca.
  if (V->getType()->isPointerTy())
    if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: V->stripInBoundsOffsets()))
      Pass.PostPromotionWorklist.insert(X: AI);

  // The slice covers only part of the stored value: keep just the bytes
  // that belong to this slice.
  TypeSize StoreSize = DL.getTypeStoreSize(Ty: V->getType());
  if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
    assert(!SI.isVolatile());
    assert(V->getType()->isIntegerTy() &&
           "Only integer type loads and stores are split");
    assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
           "Non-byte-multiple bit width");
    IntegerType *NarrowTy = Type::getIntNTy(C&: SI.getContext(), N: SliceSize * 8);
    V = extractInteger(DL, IRB, V, Ty: NarrowTy, Offset: NewBeginOffset - BeginOffset,
                       Name: "extract");
  }

  if (VecTy)
    return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
  if (IntTy && V->getType()->isIntegerTy())
    return rewriteIntegerStore(V, SI, AATags);

  StoreInst *NewSI;
  if (NewBeginOffset == NewAllocaBeginOffset &&
      NewEndOffset == NewAllocaEndOffset &&
      canConvertValue(DL, OldTy: V->getType(), NewTy: NewAllocaTy)) {
    // The slice is the whole new alloca: convert the value to the alloca's
    // type and store it directly.
    V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
    Value *NewPtr =
        getPtrToNewAI(AddrSpace: SI.getPointerAddressSpace(), IsVolatile: SI.isVolatile());

    NewSI =
        IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: SI.isVolatile());
  } else {
    // Generic path: store through a pointer adjusted to the slice's offset.
    unsigned AS = SI.getPointerAddressSpace();
    Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: IRB.getPtrTy(AddrSpace: AS));
    NewSI =
        IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: getSliceAlign(), isVolatile: SI.isVolatile());
  }
  NewSI->copyMetadata(SrcInst: SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
                             LLVMContext::MD_access_group});
  if (AATags)
    NewSI->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
                                                AccessTy: V->getType(), DL));
  // Atomic ordering is only carried over for volatile stores here.
  if (SI.isVolatile())
    NewSI->setAtomic(Ordering: SI.getOrdering(), SSID: SI.getSyncScopeID());
  // An atomic store keeps the original store's alignment.
  if (NewSI->isAtomic())
    NewSI->setAlignment(SI.getAlign());

  migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &SI,
                   Inst: NewSI, Dest: NewSI->getPointerOperand(),
                   Value: NewSI->getValueOperand(), DL);

  Pass.DeadInsts.push_back(Elt: &SI);
  deleteIfTriviallyDead(V: OldOp);

  LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
  return NewSI->getPointerOperand() == &NewAI &&
         NewSI->getValueOperand()->getType() == NewAllocaTy &&
         !SI.isVolatile();
}
3606
3607 /// Compute an integer value from splatting an i8 across the given
3608 /// number of bytes.
3609 ///
3610 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3611 /// call this routine.
3612 /// FIXME: Heed the advice above.
3613 ///
3614 /// \param V The i8 value to splat.
3615 /// \param Size The number of bytes in the output (assuming i8 is one byte)
3616 Value *getIntegerSplat(Value *V, unsigned Size) {
3617 assert(Size > 0 && "Expected a positive number of bytes.");
3618 IntegerType *VTy = cast<IntegerType>(Val: V->getType());
3619 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3620 if (Size == 1)
3621 return V;
3622
3623 Type *SplatIntTy = Type::getIntNTy(C&: VTy->getContext(), N: Size * 8);
3624 V = IRB.CreateMul(
3625 LHS: IRB.CreateZExt(V, DestTy: SplatIntTy, Name: "zext"),
3626 RHS: IRB.CreateUDiv(LHS: Constant::getAllOnesValue(Ty: SplatIntTy),
3627 RHS: IRB.CreateZExt(V: Constant::getAllOnesValue(Ty: V->getType()),
3628 DestTy: SplatIntTy)),
3629 Name: "isplat");
3630 return V;
3631 }
3632
3633 /// Compute a vector splat for a given element value.
3634 Value *getVectorSplat(Value *V, unsigned NumElements) {
3635 V = IRB.CreateVectorSplat(NumElts: NumElements, V, Name: "vsplat");
3636 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
3637 return V;
3638 }
3639
/// Rewrite a memset whose destination is the old alloca pointer.
///
/// Three outcomes are possible:
///  - variable length: the intrinsic is kept and only its destination and
///    destination alignment are retargeted to the new alloca slice;
///  - the fill cannot be expressed as a single value of the new alloca's
///    type: a new memset covering just this slice is emitted;
///  - otherwise the byte is splatted into a value of the alloca's type
///    (blended with the existing contents for partial vector/integer
///    slices) and written with an ordinary store.
/// In the last two cases \p II is queued on Pass.DeadInsts.
///
/// \returns false for the first two outcomes; for the store outcome, true
///          unless \p II is volatile.
bool visitMemSetInst(MemSetInst &II) {
  LLVM_DEBUG(dbgs() << " original: " << II << "\n");
  assert(II.getRawDest() == OldPtr);

  AAMDNodes AATags = II.getAAMetadata();

  // If the memset has a variable size, it cannot be split, just adjust the
  // pointer to the new alloca.
  if (!isa<ConstantInt>(Val: II.getLength())) {
    assert(!IsSplit);
    assert(NewBeginOffset == BeginOffset);
    II.setDest(getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()));
    II.setDestAlignment(getSliceAlign());
    // In theory we should call migrateDebugInfo here. However, we do not
    // emit dbg.assign intrinsics for mem intrinsics storing through non-
    // constant geps, or storing a variable number of bytes.
    assert(at::getDVRAssignmentMarkers(&II).empty() &&
           "AT: Unexpected link to non-const GEP");
    deleteIfTriviallyDead(V: OldPtr);
    return false;
  }

  // Record this instruction for deletion.
  Pass.DeadInsts.push_back(Elt: &II);

  Type *ScalarTy = NewAllocaTy->getScalarType();

  // Decide whether the memset can be turned into a store of a single value.
  // Vector and integer-widened allocas always can. Otherwise the memset must
  // cover the whole new alloca, its length must fit in `unsigned`, an i8
  // vector of that length must be convertible to the alloca type, and the
  // alloca's scalar width must be a legal integer width.
  const bool CanContinue = [&]() {
    if (VecTy || IntTy)
      return true;
    if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
      return false;
    // Length must be in range for FixedVectorType.
    auto *C = cast<ConstantInt>(Val: II.getLength());
    const uint64_t Len = C->getLimitedValue();
    if (Len > std::numeric_limits<unsigned>::max())
      return false;
    auto *Int8Ty = IntegerType::getInt8Ty(C&: NewAI.getContext());
    auto *SrcTy = FixedVectorType::get(ElementType: Int8Ty, NumElts: Len);
    return canConvertValue(DL, OldTy: SrcTy, NewTy: NewAllocaTy) &&
           DL.isLegalInteger(Width: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue());
  }();

  // If this doesn't map cleanly onto the alloca type, and that type isn't
  // a single value type, just emit a memset.
  if (!CanContinue) {
    Type *SizeTy = II.getLength()->getType();
    // The replacement memset covers only this slice's bytes.
    unsigned Sz = NewEndOffset - NewBeginOffset;
    Constant *Size = ConstantInt::get(Ty: SizeTy, V: Sz);
    MemIntrinsic *New = cast<MemIntrinsic>(Val: IRB.CreateMemSet(
        Ptr: getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType()), Val: II.getValue(), Size,
        Align: MaybeAlign(getSliceAlign()), isVolatile: II.isVolatile()));
    if (AATags)
      New->setAAMetadata(
          AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset, AccessSize: Sz));

    migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
                     Inst: New, Dest: New->getRawDest(), Value: nullptr, DL);

    LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
    return false;
  }

  // If we can represent this as a simple value, we have to build the actual
  // value to store, which requires expanding the byte present in memset to
  // a sensible representation for the alloca type. This is essentially
  // splatting the byte to a sufficiently wide integer, splatting it across
  // any desired vector width, and bitcasting to the final type.
  Value *V;

  if (VecTy) {
    // If this is a memset of a vectorized alloca, insert it.
    assert(ElementTy == ScalarTy);

    unsigned BeginIndex = getIndex(Offset: NewBeginOffset);
    unsigned EndIndex = getIndex(Offset: NewEndOffset);
    assert(EndIndex > BeginIndex && "Empty vector!");
    unsigned NumElements = EndIndex - BeginIndex;
    assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
           "Too many elements!");

    // Splat the byte to one element's width, then across the covered
    // elements.
    Value *Splat = getIntegerSplat(
        V: II.getValue(), Size: DL.getTypeSizeInBits(Ty: ElementTy).getFixedValue() / 8);
    Splat = IRB.CreateBitPreservingCastChain(DL, V: Splat, NewTy: ElementTy);
    if (NumElements > 1)
      Splat = getVectorSplat(V: Splat, NumElements);

    // Blend the splat into the alloca's current contents.
    Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
                                       Name: "oldload");
    V = insertVector(IRB, Old, V: Splat, BeginIndex, Name: "vec");
  } else if (IntTy) {
    // If this is a memset on an alloca where we can widen stores, insert the
    // set integer.
    assert(!II.isVolatile());

    uint64_t Size = NewEndOffset - NewBeginOffset;
    V = getIntegerSplat(V: II.getValue(), Size);

    // NOTE(review): IntTy is already known to be set in this branch, so the
    // `IntTy &&` test below is redundant.
    if (IntTy && (NewBeginOffset != NewAllocaBeginOffset ||
                  NewEndOffset != NewAllocaEndOffset)) {
      // Partial slice: insert the splat into the existing integer value.
      Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI,
                                         Align: NewAI.getAlign(), Name: "oldload");
      Old = IRB.CreateBitPreservingCastChain(DL, V: Old, NewTy: IntTy);
      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
      V = insertInteger(DL, IRB, Old, V, Offset, Name: "insert");
    } else {
      assert(V->getType() == IntTy &&
             "Wrong type for an alloca wide integer!");
    }
    V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
  } else {
    // Established these invariants above.
    assert(NewBeginOffset == NewAllocaBeginOffset);
    assert(NewEndOffset == NewAllocaEndOffset);

    V = getIntegerSplat(V: II.getValue(),
                        Size: DL.getTypeSizeInBits(Ty: ScalarTy).getFixedValue() / 8);
    if (VectorType *AllocaVecTy = dyn_cast<VectorType>(Val: NewAllocaTy))
      V = getVectorSplat(
          V, NumElements: cast<FixedVectorType>(Val: AllocaVecTy)->getNumElements());

    V = IRB.CreateBitPreservingCastChain(DL, V, NewTy: NewAllocaTy);
  }

  // Write the assembled value with a plain store to the new alloca.
  Value *NewPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
  StoreInst *New =
      IRB.CreateAlignedStore(Val: V, Ptr: NewPtr, Align: NewAI.getAlign(), isVolatile: II.isVolatile());
  New->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
                           LLVMContext::MD_access_group});
  if (AATags)
    New->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
                                              AccessTy: V->getType(), DL));

  migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
                   Inst: New, Dest: New->getPointerOperand(), Value: V, DL);

  LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
  return !II.isVolatile();
}
3779
3780 bool visitMemTransferInst(MemTransferInst &II) {
3781 // Rewriting of memory transfer instructions can be a bit tricky. We break
3782 // them into two categories: split intrinsics and unsplit intrinsics.
3783
3784 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
3785
3786 AAMDNodes AATags = II.getAAMetadata();
3787
3788 bool IsDest = &II.getRawDestUse() == OldUse;
3789 assert((IsDest && II.getRawDest() == OldPtr) ||
3790 (!IsDest && II.getRawSource() == OldPtr));
3791
3792 Align SliceAlign = getSliceAlign();
3793 // For unsplit intrinsics, we simply modify the source and destination
3794 // pointers in place. This isn't just an optimization, it is a matter of
3795 // correctness. With unsplit intrinsics we may be dealing with transfers
3796 // within a single alloca before SROA ran, or with transfers that have
3797 // a variable length. We may also be dealing with memmove instead of
// memcpy, and so simply updating the pointers is necessary for us to
// update both source and dest of a single call.
3800 if (!IsSplittable) {
3801 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3802 if (IsDest) {
3803 // Update the address component of linked dbg.assigns.
3804 for (DbgVariableRecord *DbgAssign : at::getDVRAssignmentMarkers(Inst: &II)) {
3805 if (llvm::is_contained(Range: DbgAssign->location_ops(), Element: II.getDest()) ||
3806 DbgAssign->getAddress() == II.getDest())
3807 DbgAssign->replaceVariableLocationOp(OldValue: II.getDest(), NewValue: AdjustedPtr);
3808 }
3809 II.setDest(AdjustedPtr);
3810 II.setDestAlignment(SliceAlign);
3811 } else {
3812 II.setSource(AdjustedPtr);
3813 II.setSourceAlignment(SliceAlign);
3814 }
3815
3816 LLVM_DEBUG(dbgs() << " to: " << II << "\n");
3817 deleteIfTriviallyDead(V: OldPtr);
3818 return false;
3819 }
3820 // For split transfer intrinsics we have an incredibly useful assurance:
3821 // the source and destination do not reside within the same alloca, and at
3822 // least one of them does not escape. This means that we can replace
3823 // memmove with memcpy, and we don't need to worry about all manner of
3824 // downsides to splitting and transforming the operations.
3825
3826 // If this doesn't map cleanly onto the alloca type, and that type isn't
3827 // a single value type, just emit a memcpy.
3828 bool EmitMemCpy =
3829 !VecTy && !IntTy &&
3830 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3831 SliceSize != DL.getTypeStoreSize(Ty: NewAllocaTy).getFixedValue() ||
3832 !DL.typeSizeEqualsStoreSize(Ty: NewAllocaTy) ||
3833 !NewAllocaTy->isSingleValueType());
3834
3835 // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3836 // size hasn't been shrunk based on analysis of the viable range, this is
3837 // a no-op.
3838 if (EmitMemCpy && &OldAI == &NewAI) {
3839 // Ensure the start lines up.
3840 assert(NewBeginOffset == BeginOffset);
3841
3842 // Rewrite the size as needed.
3843 if (NewEndOffset != EndOffset)
3844 II.setLength(NewEndOffset - NewBeginOffset);
3845 return false;
3846 }
3847 // Record this instruction for deletion.
3848 Pass.DeadInsts.push_back(Elt: &II);
3849
3850 // Strip all inbounds GEPs and pointer casts to try to dig out any root
3851 // alloca that should be re-examined after rewriting this instruction.
3852 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3853 if (AllocaInst *AI =
3854 dyn_cast<AllocaInst>(Val: OtherPtr->stripInBoundsOffsets())) {
3855 assert(AI != &OldAI && AI != &NewAI &&
3856 "Splittable transfers cannot reach the same alloca on both ends.");
3857 Pass.Worklist.insert(X: AI);
3858 }
3859
3860 Type *OtherPtrTy = OtherPtr->getType();
3861 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3862
3863 // Compute the relative offset for the other pointer within the transfer.
3864 unsigned OffsetWidth = DL.getIndexSizeInBits(AS: OtherAS);
3865 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3866 Align OtherAlign =
3867 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3868 OtherAlign =
3869 commonAlignment(A: OtherAlign, Offset: OtherOffset.zextOrTrunc(width: 64).getZExtValue());
3870
3871 if (EmitMemCpy) {
3872 // Compute the other pointer, folding as much as possible to produce
3873 // a single, simple GEP in most cases.
3874 OtherPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
3875 NamePrefix: OtherPtr->getName() + ".");
3876
3877 Value *OurPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
3878 Type *SizeTy = II.getLength()->getType();
3879 Constant *Size = ConstantInt::get(Ty: SizeTy, V: NewEndOffset - NewBeginOffset);
3880
3881 Value *DestPtr, *SrcPtr;
3882 MaybeAlign DestAlign, SrcAlign;
3883 // Note: IsDest is true iff we're copying into the new alloca slice
3884 if (IsDest) {
3885 DestPtr = OurPtr;
3886 DestAlign = SliceAlign;
3887 SrcPtr = OtherPtr;
3888 SrcAlign = OtherAlign;
3889 } else {
3890 DestPtr = OtherPtr;
3891 DestAlign = OtherAlign;
3892 SrcPtr = OurPtr;
3893 SrcAlign = SliceAlign;
3894 }
3895 CallInst *New = IRB.CreateMemCpy(Dst: DestPtr, DstAlign: DestAlign, Src: SrcPtr, SrcAlign,
3896 Size, isVolatile: II.isVolatile());
3897 if (AATags)
3898 New->setAAMetadata(AATags.shift(Offset: NewBeginOffset - BeginOffset));
3899
3900 APInt Offset(DL.getIndexTypeSizeInBits(Ty: DestPtr->getType()), 0);
3901 if (IsDest) {
3902 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8,
3903 OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
3904 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3905 Val: DestPtr->stripAndAccumulateConstantOffsets(
3906 DL, Offset, /*AllowNonInbounds*/ true))) {
3907 migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * 8,
3908 SliceSizeInBits: SliceSize * 8, OldInst: &II, Inst: New, Dest: DestPtr, Value: nullptr, DL);
3909 }
3910 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
3911 return false;
3912 }
3913
3914 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3915 NewEndOffset == NewAllocaEndOffset;
3916 uint64_t Size = NewEndOffset - NewBeginOffset;
3917 unsigned BeginIndex = VecTy ? getIndex(Offset: NewBeginOffset) : 0;
3918 unsigned EndIndex = VecTy ? getIndex(Offset: NewEndOffset) : 0;
3919 unsigned NumElements = EndIndex - BeginIndex;
3920 IntegerType *SubIntTy =
3921 IntTy ? Type::getIntNTy(C&: IntTy->getContext(), N: Size * 8) : nullptr;
3922
3923 // Reset the other pointer type to match the register type we're going to
3924 // use, but using the address space of the original other pointer.
3925 Type *OtherTy;
3926 if (VecTy && !IsWholeAlloca) {
3927 if (NumElements == 1)
3928 OtherTy = VecTy->getElementType();
3929 else
3930 OtherTy = FixedVectorType::get(ElementType: VecTy->getElementType(), NumElts: NumElements);
3931 } else if (IntTy && !IsWholeAlloca) {
3932 OtherTy = SubIntTy;
3933 } else {
3934 OtherTy = NewAllocaTy;
3935 }
3936
3937 Value *AdjPtr = getAdjustedPtr(IRB, DL, Ptr: OtherPtr, Offset: OtherOffset, PointerTy: OtherPtrTy,
3938 NamePrefix: OtherPtr->getName() + ".");
3939 MaybeAlign SrcAlign = OtherAlign;
3940 MaybeAlign DstAlign = SliceAlign;
3941 if (!IsDest)
3942 std::swap(a&: SrcAlign, b&: DstAlign);
3943
3944 Value *SrcPtr;
3945 Value *DstPtr;
3946
3947 if (IsDest) {
3948 DstPtr = getPtrToNewAI(AddrSpace: II.getDestAddressSpace(), IsVolatile: II.isVolatile());
3949 SrcPtr = AdjPtr;
3950 } else {
3951 DstPtr = AdjPtr;
3952 SrcPtr = getPtrToNewAI(AddrSpace: II.getSourceAddressSpace(), IsVolatile: II.isVolatile());
3953 }
3954
3955 Value *Src;
3956 if (VecTy && !IsWholeAlloca && !IsDest) {
3957 Src =
3958 IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3959 Src = extractVector(IRB, V: Src, BeginIndex, EndIndex, Name: "vec");
3960 } else if (IntTy && !IsWholeAlloca && !IsDest) {
3961 Src =
3962 IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(), Name: "load");
3963 Src = IRB.CreateBitPreservingCastChain(DL, V: Src, NewTy: IntTy);
3964 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3965 Src = extractInteger(DL, IRB, V: Src, Ty: SubIntTy, Offset, Name: "extract");
3966 } else {
3967 LoadInst *Load = IRB.CreateAlignedLoad(Ty: OtherTy, Ptr: SrcPtr, Align: SrcAlign,
3968 isVolatile: II.isVolatile(), Name: "copyload");
3969 Load->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3970 LLVMContext::MD_access_group});
3971 if (AATags)
3972 Load->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3973 AccessTy: Load->getType(), DL));
3974 Src = Load;
3975 }
3976
3977 if (VecTy && !IsWholeAlloca && IsDest) {
3978 Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
3979 Name: "oldload");
3980 Src = insertVector(IRB, Old, V: Src, BeginIndex, Name: "vec");
3981 } else if (IntTy && !IsWholeAlloca && IsDest) {
3982 Value *Old = IRB.CreateAlignedLoad(Ty: NewAllocaTy, Ptr: &NewAI, Align: NewAI.getAlign(),
3983 Name: "oldload");
3984 Old = IRB.CreateBitPreservingCastChain(DL, V: Old, NewTy: IntTy);
3985 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3986 Src = insertInteger(DL, IRB, Old, V: Src, Offset, Name: "insert");
3987 Src = IRB.CreateBitPreservingCastChain(DL, V: Src, NewTy: NewAllocaTy);
3988 }
3989
3990 StoreInst *Store = cast<StoreInst>(
3991 Val: IRB.CreateAlignedStore(Val: Src, Ptr: DstPtr, Align: DstAlign, isVolatile: II.isVolatile()));
3992 Store->copyMetadata(SrcInst: II, WL: {LLVMContext::MD_mem_parallel_loop_access,
3993 LLVMContext::MD_access_group});
3994 if (AATags)
3995 Store->setAAMetadata(AATags.adjustForAccess(Offset: NewBeginOffset - BeginOffset,
3996 AccessTy: Src->getType(), DL));
3997
3998 APInt Offset(DL.getIndexTypeSizeInBits(Ty: DstPtr->getType()), 0);
3999 if (IsDest) {
4000
4001 migrateDebugInfo(OldAlloca: &OldAI, IsSplit, OldAllocaOffsetInBits: NewBeginOffset * 8, SliceSizeInBits: SliceSize * 8, OldInst: &II,
4002 Inst: Store, Dest: DstPtr, Value: Src, DL);
4003 } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
4004 Val: DstPtr->stripAndAccumulateConstantOffsets(
4005 DL, Offset, /*AllowNonInbounds*/ true))) {
4006 migrateDebugInfo(OldAlloca: Base, IsSplit, OldAllocaOffsetInBits: Offset.getZExtValue() * 8, SliceSizeInBits: SliceSize * 8,
4007 OldInst: &II, Inst: Store, Dest: DstPtr, Value: Src, DL);
4008 }
4009
4010 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4011 return !II.isVolatile();
4012 }
4013
4014 bool visitIntrinsicInst(IntrinsicInst &II) {
4015 assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
4016 "Unexpected intrinsic!");
4017 LLVM_DEBUG(dbgs() << " original: " << II << "\n");
4018
4019 // Record this instruction for deletion.
4020 Pass.DeadInsts.push_back(Elt: &II);
4021
4022 if (II.isDroppable()) {
4023 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
4024 // TODO For now we forget assumed information, this can be improved.
4025 OldPtr->dropDroppableUsesIn(Usr&: II);
4026 return true;
4027 }
4028
4029 assert(II.getArgOperand(0) == OldPtr);
4030 Type *PointerTy = IRB.getPtrTy(AddrSpace: OldPtr->getType()->getPointerAddressSpace());
4031 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
4032 Value *New;
4033 if (II.getIntrinsicID() == Intrinsic::lifetime_start)
4034 New = IRB.CreateLifetimeStart(Ptr);
4035 else
4036 New = IRB.CreateLifetimeEnd(Ptr);
4037
4038 (void)New;
4039 LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
4040
4041 return true;
4042 }
4043
4044 void fixLoadStoreAlign(Instruction &Root) {
4045 // This algorithm implements the same visitor loop as
4046 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
4047 // or store found.
4048 SmallPtrSet<Instruction *, 4> Visited;
4049 SmallVector<Instruction *, 4> Uses;
4050 Visited.insert(Ptr: &Root);
4051 Uses.push_back(Elt: &Root);
4052 do {
4053 Instruction *I = Uses.pop_back_val();
4054
4055 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
4056 LI->setAlignment(std::min(a: LI->getAlign(), b: getSliceAlign()));
4057 continue;
4058 }
4059 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
4060 SI->setAlignment(std::min(a: SI->getAlign(), b: getSliceAlign()));
4061 continue;
4062 }
4063
4064 assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
4065 isa<PHINode>(I) || isa<SelectInst>(I) ||
4066 isa<GetElementPtrInst>(I));
4067 for (User *U : I->users())
4068 if (Visited.insert(Ptr: cast<Instruction>(Val: U)).second)
4069 Uses.push_back(Elt: cast<Instruction>(Val: U));
4070 } while (!Uses.empty());
4071 }
4072
4073 bool visitPHINode(PHINode &PN) {
4074 LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
4075 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
4076 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
4077
4078 // We would like to compute a new pointer in only one place, but have it be
4079 // as local as possible to the PHI. To do that, we re-use the location of
4080 // the old pointer, which necessarily must be in the right position to
4081 // dominate the PHI.
4082 IRBuilderBase::InsertPointGuard Guard(IRB);
4083 if (isa<PHINode>(Val: OldPtr))
4084 IRB.SetInsertPoint(TheBB: OldPtr->getParent(),
4085 IP: OldPtr->getParent()->getFirstInsertionPt());
4086 else
4087 IRB.SetInsertPoint(OldPtr);
4088 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
4089
4090 Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
4091 // Replace the operands which were using the old pointer.
4092 std::replace(first: PN.op_begin(), last: PN.op_end(), old_value: cast<Value>(Val: OldPtr), new_value: NewPtr);
4093
4094 LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
4095 deleteIfTriviallyDead(V: OldPtr);
4096
4097 // Fix the alignment of any loads or stores using this PHI node.
4098 fixLoadStoreAlign(Root&: PN);
4099
4100 // PHIs can't be promoted on their own, but often can be speculated. We
4101 // check the speculation outside of the rewriter so that we see the
4102 // fully-rewritten alloca.
4103 PHIUsers.insert(X: &PN);
4104 return true;
4105 }
4106
4107 bool visitSelectInst(SelectInst &SI) {
4108 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4109 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
4110 "Pointer isn't an operand!");
4111 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
4112 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
4113
4114 Value *NewPtr = getNewAllocaSlicePtr(IRB, PointerTy: OldPtr->getType());
4115 // Replace the operands which were using the old pointer.
4116 if (SI.getOperand(i_nocapture: 1) == OldPtr)
4117 SI.setOperand(i_nocapture: 1, Val_nocapture: NewPtr);
4118 if (SI.getOperand(i_nocapture: 2) == OldPtr)
4119 SI.setOperand(i_nocapture: 2, Val_nocapture: NewPtr);
4120
4121 LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
4122 deleteIfTriviallyDead(V: OldPtr);
4123
4124 // Fix the alignment of any loads or stores using this select.
4125 fixLoadStoreAlign(Root&: SI);
4126
4127 // Selects can't be promoted on their own, but often can be speculated. We
4128 // check the speculation outside of the rewriter so that we see the
4129 // fully-rewritten alloca.
4130 SelectUsers.insert(X: &SI);
4131 return true;
4132 }
4133};
4134
4135/// Visitor to rewrite aggregate loads and stores as scalar.
4136///
4137/// This pass aggressively rewrites all aggregate loads and stores on
4138/// a particular pointer (or any pointer derived from it which we can identify)
4139/// with scalar loads and stores.
4140class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
4141 // Befriend the base class so it can delegate to private visit methods.
4142 friend class InstVisitor<AggLoadStoreRewriter, bool>;
4143
4144 /// Queue of pointer uses to analyze and potentially rewrite.
4145 SmallVector<Use *, 8> Queue;
4146
4147 /// Set to prevent us from cycling with phi nodes and loops.
4148 SmallPtrSet<User *, 8> Visited;
4149
4150 /// The current pointer use being rewritten. This is used to dig up the used
4151 /// value (as opposed to the user).
4152 Use *U = nullptr;
4153
4154 /// Used to calculate offsets, and hence alignment, of subobjects.
4155 const DataLayout &DL;
4156
4157 IRBuilderTy &IRB;
4158
4159public:
4160 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
4161 : DL(DL), IRB(IRB) {}
4162
4163 /// Rewrite loads and stores through a pointer and all pointers derived from
4164 /// it.
4165 bool rewrite(Instruction &I) {
4166 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
4167 enqueueUsers(I);
4168 bool Changed = false;
4169 while (!Queue.empty()) {
4170 U = Queue.pop_back_val();
4171 Changed |= visit(I: cast<Instruction>(Val: U->getUser()));
4172 }
4173 return Changed;
4174 }
4175
4176private:
4177 /// Enqueue all the users of the given instruction for further processing.
4178 /// This uses a set to de-duplicate users.
4179 void enqueueUsers(Instruction &I) {
4180 for (Use &U : I.uses())
4181 if (Visited.insert(Ptr: U.getUser()).second)
4182 Queue.push_back(Elt: &U);
4183 }
4184
4185 // Conservative default is to not rewrite anything.
4186 bool visitInstruction(Instruction &I) { return false; }
4187
4188 /// Generic recursive split emission class.
///
/// Derived classes are used via static_cast (see emitSplitOps) and must
/// provide emitFunc(Type *, Value *&, Align, const Twine &), which is invoked
/// once per single-value leaf of the aggregate.
4189 template <typename Derived> class OpSplitter {
4190 protected:
4191 /// The builder used to form new instructions.
4192 IRBuilderTy &IRB;
4193
4194 /// The indices to be used with insertvalue or extractvalue to select the
4195 /// appropriate value within the aggregate.
4196 SmallVector<unsigned, 4> Indices;
4197
4198 /// The indices to a GEP instruction which will move Ptr to the correct slot
4199 /// within the aggregate.
4200 SmallVector<Value *, 4> GEPIndices;
4201
4202 /// The base pointer of the original op, used as a base for GEPing the
4203 /// split operations.
4204 Value *Ptr;
4205
4206 /// The base pointee type being GEPed into.
4207 Type *BaseTy;
4208
4209 /// Known alignment of the base pointer.
4210 Align BaseAlign;
4211
4212 /// To calculate offset of each component so we can correctly deduce
4213 /// alignments.
4214 const DataLayout &DL;
4215
4216 /// Initialize the splitter with an insertion point, Ptr and start with a
4217 /// single zero GEP index.
///
/// Note that this repositions the shared builder: IRB's insertion point is
/// set to \p InsertionPoint.
4218 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4219 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
4220 : IRB(IRB), GEPIndices(1, IRB.getInt32(C: 0)), Ptr(Ptr), BaseTy(BaseTy),
4221 BaseAlign(BaseAlign), DL(DL) {
4222 IRB.SetInsertPoint(InsertionPoint);
4223 }
4224
4225 public:
4226 /// Generic recursive split emission routine.
4227 ///
4228 /// This method recursively splits an aggregate op (load or store) into
4229 /// scalar or vector ops. It splits recursively until it hits a single value
4230 /// and emits that single value operation via the template argument.
4231 ///
4232 /// The logic of this routine relies on GEPs and insertvalue and
4233 /// extractvalue all operating with the same fundamental index list, merely
4234 /// formatted differently (GEPs need actual values).
4235 ///
4236 /// \param Ty The type being split recursively into smaller ops.
4237 /// \param Agg The aggregate value being built up or stored, depending on
4238 /// whether this is splitting a load or a store respectively.
/// \param Name Name prefix for emitted values; ".<index>" is appended at
/// each level of recursion.
4239 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
// Leaf: derive the alignment from the base alignment and the offset of the
// current index path, then let the derived class emit the operation.
// NOTE(review): the offset is held in an `unsigned`; confirm that any
// narrowing from the helper's return type is intended.
4240 if (Ty->isSingleValueType()) {
4241 unsigned Offset = DL.getIndexedOffsetInType(ElemTy: BaseTy, Indices: GEPIndices);
4242 return static_cast<Derived *>(this)->emitFunc(
4243 Ty, Agg, commonAlignment(A: BaseAlign, Offset), Name);
4244 }
4245
// Arrays: recurse into every element. Indices and GEPIndices are pushed and
// popped in lockstep so both describe the same element.
4246 if (ArrayType *ATy = dyn_cast<ArrayType>(Val: Ty)) {
4247 unsigned OldSize = Indices.size();
4248 (void)OldSize;
4249 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
4250 ++Idx) {
4251 assert(Indices.size() == OldSize && "Did not return to the old size");
4252 Indices.push_back(Elt: Idx);
4253 GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
4254 emitSplitOps(Ty: ATy->getElementType(), Agg, Name: Name + "." + Twine(Idx));
4255 GEPIndices.pop_back();
4256 Indices.pop_back();
4257 }
4258 return;
4259 }
4260
// Structs: same scheme as arrays, but each field has its own type.
4261 if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
4262 unsigned OldSize = Indices.size();
4263 (void)OldSize;
4264 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
4265 ++Idx) {
4266 assert(Indices.size() == OldSize && "Did not return to the old size");
4267 Indices.push_back(Elt: Idx);
4268 GEPIndices.push_back(Elt: IRB.getInt32(C: Idx));
4269 emitSplitOps(Ty: STy->getElementType(N: Idx), Agg, Name: Name + "." + Twine(Idx));
4270 GEPIndices.pop_back();
4271 Indices.pop_back();
4272 }
4273 return;
4274 }
4275
4276 llvm_unreachable("Only arrays and structs are aggregate loadable types");
4277 }
4278 };
4279
4280 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
4281 AAMDNodes AATags;
4282 // A vector to hold the split components that we want to emit
4283 // separate fake uses for.
4284 SmallVector<Value *, 4> Components;
4285 // A vector to hold all the fake uses of the struct that we are splitting.
4286 // Usually there should only be one, but we are handling the general case.
4287 SmallVector<Instruction *, 1> FakeUses;
4288
4289 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4290 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
4291 IRBuilderTy &IRB)
4292 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
4293 IRB),
4294 AATags(AATags) {}
4295
4296 /// Emit a leaf load of a single value. This is called at the leaves of the
4297 /// recursive emission to actually load values.
4298 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4299 assert(Ty->isSingleValueType());
4300 // Load the single value and insert it using the indices.
4301 Value *GEP =
4302 IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
4303 LoadInst *Load =
4304 IRB.CreateAlignedLoad(Ty, Ptr: GEP, Align: Alignment, Name: Name + ".load");
4305
4306 APInt Offset(
4307 DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), 0);
4308 if (AATags &&
4309 GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset))
4310 Load->setAAMetadata(
4311 AATags.adjustForAccess(Offset: Offset.getZExtValue(), AccessTy: Load->getType(), DL));
4312 // Record the load so we can generate a fake use for this aggregate
4313 // component.
4314 Components.push_back(Elt: Load);
4315
4316 Agg = IRB.CreateInsertValue(Agg, Val: Load, Idxs: Indices, Name: Name + ".insert");
4317 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
4318 }
4319
4320 // Stash the fake uses that use the value generated by this instruction.
4321 void recordFakeUses(LoadInst &LI) {
4322 for (Use &U : LI.uses())
4323 if (auto *II = dyn_cast<IntrinsicInst>(Val: U.getUser()))
4324 if (II->getIntrinsicID() == Intrinsic::fake_use)
4325 FakeUses.push_back(Elt: II);
4326 }
4327
4328 // Replace all fake uses of the aggregate with a series of fake uses, one
4329 // for each split component.
4330 void emitFakeUses() {
4331 for (Instruction *I : FakeUses) {
4332 IRB.SetInsertPoint(I);
4333 for (auto *V : Components)
4334 IRB.CreateIntrinsic(ID: Intrinsic::fake_use, Args: {V});
4335 I->eraseFromParent();
4336 }
4337 }
4338 };
4339
4340 bool visitLoadInst(LoadInst &LI) {
4341 assert(LI.getPointerOperand() == *U);
4342 if (!LI.isSimple() || LI.getType()->isSingleValueType())
4343 return false;
4344
4345 // We have an aggregate being loaded, split it apart.
4346 LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
4347 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
4348 getAdjustedAlignment(I: &LI, Offset: 0), DL, IRB);
4349 Splitter.recordFakeUses(LI);
4350 Value *V = PoisonValue::get(T: LI.getType());
4351 Splitter.emitSplitOps(Ty: LI.getType(), Agg&: V, Name: LI.getName() + ".fca");
4352 Splitter.emitFakeUses();
4353 Visited.erase(Ptr: &LI);
4354 LI.replaceAllUsesWith(V);
4355 LI.eraseFromParent();
4356 return true;
4357 }
4358
/// Splitter that turns one aggregate store into a store per single-value
/// leaf, extracting each leaf from the stored aggregate with extractvalue.
4359 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
4360 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
4361 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
4362 const DataLayout &DL, IRBuilderTy &IRB)
4363 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
4364 DL, IRB),
4365 AATags(AATags), AggStore(AggStore) {}
// Alias-analysis metadata of the aggregate store; applied to each leaf store
// after adjusting for the leaf's offset and type.
4366 AAMDNodes AATags;
// The aggregate store being split; used to locate the underlying alloca and
// as the old instruction when migrating debug info.
4367 StoreInst *AggStore;
4368 /// Emit a leaf store of a single value. This is called at the leaves of the
4369 /// recursive emission to actually produce stores.
4370 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
4371 assert(Ty->isSingleValueType());
4372 // Extract the single value and store it using the indices.
4373 //
4374 // The gep and extractvalue values are factored out of the CreateStore
4375 // call to make the output independent of the argument evaluation order.
4376 Value *ExtractValue =
4377 IRB.CreateExtractValue(Agg, Idxs: Indices, Name: Name + ".extract");
4378 Value *InBoundsGEP =
4379 IRB.CreateInBoundsGEP(Ty: BaseTy, Ptr, IdxList: GEPIndices, Name: Name + ".gep");
4380 StoreInst *Store =
4381 IRB.CreateAlignedStore(Val: ExtractValue, Ptr: InBoundsGEP, Align: Alignment);
4382
// Offset of this leaf from Ptr; used both for the AA metadata and for the
// debug-info migration below.
// NOTE(review): unlike LoadOpSplitter::emitFunc, the result of
// accumulateConstantOffset is not checked here. Presumably it cannot fail
// because every GEP index is created with getInt32 -- confirm.
4383 APInt Offset(
4384 DL.getIndexSizeInBits(AS: Ptr->getType()->getPointerAddressSpace()), 0);
4385 GEPOperator::accumulateConstantOffset(SourceType: BaseTy, Index: GEPIndices, DL, Offset);
4386 if (AATags) {
4387 Store->setAAMetadata(AATags.adjustForAccess(
4388 Offset: Offset.getZExtValue(), AccessTy: ExtractValue->getType(), DL));
4389 }
4390
4391 // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4392 // If we cannot (because there's an intervening non-const or unbounded
4393 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4394 // this instruction.
4395 Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4396 if (auto *OldAI = dyn_cast<AllocaInst>(Val: Base)) {
4397 uint64_t SizeInBits =
4398 DL.getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
4399 migrateDebugInfo(OldAlloca: OldAI, /*IsSplit*/ true, OldAllocaOffsetInBits: Offset.getZExtValue() * 8,
4400 SliceSizeInBits: SizeInBits, OldInst: AggStore, Inst: Store,
4401 Dest: Store->getPointerOperand(), Value: Store->getValueOperand(),
4402 DL);
4403 } else {
4404 assert(at::getDVRAssignmentMarkers(Store).empty() &&
4405 "AT: unexpected debug.assign linked to store through "
4406 "unbounded GEP");
4407 }
4408 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
4409 }
4410 };
4411
4412 bool visitStoreInst(StoreInst &SI) {
4413 if (!SI.isSimple() || SI.getPointerOperand() != *U)
4414 return false;
4415 Value *V = SI.getValueOperand();
4416 if (V->getType()->isSingleValueType())
4417 return false;
4418
4419 // We have an aggregate being stored, split it apart.
4420 LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
4421 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4422 getAdjustedAlignment(I: &SI, Offset: 0), DL, IRB);
4423 Splitter.emitSplitOps(Ty: V->getType(), Agg&: V, Name: V->getName() + ".fca");
4424 Visited.erase(Ptr: &SI);
4425 // The stores replacing SI each have markers describing fragments of the
4426 // assignment so delete the assignment markers linked to SI.
4427 at::deleteAssignmentMarkers(Inst: &SI);
4428 SI.eraseFromParent();
4429 return true;
4430 }
4431
4432 bool visitBitCastInst(BitCastInst &BC) {
4433 enqueueUsers(I&: BC);
4434 return false;
4435 }
4436
4437 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4438 enqueueUsers(I&: ASC);
4439 return false;
4440 }
4441
4442 // Unfold gep (select cond, ptr1, ptr2), idx
4443 // => select cond, gep(ptr1, idx), gep(ptr2, idx)
4444 // and gep ptr, (select cond, idx1, idx2)
4445 // => select cond, gep(ptr, idx1), gep(ptr, idx2)
4446 // We also allow for i1 zext indices, which are equivalent to selects.
//
// On success GEPI is erased, its uses are replaced by the new select, and the
// users of that select are queued. Returns true if the rewrite happened and
// false if the GEP does not match the pattern (in which case nothing is
// changed).
4447 bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4448 // Check whether the GEP has exactly one select operand and all indices
4449 // will become constant after the transform.
// Sel starts as the pointer operand if that is a select (null otherwise);
// finding a second select/zext among the indices rejects the GEP.
4450 Instruction *Sel = dyn_cast<SelectInst>(Val: GEPI.getPointerOperand());
4451 for (Value *Op : GEPI.indices()) {
4452 if (auto *SI = dyn_cast<SelectInst>(Val: Op)) {
4453 if (Sel)
4454 return false;
4455
4456 Sel = SI;
4457 if (!isa<ConstantInt>(Val: SI->getTrueValue()) ||
4458 !isa<ConstantInt>(Val: SI->getFalseValue()))
4459 return false;
4460 continue;
4461 }
4462 if (auto *ZI = dyn_cast<ZExtInst>(Val: Op)) {
4463 if (Sel)
4464 return false;
4465 Sel = ZI;
// Only a zext from i1 is accepted as a stand-in for a select.
4466 if (!ZI->getSrcTy()->isIntegerTy(BitWidth: 1))
4467 return false;
4468 continue;
4469 }
4470
4471 if (!isa<ConstantInt>(Val: Op))
4472 return false;
4473 }
4474
4475 if (!Sel)
4476 return false;
4477
4478 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n";
4479 dbgs() << " original: " << *Sel << "\n";
4480 dbgs() << " " << GEPI << "\n";);
4481
// Build GEPI's operand list with every occurrence of Sel replaced by SelOp.
4482 auto GetNewOps = [&](Value *SelOp) {
4483 SmallVector<Value *> NewOps;
4484 for (Value *Op : GEPI.operands())
4485 if (Op == Sel)
4486 NewOps.push_back(Elt: SelOp);
4487 else
4488 NewOps.push_back(Elt: Op);
4489 return NewOps;
4490 };
4491
// Decompose Sel into condition and the two arms. For the zext case the
// condition is the zext's operand and the arms are the constants 1 and 0.
4492 Value *Cond, *True, *False;
4493 Instruction *MDFrom = nullptr;
4494 if (auto *SI = dyn_cast<SelectInst>(Val: Sel)) {
4495 Cond = SI->getCondition();
4496 True = SI->getTrueValue();
4497 False = SI->getFalseValue();
4498 if (!ProfcheckDisableMetadataFixes)
4499 MDFrom = SI;
4500 } else {
4501 Cond = Sel->getOperand(i: 0);
4502 True = ConstantInt::get(Ty: Sel->getType(), V: 1);
4503 False = ConstantInt::get(Ty: Sel->getType(), V: 0);
4504 }
4505 SmallVector<Value *> TrueOps = GetNewOps(True);
4506 SmallVector<Value *> FalseOps = GetNewOps(False);
4507
// The two new GEPs and the select are emitted right before the original
// GEP and keep its no-wrap flags and source element type.
4508 IRB.SetInsertPoint(&GEPI);
4509 GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4510
4511 Type *Ty = GEPI.getSourceElementType();
4512 Value *NTrue = IRB.CreateGEP(Ty, Ptr: TrueOps[0], IdxList: ArrayRef(TrueOps).drop_front(),
4513 Name: True->getName() + ".sroa.gep", NW);
4514
4515 Value *NFalse =
4516 IRB.CreateGEP(Ty, Ptr: FalseOps[0], IdxList: ArrayRef(FalseOps).drop_front(),
4517 Name: False->getName() + ".sroa.gep", NW);
4518
// When MDFrom is set it is handed to CreateSelect as the metadata source;
// otherwise the select is created via CreateSelectWithUnknownProfile.
4519 Value *NSel = MDFrom
4520 ? IRB.CreateSelect(C: Cond, True: NTrue, False: NFalse,
4521 Name: Sel->getName() + ".sroa.sel", MDFrom)
4522 : IRB.CreateSelectWithUnknownProfile(
4523 C: Cond, True: NTrue, False: NFalse, DEBUG_TYPE,
4524 Name: Sel->getName() + ".sroa.sel");
// Swap the GEP for the select in the visited set, then continue the walk
// through the select's users.
4525 Visited.erase(Ptr: &GEPI);
4526 GEPI.replaceAllUsesWith(V: NSel);
4527 GEPI.eraseFromParent();
4528 Instruction *NSelI = cast<Instruction>(Val: NSel);
4529 Visited.insert(Ptr: NSelI);
4530 enqueueUsers(I&: *NSelI);
4531
4532 LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n";
4533 dbgs() << " " << *NFalse << "\n";
4534 dbgs() << " " << *NSel << "\n";);
4535
4536 return true;
4537 }
4538
4539 // Unfold gep (phi ptr1, ptr2), idx
4540 // => phi ((gep ptr1, idx), (gep ptr2, idx))
4541 // and gep ptr, (phi idx1, idx2)
4542 // => phi ((gep ptr, idx1), (gep ptr, idx2))
//
// On success GEPI is erased, its uses are replaced by the new phi, and the
// users of that phi are queued. Returns true if the rewrite happened and
// false if the GEP does not match the pattern (in which case nothing is
// changed).
4543 bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4544 // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4545 // operand (looking through the phi if it is the phi we want to unfold) is
4546 // an instruction besides a static alloca.
4547 PHINode *Phi = dyn_cast<PHINode>(Val: GEPI.getPointerOperand());
// Non-instructions are always acceptable; among instructions only static
// allocas are.
4548 auto IsInvalidPointerOperand = [](Value *V) {
4549 if (!isa<Instruction>(Val: V))
4550 return false;
4551 if (auto *AI = dyn_cast<AllocaInst>(Val: V))
4552 return !AI->isStaticAlloca();
4553 return true;
4554 };
4555 if (Phi) {
4556 if (any_of(Range: Phi->operands(), P: IsInvalidPointerOperand))
4557 return false;
4558 } else {
4559 if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4560 return false;
4561 }
4562 // Check whether the GEP has exactly one phi operand (including the pointer
4563 // operand) and all indices will become constant after the transform.
4564 for (Value *Op : GEPI.indices()) {
// Note: despite its name, SI is a PHINode here.
4565 if (auto *SI = dyn_cast<PHINode>(Val: Op)) {
4566 if (Phi)
4567 return false;
4568
4569 Phi = SI;
4570 if (!all_of(Range: Phi->incoming_values(),
4571 P: [](Value *V) { return isa<ConstantInt>(Val: V); }))
4572 return false;
4573 continue;
4574 }
4575
4576 if (!isa<ConstantInt>(Val: Op))
4577 return false;
4578 }
4579
4580 if (!Phi)
4581 return false;
4582
4583 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n";
4584 dbgs() << " original: " << *Phi << "\n";
4585 dbgs() << " " << GEPI << "\n";);
4586
// Build GEPI's operand list with every occurrence of Phi replaced by PhiOp.
4587 auto GetNewOps = [&](Value *PhiOp) {
4588 SmallVector<Value *> NewOps;
4589 for (Value *Op : GEPI.operands())
4590 if (Op == Phi)
4591 NewOps.push_back(Elt: PhiOp);
4592 else
4593 NewOps.push_back(Elt: Op);
4594 return NewOps;
4595 };
4596
// The replacement phi is created at the position of the original phi and has
// the GEP's result type.
4597 IRB.SetInsertPoint(Phi);
4598 PHINode *NewPhi = IRB.CreatePHI(Ty: GEPI.getType(), NumReservedValues: Phi->getNumIncomingValues(),
4599 Name: Phi->getName() + ".sroa.phi");
4600
4601 Type *SourceTy = GEPI.getSourceElementType();
4602 // We only handle arguments, constants, and static allocas here, so we can
4603 // insert GEPs at the end of the entry block.
4604 IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4605 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4606 Value *Op = Phi->getIncomingValue(i: I);
4607 BasicBlock *BB = Phi->getIncomingBlock(i: I);
4608 Value *NewGEP;
// If the new phi already has an incoming value for this block, reuse it
// instead of emitting another GEP for the same predecessor.
4609 if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4610 NewGEP = NewPhi->getIncomingValue(i: NI);
4611 } else {
4612 SmallVector<Value *> NewOps = GetNewOps(Op);
4613 NewGEP =
4614 IRB.CreateGEP(Ty: SourceTy, Ptr: NewOps[0], IdxList: ArrayRef(NewOps).drop_front(),
4615 Name: Phi->getName() + ".sroa.gep", NW: GEPI.getNoWrapFlags());
4616 }
4617 NewPhi->addIncoming(V: NewGEP, BB);
4618 }
4619
// Swap the GEP for the new phi in the visited set, then continue the walk
// through the phi's users.
4620 Visited.erase(Ptr: &GEPI);
4621 GEPI.replaceAllUsesWith(V: NewPhi);
4622 GEPI.eraseFromParent();
4623 Visited.insert(Ptr: NewPhi);
4624 enqueueUsers(I&: *NewPhi);
4625
4626 LLVM_DEBUG(dbgs() << " to: ";
4627 for (Value *In
4628 : NewPhi->incoming_values()) dbgs()
4629 << "\n " << *In;
4630 dbgs() << "\n " << *NewPhi << '\n');
4631
4632 return true;
4633 }
4634
4635 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4636 if (unfoldGEPSelect(GEPI))
4637 return true;
4638
4639 if (unfoldGEPPhi(GEPI))
4640 return true;
4641
4642 enqueueUsers(I&: GEPI);
4643 return false;
4644 }
4645
4646 bool visitPHINode(PHINode &PN) {
4647 enqueueUsers(I&: PN);
4648 return false;
4649 }
4650
4651 bool visitSelectInst(SelectInst &SI) {
4652 enqueueUsers(I&: SI);
4653 return false;
4654 }
4655};
4656
4657} // end anonymous namespace
4658
4659/// Strip aggregate type wrapping.
4660///
4661/// This removes no-op aggregate types wrapping an underlying type. It will
4662/// strip as many layers of types as it can without changing either the type
4663/// size or the allocated size.
4664static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4665 if (Ty->isSingleValueType())
4666 return Ty;
4667
4668 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4669 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4670
4671 Type *InnerTy;
4672 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Val: Ty)) {
4673 InnerTy = ArrTy->getElementType();
4674 } else if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
4675 const StructLayout *SL = DL.getStructLayout(Ty: STy);
4676 unsigned Index = SL->getElementContainingOffset(FixedOffset: 0);
4677 InnerTy = STy->getElementType(N: Index);
4678 } else {
4679 return Ty;
4680 }
4681
4682 if (AllocSize > DL.getTypeAllocSize(Ty: InnerTy).getFixedValue() ||
4683 TypeSize > DL.getTypeSizeInBits(Ty: InnerTy).getFixedValue())
4684 return Ty;
4685
4686 return stripAggregateTypeWrapping(DL, Ty: InnerTy);
4687}
4688
4689/// Try to find a partition of the aggregate type passed in for a given
4690/// offset and size.
4691///
4692/// This recurses through the aggregate type and tries to compute a subtype
4693/// based on the offset and size. When the offset and size span a sub-section
4694/// of an array, it will even compute a new array type for that sub-section,
4695/// and the same for structs.
4696///
4697/// Note that this routine is very strict and tries to find a partition of the
4698/// type which produces the *exact* right offset and size. It is not forgiving
4699/// when the size or offset cause either end of type-based partition to be off.
4700/// Also, this is a best-effort routine. It is reasonable to give up and not
4701/// return a type if necessary.
4702static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4703 uint64_t Size) {
4704 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4705 return stripAggregateTypeWrapping(DL, Ty);
4706 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4707 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4708 return nullptr;
4709
4710 if (isa<ArrayType>(Val: Ty) || isa<VectorType>(Val: Ty)) {
4711 Type *ElementTy;
4712 uint64_t TyNumElements;
4713 if (auto *AT = dyn_cast<ArrayType>(Val: Ty)) {
4714 ElementTy = AT->getElementType();
4715 TyNumElements = AT->getNumElements();
4716 } else {
4717 // FIXME: This isn't right for vectors with non-byte-sized or
4718 // non-power-of-two sized elements.
4719 auto *VT = cast<FixedVectorType>(Val: Ty);
4720 ElementTy = VT->getElementType();
4721 TyNumElements = VT->getNumElements();
4722 }
4723 uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
4724 uint64_t NumSkippedElements = Offset / ElementSize;
4725 if (NumSkippedElements >= TyNumElements)
4726 return nullptr;
4727 Offset -= NumSkippedElements * ElementSize;
4728
4729 // First check if we need to recurse.
4730 if (Offset > 0 || Size < ElementSize) {
4731 // Bail if the partition ends in a different array element.
4732 if ((Offset + Size) > ElementSize)
4733 return nullptr;
4734 // Recurse through the element type trying to peel off offset bytes.
4735 return getTypePartition(DL, Ty: ElementTy, Offset, Size);
4736 }
4737 assert(Offset == 0);
4738
4739 if (Size == ElementSize)
4740 return stripAggregateTypeWrapping(DL, Ty: ElementTy);
4741 assert(Size > ElementSize);
4742 uint64_t NumElements = Size / ElementSize;
4743 if (NumElements * ElementSize != Size)
4744 return nullptr;
4745 return ArrayType::get(ElementType: ElementTy, NumElements);
4746 }
4747
4748 StructType *STy = dyn_cast<StructType>(Val: Ty);
4749 if (!STy)
4750 return nullptr;
4751
4752 const StructLayout *SL = DL.getStructLayout(Ty: STy);
4753
4754 if (SL->getSizeInBits().isScalable())
4755 return nullptr;
4756
4757 if (Offset >= SL->getSizeInBytes())
4758 return nullptr;
4759 uint64_t EndOffset = Offset + Size;
4760 if (EndOffset > SL->getSizeInBytes())
4761 return nullptr;
4762
4763 unsigned Index = SL->getElementContainingOffset(FixedOffset: Offset);
4764 Offset -= SL->getElementOffset(Idx: Index);
4765
4766 Type *ElementTy = STy->getElementType(N: Index);
4767 uint64_t ElementSize = DL.getTypeAllocSize(Ty: ElementTy).getFixedValue();
4768 if (Offset >= ElementSize)
4769 return nullptr; // The offset points into alignment padding.
4770
4771 // See if any partition must be contained by the element.
4772 if (Offset > 0 || Size < ElementSize) {
4773 if ((Offset + Size) > ElementSize)
4774 return nullptr;
4775 return getTypePartition(DL, Ty: ElementTy, Offset, Size);
4776 }
4777 assert(Offset == 0);
4778
4779 if (Size == ElementSize)
4780 return stripAggregateTypeWrapping(DL, Ty: ElementTy);
4781
4782 StructType::element_iterator EI = STy->element_begin() + Index,
4783 EE = STy->element_end();
4784 if (EndOffset < SL->getSizeInBytes()) {
4785 unsigned EndIndex = SL->getElementContainingOffset(FixedOffset: EndOffset);
4786 if (Index == EndIndex)
4787 return nullptr; // Within a single element and its padding.
4788
4789 // Don't try to form "natural" types if the elements don't line up with the
4790 // expected size.
4791 // FIXME: We could potentially recurse down through the last element in the
4792 // sub-struct to find a natural end point.
4793 if (SL->getElementOffset(Idx: EndIndex) != EndOffset)
4794 return nullptr;
4795
4796 assert(Index < EndIndex);
4797 EE = STy->element_begin() + EndIndex;
4798 }
4799
4800 // Try to build up a sub-structure.
4801 StructType *SubTy =
4802 StructType::get(Context&: STy->getContext(), Elements: ArrayRef(EI, EE), isPacked: STy->isPacked());
4803 const StructLayout *SubSL = DL.getStructLayout(Ty: SubTy);
4804 if (Size != SubSL->getSizeInBytes())
4805 return nullptr; // The sub-struct doesn't have quite the size needed.
4806
4807 return SubTy;
4808}
4809
4810/// Pre-split loads and stores to simplify rewriting.
4811///
4812/// We want to break up the splittable load+store pairs as much as
4813/// possible. This is important to do as a preprocessing step, as once we
4814/// start rewriting the accesses to partitions of the alloca we lose the
4815/// necessary information to correctly split apart paired loads and stores
4816/// which both point into this alloca. The case to consider is something like
4817/// the following:
4818///
4819/// %a = alloca [12 x i8]
4820/// %gep1 = getelementptr i8, ptr %a, i32 0
4821/// %gep2 = getelementptr i8, ptr %a, i32 4
4822/// %gep3 = getelementptr i8, ptr %a, i32 8
4823/// store float 0.0, ptr %gep1
4824/// store float 1.0, ptr %gep2
4825/// %v = load i64, ptr %gep1
4826/// store i64 %v, ptr %gep2
4827/// %f1 = load float, ptr %gep2
4828/// %f2 = load float, ptr %gep3
4829///
4830/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4831/// promote everything so we recover the 2 SSA values that should have been
4832/// there all along.
4833///
4834/// \returns true if any changes are made.
4835bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4836 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4837
4838 // Track the loads and stores which are candidates for pre-splitting here, in
4839 // the order they first appear during the partition scan. These give stable
4840 // iteration order and a basis for tracking which loads and stores we
4841 // actually split.
4842 SmallVector<LoadInst *, 4> Loads;
4843 SmallVector<StoreInst *, 4> Stores;
4844
4845 // We need to accumulate the splits required of each load or store where we
4846 // can find them via a direct lookup. This is important to cross-check loads
4847 // and stores against each other. We also track the slice so that we can kill
4848 // all the slices that end up split.
4849 struct SplitOffsets {
4850 Slice *S;
4851 std::vector<uint64_t> Splits;
4852 };
4853 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4854
4855 // Track loads out of this alloca which cannot, for any reason, be pre-split.
4856 // This is important as we also cannot pre-split stores of those loads!
4857 // FIXME: This is all pretty gross. It means that we can be more aggressive
4858 // in pre-splitting when the load feeding the store happens to come from
4859 // a separate alloca. Put another way, the effectiveness of SROA would be
4860 // decreased by a frontend which just concatenated all of its local allocas
4861 // into one big flat alloca. But defeating such patterns is exactly the job
4862 // SROA is tasked with! Sadly, to not have this discrepancy we would have
4863 // change store pre-splitting to actually force pre-splitting of the load
4864 // that feeds it *and all stores*. That makes pre-splitting much harder, but
4865 // maybe it would make it more principled?
4866 SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4867
4868 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
4869 for (auto &P : AS.partitions()) {
4870 for (Slice &S : P) {
4871 Instruction *I = cast<Instruction>(Val: S.getUse()->getUser());
4872 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4873 // If this is a load we have to track that it can't participate in any
4874 // pre-splitting. If this is a store of a load we have to track that
4875 // that load also can't participate in any pre-splitting.
4876 if (auto *LI = dyn_cast<LoadInst>(Val: I))
4877 UnsplittableLoads.insert(Ptr: LI);
4878 else if (auto *SI = dyn_cast<StoreInst>(Val: I))
4879 if (auto *LI = dyn_cast<LoadInst>(Val: SI->getValueOperand()))
4880 UnsplittableLoads.insert(Ptr: LI);
4881 continue;
4882 }
4883 assert(P.endOffset() > S.beginOffset() &&
4884 "Empty or backwards partition!");
4885
4886 // Determine if this is a pre-splittable slice.
4887 if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
4888 assert(!LI->isVolatile() && "Cannot split volatile loads!");
4889
4890 // The load must be used exclusively to store into other pointers for
4891 // us to be able to arbitrarily pre-split it. The stores must also be
4892 // simple to avoid changing semantics.
4893 auto IsLoadSimplyStored = [](LoadInst *LI) {
4894 for (User *LU : LI->users()) {
4895 auto *SI = dyn_cast<StoreInst>(Val: LU);
4896 if (!SI || !SI->isSimple())
4897 return false;
4898 }
4899 return true;
4900 };
4901 if (!IsLoadSimplyStored(LI)) {
4902 UnsplittableLoads.insert(Ptr: LI);
4903 continue;
4904 }
4905
4906 Loads.push_back(Elt: LI);
4907 } else if (auto *SI = dyn_cast<StoreInst>(Val: I)) {
4908 if (S.getUse() != &SI->getOperandUse(i: SI->getPointerOperandIndex()))
4909 // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4910 continue;
4911 auto *StoredLoad = dyn_cast<LoadInst>(Val: SI->getValueOperand());
4912 if (!StoredLoad || !StoredLoad->isSimple())
4913 continue;
4914 assert(!SI->isVolatile() && "Cannot split volatile stores!");
4915
4916 Stores.push_back(Elt: SI);
4917 } else {
4918 // Other uses cannot be pre-split.
4919 continue;
4920 }
4921
4922 // Record the initial split.
4923 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
4924 auto &Offsets = SplitOffsetsMap[I];
4925 assert(Offsets.Splits.empty() &&
4926 "Should not have splits the first time we see an instruction!");
4927 Offsets.S = &S;
4928 Offsets.Splits.push_back(x: P.endOffset() - S.beginOffset());
4929 }
4930
4931 // Now scan the already split slices, and add a split for any of them which
4932 // we're going to pre-split.
4933 for (Slice *S : P.splitSliceTails()) {
4934 auto SplitOffsetsMapI =
4935 SplitOffsetsMap.find(Val: cast<Instruction>(Val: S->getUse()->getUser()));
4936 if (SplitOffsetsMapI == SplitOffsetsMap.end())
4937 continue;
4938 auto &Offsets = SplitOffsetsMapI->second;
4939
4940 assert(Offsets.S == S && "Found a mismatched slice!");
4941 assert(!Offsets.Splits.empty() &&
4942 "Cannot have an empty set of splits on the second partition!");
4943 assert(Offsets.Splits.back() ==
4944 P.beginOffset() - Offsets.S->beginOffset() &&
4945 "Previous split does not end where this one begins!");
4946
4947 // Record each split. The last partition's end isn't needed as the size
4948 // of the slice dictates that.
4949 if (S->endOffset() > P.endOffset())
4950 Offsets.Splits.push_back(x: P.endOffset() - Offsets.S->beginOffset());
4951 }
4952 }
4953
4954 // We may have split loads where some of their stores are split stores. For
4955 // such loads and stores, we can only pre-split them if their splits exactly
4956 // match relative to their starting offset. We have to verify this prior to
4957 // any rewriting.
4958 llvm::erase_if(C&: Stores, P: [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4959 // Lookup the load we are storing in our map of split
4960 // offsets.
4961 auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4962 // If it was completely unsplittable, then we're done,
4963 // and this store can't be pre-split.
4964 if (UnsplittableLoads.count(Ptr: LI))
4965 return true;
4966
4967 auto LoadOffsetsI = SplitOffsetsMap.find(Val: LI);
4968 if (LoadOffsetsI == SplitOffsetsMap.end())
4969 return false; // Unrelated loads are definitely safe.
4970 auto &LoadOffsets = LoadOffsetsI->second;
4971
4972 // Now lookup the store's offsets.
4973 auto &StoreOffsets = SplitOffsetsMap[SI];
4974
4975 // If the relative offsets of each split in the load and
4976 // store match exactly, then we can split them and we
4977 // don't need to remove them here.
4978 if (LoadOffsets.Splits == StoreOffsets.Splits)
4979 return false;
4980
4981 LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
4982 << " " << *LI << "\n"
4983 << " " << *SI << "\n");
4984
4985 // We've found a store and load that we need to split
4986 // with mismatched relative splits. Just give up on them
4987 // and remove both instructions from our list of
4988 // candidates.
4989 UnsplittableLoads.insert(Ptr: LI);
4990 return true;
4991 });
4992 // Now we have to go *back* through all the stores, because a later store may
4993 // have caused an earlier store's load to become unsplittable and if it is
4994 // unsplittable for the later store, then we can't rely on it being split in
4995 // the earlier store either.
4996 llvm::erase_if(C&: Stores, P: [&UnsplittableLoads](StoreInst *SI) {
4997 auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
4998 return UnsplittableLoads.count(Ptr: LI);
4999 });
5000 // Once we've established all the loads that can't be split for some reason,
5001 // filter any that made it into our list out.
5002 llvm::erase_if(C&: Loads, P: [&UnsplittableLoads](LoadInst *LI) {
5003 return UnsplittableLoads.count(Ptr: LI);
5004 });
5005
5006 // If no loads or stores are left, there is no pre-splitting to be done for
5007 // this alloca.
5008 if (Loads.empty() && Stores.empty())
5009 return false;
5010
5011 // From here on, we can't fail and will be building new accesses, so rig up
5012 // an IR builder.
5013 IRBuilderTy IRB(&AI);
5014
5015 // Collect the new slices which we will merge into the alloca slices.
5016 SmallVector<Slice, 4> NewSlices;
5017
5018 // Track any allocas we end up splitting loads and stores for so we iterate
5019 // on them.
5020 SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
5021
5022 // At this point, we have collected all of the loads and stores we can
5023 // pre-split, and the specific splits needed for them. We actually do the
5024 // splitting in a specific order in order to handle when one of the loads in
5025 // the value operand to one of the stores.
5026 //
5027 // First, we rewrite all of the split loads, and just accumulate each split
5028 // load in a parallel structure. We also build the slices for them and append
5029 // them to the alloca slices.
5030 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
5031 std::vector<LoadInst *> SplitLoads;
5032 const DataLayout &DL = AI.getDataLayout();
5033 for (LoadInst *LI : Loads) {
5034 SplitLoads.clear();
5035
5036 auto &Offsets = SplitOffsetsMap[LI];
5037 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
5038 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
5039 "Load must have type size equal to store size");
5040 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
5041 "Load must be >= slice size");
5042
5043 uint64_t BaseOffset = Offsets.S->beginOffset();
5044 assert(BaseOffset + SliceSize > BaseOffset &&
5045 "Cannot represent alloca access size using 64-bit integers!");
5046
5047 Instruction *BasePtr = cast<Instruction>(Val: LI->getPointerOperand());
5048 IRB.SetInsertPoint(LI);
5049
5050 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
5051
5052 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
5053 int Idx = 0, Size = Offsets.Splits.size();
5054 for (;;) {
5055 auto *PartTy = Type::getIntNTy(C&: LI->getContext(), N: PartSize * 8);
5056 auto AS = LI->getPointerAddressSpace();
5057 auto *PartPtrTy = LI->getPointerOperandType();
5058 LoadInst *PLoad = IRB.CreateAlignedLoad(
5059 Ty: PartTy,
5060 Ptr: getAdjustedPtr(IRB, DL, Ptr: BasePtr,
5061 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
5062 PointerTy: PartPtrTy, NamePrefix: BasePtr->getName() + "."),
5063 Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
5064 /*IsVolatile*/ isVolatile: false, Name: LI->getName());
5065 PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
5066 LLVMContext::MD_access_group});
5067
5068 // Append this load onto the list of split loads so we can find it later
5069 // to rewrite the stores.
5070 SplitLoads.push_back(x: PLoad);
5071
5072 // Now build a new slice for the alloca.
5073 NewSlices.push_back(
5074 Elt: Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5075 &PLoad->getOperandUse(i: PLoad->getPointerOperandIndex()),
5076 /*IsSplittable*/ false));
5077 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5078 << ", " << NewSlices.back().endOffset()
5079 << "): " << *PLoad << "\n");
5080
5081 // See if we've handled all the splits.
5082 if (Idx >= Size)
5083 break;
5084
5085 // Setup the next partition.
5086 PartOffset = Offsets.Splits[Idx];
5087 ++Idx;
5088 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
5089 }
5090
5091 // Now that we have the split loads, do the slow walk over all uses of the
5092 // load and rewrite them as split stores, or save the split loads to use
5093 // below if the store is going to be split there anyways.
5094 bool DeferredStores = false;
5095 for (User *LU : LI->users()) {
5096 StoreInst *SI = cast<StoreInst>(Val: LU);
5097 if (!Stores.empty() && SplitOffsetsMap.count(Val: SI)) {
5098 DeferredStores = true;
5099 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
5100 << "\n");
5101 continue;
5102 }
5103
5104 Value *StoreBasePtr = SI->getPointerOperand();
5105 IRB.SetInsertPoint(SI);
5106 AAMDNodes AATags = SI->getAAMetadata();
5107
5108 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
5109
5110 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
5111 LoadInst *PLoad = SplitLoads[Idx];
5112 uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
5113 auto *PartPtrTy = SI->getPointerOperandType();
5114
5115 auto AS = SI->getPointerAddressSpace();
5116 StoreInst *PStore = IRB.CreateAlignedStore(
5117 Val: PLoad,
5118 Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
5119 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
5120 PointerTy: PartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
5121 Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
5122 /*IsVolatile*/ isVolatile: false);
5123 PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
5124 LLVMContext::MD_access_group,
5125 LLVMContext::MD_DIAssignID});
5126
5127 if (AATags)
5128 PStore->setAAMetadata(
5129 AATags.adjustForAccess(Offset: PartOffset, AccessTy: PLoad->getType(), DL));
5130 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
5131 }
5132
5133 // We want to immediately iterate on any allocas impacted by splitting
5134 // this store, and we have to track any promotable alloca (indicated by
5135 // a direct store) as needing to be resplit because it is no longer
5136 // promotable.
5137 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: StoreBasePtr)) {
5138 ResplitPromotableAllocas.insert(Ptr: OtherAI);
5139 Worklist.insert(X: OtherAI);
5140 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5141 Val: StoreBasePtr->stripInBoundsOffsets())) {
5142 Worklist.insert(X: OtherAI);
5143 }
5144
5145 // Mark the original store as dead.
5146 DeadInsts.push_back(Elt: SI);
5147 }
5148
5149 // Save the split loads if there are deferred stores among the users.
5150 if (DeferredStores)
5151 SplitLoadsMap.insert(KV: std::make_pair(x&: LI, y: std::move(SplitLoads)));
5152
5153 // Mark the original load as dead and kill the original slice.
5154 DeadInsts.push_back(Elt: LI);
5155 Offsets.S->kill();
5156 }
5157
5158 // Second, we rewrite all of the split stores. At this point, we know that
5159 // all loads from this alloca have been split already. For stores of such
5160 // loads, we can simply look up the pre-existing split loads. For stores of
5161 // other loads, we split those loads first and then write split stores of
5162 // them.
5163 for (StoreInst *SI : Stores) {
5164 auto *LI = cast<LoadInst>(Val: SI->getValueOperand());
5165 IntegerType *Ty = cast<IntegerType>(Val: LI->getType());
5166 assert(Ty->getBitWidth() % 8 == 0);
5167 uint64_t StoreSize = Ty->getBitWidth() / 8;
5168 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
5169
5170 auto &Offsets = SplitOffsetsMap[SI];
5171 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
5172 "Slice size should always match load size exactly!");
5173 uint64_t BaseOffset = Offsets.S->beginOffset();
5174 assert(BaseOffset + StoreSize > BaseOffset &&
5175 "Cannot represent alloca access size using 64-bit integers!");
5176
5177 Value *LoadBasePtr = LI->getPointerOperand();
5178 Instruction *StoreBasePtr = cast<Instruction>(Val: SI->getPointerOperand());
5179
5180 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
5181
5182 // Check whether we have an already split load.
5183 auto SplitLoadsMapI = SplitLoadsMap.find(Val: LI);
5184 std::vector<LoadInst *> *SplitLoads = nullptr;
5185 if (SplitLoadsMapI != SplitLoadsMap.end()) {
5186 SplitLoads = &SplitLoadsMapI->second;
5187 assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
5188 "Too few split loads for the number of splits in the store!");
5189 } else {
5190 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
5191 }
5192
5193 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
5194 int Idx = 0, Size = Offsets.Splits.size();
5195 for (;;) {
5196 auto *PartTy = Type::getIntNTy(C&: Ty->getContext(), N: PartSize * 8);
5197 auto *LoadPartPtrTy = LI->getPointerOperandType();
5198 auto *StorePartPtrTy = SI->getPointerOperandType();
5199
5200 // Either lookup a split load or create one.
5201 LoadInst *PLoad;
5202 if (SplitLoads) {
5203 PLoad = (*SplitLoads)[Idx];
5204 } else {
5205 IRB.SetInsertPoint(LI);
5206 auto AS = LI->getPointerAddressSpace();
5207 PLoad = IRB.CreateAlignedLoad(
5208 Ty: PartTy,
5209 Ptr: getAdjustedPtr(IRB, DL, Ptr: LoadBasePtr,
5210 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
5211 PointerTy: LoadPartPtrTy, NamePrefix: LoadBasePtr->getName() + "."),
5212 Align: getAdjustedAlignment(I: LI, Offset: PartOffset),
5213 /*IsVolatile*/ isVolatile: false, Name: LI->getName());
5214 PLoad->copyMetadata(SrcInst: *LI, WL: {LLVMContext::MD_mem_parallel_loop_access,
5215 LLVMContext::MD_access_group});
5216 }
5217
5218 // And store this partition.
5219 IRB.SetInsertPoint(SI);
5220 auto AS = SI->getPointerAddressSpace();
5221 StoreInst *PStore = IRB.CreateAlignedStore(
5222 Val: PLoad,
5223 Ptr: getAdjustedPtr(IRB, DL, Ptr: StoreBasePtr,
5224 Offset: APInt(DL.getIndexSizeInBits(AS), PartOffset),
5225 PointerTy: StorePartPtrTy, NamePrefix: StoreBasePtr->getName() + "."),
5226 Align: getAdjustedAlignment(I: SI, Offset: PartOffset),
5227 /*IsVolatile*/ isVolatile: false);
5228 PStore->copyMetadata(SrcInst: *SI, WL: {LLVMContext::MD_mem_parallel_loop_access,
5229 LLVMContext::MD_access_group});
5230
5231 // Now build a new slice for the alloca.
5232 NewSlices.push_back(
5233 Elt: Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
5234 &PStore->getOperandUse(i: PStore->getPointerOperandIndex()),
5235 /*IsSplittable*/ false));
5236 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
5237 << ", " << NewSlices.back().endOffset()
5238 << "): " << *PStore << "\n");
5239 if (!SplitLoads) {
5240 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
5241 }
5242
5243 // See if we've finished all the splits.
5244 if (Idx >= Size)
5245 break;
5246
5247 // Setup the next partition.
5248 PartOffset = Offsets.Splits[Idx];
5249 ++Idx;
5250 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
5251 }
5252
5253 // We want to immediately iterate on any allocas impacted by splitting
5254 // this load, which is only relevant if it isn't a load of this alloca and
5255 // thus we didn't already split the loads above. We also have to keep track
5256 // of any promotable allocas we split loads on as they can no longer be
5257 // promoted.
5258 if (!SplitLoads) {
5259 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(Val: LoadBasePtr)) {
5260 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5261 ResplitPromotableAllocas.insert(Ptr: OtherAI);
5262 Worklist.insert(X: OtherAI);
5263 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
5264 Val: LoadBasePtr->stripInBoundsOffsets())) {
5265 assert(OtherAI != &AI && "We can't re-split our own alloca!");
5266 Worklist.insert(X: OtherAI);
5267 }
5268 }
5269
5270 // Mark the original store as dead now that we've split it up and kill its
5271 // slice. Note that we leave the original load in place unless this store
5272 // was its only use. It may in turn be split up if it is an alloca load
5273 // for some other alloca, but it may be a normal load. This may introduce
5274 // redundant loads, but where those can be merged the rest of the optimizer
5275 // should handle the merging, and this uncovers SSA splits which is more
5276 // important. In practice, the original loads will almost always be fully
5277 // split and removed eventually, and the splits will be merged by any
5278 // trivial CSE, including instcombine.
5279 if (LI->hasOneUse()) {
5280 assert(*LI->user_begin() == SI && "Single use isn't this store!");
5281 DeadInsts.push_back(Elt: LI);
5282 }
5283 DeadInsts.push_back(Elt: SI);
5284 Offsets.S->kill();
5285 }
5286
5287 // Remove the killed slices that have ben pre-split.
5288 llvm::erase_if(C&: AS, P: [](const Slice &S) { return S.isDead(); });
5289
5290 // Insert our new slices. This will sort and merge them into the sorted
5291 // sequence.
5292 AS.insert(NewSlices);
5293
5294 LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
5295#ifndef NDEBUG
5296 for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
5297 LLVM_DEBUG(AS.print(dbgs(), I, " "));
5298#endif
5299
5300 // Finally, don't try to promote any allocas that new require re-splitting.
5301 // They have already been added to the worklist above.
5302 PromotableAllocas.set_subtract(ResplitPromotableAllocas);
5303
5304 return true;
5305}
5306
5307/// Try to canonicalize a homogeneous struct partition to a vector type.
5308///
5309/// We can do this if all the elements of the struct are the same and the
5310/// corresponding vector has the same byte-level layout. This can sometimes
5311/// eliminate allocas because structs cannot get promoted to LLVM values, but
5312/// vectors can.
5313///
5314/// We only apply this transformation when all users of the partition are memory
5315/// intrinsics. Otherwise, if there is a load or store of some other type to the
5316/// partition, SROA would select that type.
5317///
5318/// Applying this transformation too early may hinder memcpyopt, which may
5319/// generate better code when eliminating allocas. For example, see
5320/// `struct-to-vector-fp-store-only-tail.ll`, which demonstrates that applying
5321/// this before memcpyopt can initialize previously uninitialized memory when
5322/// the alloca gets promoted to an SSA value. For another example, see
5323/// `struct-to-vector-before-memcpyopt.ll`, which demonstrates that applying
5324/// this before memcpyopt can result in promoting an alloca so that we load a
5325/// temporary value instead of copying the temporary value into memory, whereas
5326/// memcpyopt eliminates the temporary altogether.
5327///
5328/// As such, we only apply this transformation after memcpyopt has run. We gate
5329/// this transformation by the "AggregateToVector" pass option.
5330static FixedVectorType *tryCanonicalizeStructToVector(StructType *STy,
5331 Partition &P,
5332 const DataLayout &DL) {
5333 unsigned NumElts = STy->getNumElements();
5334
5335 Type *EltTy = STy->getElementType(N: 0);
5336 if (!llvm::all_equal(Range: STy->elements()))
5337 return nullptr;
5338
5339 bool IsIntegralPointerTy =
5340 EltTy->isPointerTy() && !DL.isNonIntegralPointerType(Ty: EltTy);
5341 if (!EltTy->isIntegerTy() && !EltTy->isFloatingPointTy() &&
5342 !IsIntegralPointerTy)
5343 return nullptr;
5344
5345 // Ensure the struct is tightly packed so that the bit-layout is the same as
5346 // the corresponding vector. For example, this prevents a miscompile for
5347 // { i5, i5 }, which has padding after each i5 field, whereas <i5, i5> has
5348 // tightly packed elements and trailing padding.
5349 if (DL.getTypeSizeInBits(Ty: EltTy) != DL.getTypeAllocSizeInBits(Ty: EltTy))
5350 return nullptr;
5351
5352 auto *VTy = FixedVectorType::get(ElementType: EltTy, NumElts);
5353 TypeSize StructSize = DL.getStructLayout(Ty: STy)->getSizeInBytes();
5354 TypeSize VectorSize = DL.getTypeStoreSize(Ty: VTy);
5355 // After ruling out per-element padding, make sure a vector load/store
5356 // covers the same number of bytes as the struct layout.
5357 if (StructSize != VectorSize)
5358 return nullptr;
5359
5360 auto IsIgnorableOrMemIntrinsicSlice = [](const Slice &S) {
5361 if (S.isDead())
5362 return true;
5363 auto *U = S.getUse();
5364 if (!U)
5365 return true;
5366
5367 User *Usr = U->getUser();
5368 if (isa<LifetimeIntrinsic>(Val: Usr) || isa<DbgInfoIntrinsic>(Val: Usr))
5369 return true;
5370
5371 return isa<MemIntrinsic>(Val: Usr);
5372 };
5373
5374 for (const Slice &S : P)
5375 if (!IsIgnorableOrMemIntrinsicSlice(S))
5376 return nullptr;
5377
5378 for (const Slice *S : P.splitSliceTails())
5379 if (!IsIgnorableOrMemIntrinsicSlice(*S))
5380 return nullptr;
5381
5382 return VTy;
5383}
5384
5385/// Select a partition type for an alloca partition.
5386///
5387/// Try to compute a friendly type for this partition of the alloca. This
5388/// won't always succeed, in which case we fall back to a legal integer type
5389/// or an i8 array of an appropriate size.
5390///
5391/// \returns A tuple with the following elements:
5392/// - PartitionType: The computed type for this partition.
5393/// - IsIntegerWideningViable: True if integer widening promotion is used.
5394/// - VectorType: The vector type if vector promotion is used, otherwise
5395/// nullptr.
5396static std::tuple<Type *, bool, VectorType *>
5397selectPartitionType(Partition &P, const DataLayout &DL, AllocaInst &AI,
5398 LLVMContext &C, bool AggregateToVector) {
5399 auto LogSelection = [&](StringRef Path, Type *SelectedTy,
5400 VectorType *SelectedVecTy, bool SelectedIntWidening) {
5401 LLVM_DEBUG({
5402 dbgs() << "selectPartitionType path=" << Path
5403 << " func=" << AI.getFunction()->getName() << " alloca=";
5404 if (AI.hasName())
5405 dbgs() << AI.getName();
5406 else
5407 dbgs() << "<unnamed>";
5408 dbgs() << " partition=[" << P.beginOffset() << "," << P.endOffset()
5409 << ") size=" << P.size();
5410 if (std::optional<TypeSize> AllocSize = AI.getAllocationSize(DL))
5411 dbgs() << " alloc-size=" << AllocSize->getKnownMinValue();
5412 if (SelectedTy)
5413 dbgs() << " chosen=" << *SelectedTy;
5414 if (SelectedVecTy)
5415 dbgs() << " vec=" << *SelectedVecTy;
5416 dbgs() << " intwiden=" << SelectedIntWidening << "\n";
5417 });
5418 };
5419 // First check if the partition is viable for vector promotion.
5420 //
5421 // We prefer vector promotion over integer widening promotion when:
5422 // - The vector element type is a floating-point type.
5423 // - All the loads/stores to the alloca are vector loads/stores to the
5424 // entire alloca or load/store a single element of the vector.
5425 //
5426 // Otherwise when there is an integer vector with mixed type loads/stores we
5427 // prefer integer widening promotion because it's more likely the user is
5428 // doing bitwise arithmetic and we generate better code.
5429 VectorType *VecTy =
5430 isVectorPromotionViable(P, DL, VScale: AI.getFunction()->getVScaleValue());
5431 // If the vector element type is a floating-point type, we prefer vector
5432 // promotion. If the vector has one element, let the below code select
5433 // whether we promote with the vector or scalar.
5434 if (VecTy && VecTy->getElementType()->isFloatingPointTy() &&
5435 VecTy->getElementCount().getFixedValue() > 1) {
5436 LogSelection("direct-fp-vecty", VecTy, VecTy, false);
5437 return {VecTy, false, VecTy};
5438 }
5439
5440 // Check if there is a common type that all slices of the partition use that
5441 // spans the partition.
5442 auto [CommonUseTy, LargestIntTy] =
5443 findCommonType(B: P.begin(), E: P.end(), EndOffset: P.endOffset());
5444 if (CommonUseTy) {
5445 TypeSize CommonUseSize = DL.getTypeAllocSize(Ty: CommonUseTy);
5446 if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
5447 // We prefer vector promotion here because if vector promotion is viable
5448 // and there is a common type used, then it implies the second listed
5449 // condition for preferring vector promotion is true.
5450 if (VecTy) {
5451 LogSelection("common-type-vecty", VecTy, VecTy, false);
5452 return {VecTy, false, VecTy};
5453 }
5454 bool IntWiden = isIntegerWideningViable(P, AllocaTy: CommonUseTy, DL);
5455 LogSelection("common-type", CommonUseTy, nullptr, IntWiden);
5456 return {CommonUseTy, IntWiden, nullptr};
5457 }
5458 }
5459
5460 // Can we find an appropriate subtype in the original allocated
5461 // type?
5462 if (Type *TypePartitionTy = getTypePartition(DL, Ty: AI.getAllocatedType(),
5463 Offset: P.beginOffset(), Size: P.size())) {
5464 // If the partition is an integer array that can be spanned by a legal
5465 // integer type, prefer to represent it as a legal integer type because
5466 // it's more likely to be promotable.
5467 if (TypePartitionTy->isArrayTy() &&
5468 TypePartitionTy->getArrayElementType()->isIntegerTy() &&
5469 DL.isLegalInteger(Width: P.size() * 8))
5470 TypePartitionTy = Type::getIntNTy(C, N: P.size() * 8);
5471 // There was no common type used, so we prefer integer widening promotion.
5472 if (isIntegerWideningViable(P, AllocaTy: TypePartitionTy, DL)) {
5473 LogSelection("type-partition-int-widen", TypePartitionTy, nullptr, true);
5474 return {TypePartitionTy, true, nullptr};
5475 }
5476 if (VecTy) {
5477 LogSelection("type-partition-vecty", VecTy, VecTy, false);
5478 return {VecTy, false, VecTy};
5479 }
5480 // If we couldn't promote with TypePartitionTy, try with the largest
5481 // integer type used.
5482 if (LargestIntTy &&
5483 DL.getTypeAllocSize(Ty: LargestIntTy).getFixedValue() >= P.size() &&
5484 isIntegerWideningViable(P, AllocaTy: LargestIntTy, DL)) {
5485 LogSelection("largest-int-int-widen", LargestIntTy, nullptr, true);
5486 return {LargestIntTy, true, nullptr};
5487 }
5488
5489 // Try homogeneous struct to vector canonicalization when requested. Running
5490 // this too early can hide memcpy chains from MemCpyOpt.
5491 if (AggregateToVector) {
5492 if (auto *STy = dyn_cast<StructType>(Val: TypePartitionTy)) {
5493 if (auto *VTy = tryCanonicalizeStructToVector(STy, P, DL)) {
5494 LogSelection("struct-fallback-vecty", VTy, nullptr, false);
5495 return {VTy, false, nullptr};
5496 }
5497 }
5498 }
5499
5500 // Fallback to TypePartitionTy and we probably won't promote.
5501 LogSelection("type-partition-fallback", TypePartitionTy, nullptr, false);
5502 return {TypePartitionTy, false, nullptr};
5503 }
5504
5505 // Select the largest integer type used if it spans the partition.
5506 if (LargestIntTy &&
5507 DL.getTypeAllocSize(Ty: LargestIntTy).getFixedValue() >= P.size()) {
5508 LogSelection("largest-int-fallback", LargestIntTy, nullptr, false);
5509 return {LargestIntTy, false, nullptr};
5510 }
5511
5512 // Select a legal integer type if it spans the partition.
5513 if (DL.isLegalInteger(Width: P.size() * 8)) {
5514 Type *IntTy = Type::getIntNTy(C, N: P.size() * 8);
5515 LogSelection("legal-int-fallback", IntTy, nullptr, false);
5516 return {IntTy, false, nullptr};
5517 }
5518
5519 // Fallback to an i8 array.
5520 Type *ArrayTy = ArrayType::get(ElementType: Type::getInt8Ty(C), NumElements: P.size());
5521 LogSelection("byte-array-fallback", ArrayTy, nullptr, false);
5522 return {ArrayTy, false, nullptr};
5523}
5524
5525/// Rewrite an alloca partition's users.
5526///
5527/// This routine drives both of the rewriting goals of the SROA pass. It tries
5528/// to rewrite uses of an alloca partition to be conducive for SSA value
5529/// promotion. If the partition needs a new, more refined alloca, this will
5530/// build that new alloca, preserving as much type information as possible, and
5531/// rewrite the uses of the old alloca to point at the new one and have the
5532/// appropriate new offsets. It also evaluates how successful the rewrite was
5533/// at enabling promotion and if it was successful queues the alloca to be
5534/// promoted.
5535std::pair<AllocaInst *, uint64_t>
5536SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P) {
5537 const DataLayout &DL = AI.getDataLayout();
5538 // Select the type for the new alloca that spans the partition.
5539 auto [PartitionTy, IsIntegerWideningViable, VecTy] =
5540 selectPartitionType(P, DL, AI, C&: *C, AggregateToVector);
5541
5542 // Check for the case where we're going to rewrite to a new alloca of the
5543 // exact same type as the original, and with the same access offsets. In that
5544 // case, re-use the existing alloca, but still run through the rewriter to
5545 // perform phi and select speculation.
5546 // P.beginOffset() can be non-zero even with the same type in a case with
5547 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
5548 AllocaInst *NewAI;
5549 if (PartitionTy == AI.getAllocatedType() && P.beginOffset() == 0) {
5550 NewAI = &AI;
5551 // FIXME: We should be able to bail at this point with "nothing changed".
5552 // FIXME: We might want to defer PHI speculation until after here.
5553 // FIXME: return nullptr;
5554 } else {
5555 // Make sure the alignment is compatible with P.beginOffset().
5556 const Align Alignment = commonAlignment(A: AI.getAlign(), Offset: P.beginOffset());
5557 // If we will get at least this much alignment from the type alone, leave
5558 // the alloca's alignment unconstrained.
5559 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(Ty: PartitionTy);
5560 NewAI = new AllocaInst(
5561 PartitionTy, AI.getAddressSpace(), nullptr,
5562 IsUnconstrained ? DL.getPrefTypeAlign(Ty: PartitionTy) : Alignment,
5563 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5564 AI.getIterator());
5565 // Copy the old AI debug location over to the new one.
5566 NewAI->setDebugLoc(AI.getDebugLoc());
5567 ++NumNewAllocas;
5568 }
5569
5570 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5571 << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5572
5573 // Track the high watermark on the worklist as it is only relevant for
5574 // promoted allocas. We will reset it to this point if the alloca is not in
5575 // fact scheduled for promotion.
5576 unsigned PPWOldSize = PostPromotionWorklist.size();
5577 unsigned NumUses = 0;
5578 SmallSetVector<PHINode *, 8> PHIUsers;
5579 SmallSetVector<SelectInst *, 8> SelectUsers;
5580
5581 AllocaSliceRewriter Rewriter(
5582 DL, AS, *this, AI, *NewAI, PartitionTy, P.beginOffset(), P.endOffset(),
5583 IsIntegerWideningViable, VecTy, PHIUsers, SelectUsers);
5584 bool Promotable = true;
5585 // Check whether we can have tree-structured merge.
5586 if (auto DeletedValues = Rewriter.rewriteTreeStructuredMerge(P)) {
5587 NumUses += DeletedValues->size() + 1;
5588 for (Value *V : *DeletedValues)
5589 DeadInsts.push_back(Elt: V);
5590 } else {
5591 for (Slice *S : P.splitSliceTails()) {
5592 Promotable &= Rewriter.visit(I: S);
5593 ++NumUses;
5594 }
5595 for (Slice &S : P) {
5596 Promotable &= Rewriter.visit(I: &S);
5597 ++NumUses;
5598 }
5599 }
5600
5601 NumAllocaPartitionUses += NumUses;
5602 MaxUsesPerAllocaPartition.updateMax(V: NumUses);
5603
5604 // Now that we've processed all the slices in the new partition, check if any
5605 // PHIs or Selects would block promotion.
5606 for (PHINode *PHI : PHIUsers)
5607 if (!isSafePHIToSpeculate(PN&: *PHI)) {
5608 Promotable = false;
5609 PHIUsers.clear();
5610 SelectUsers.clear();
5611 break;
5612 }
5613
5614 SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
5615 NewSelectsToRewrite;
5616 NewSelectsToRewrite.reserve(N: SelectUsers.size());
5617 for (SelectInst *Sel : SelectUsers) {
5618 std::optional<RewriteableMemOps> Ops =
5619 isSafeSelectToSpeculate(SI&: *Sel, PreserveCFG);
5620 if (!Ops) {
5621 Promotable = false;
5622 PHIUsers.clear();
5623 SelectUsers.clear();
5624 NewSelectsToRewrite.clear();
5625 break;
5626 }
5627 NewSelectsToRewrite.emplace_back(Args: std::make_pair(x&: Sel, y&: *Ops));
5628 }
5629
5630 if (Promotable) {
5631 for (Use *U : AS.getDeadUsesIfPromotable()) {
5632 auto *OldInst = dyn_cast<Instruction>(Val: U->get());
5633 Value::dropDroppableUse(U&: *U);
5634 if (OldInst)
5635 if (isInstructionTriviallyDead(I: OldInst))
5636 DeadInsts.push_back(Elt: OldInst);
5637 }
5638 if (PHIUsers.empty() && SelectUsers.empty()) {
5639 // Promote the alloca.
5640 PromotableAllocas.insert(X: NewAI);
5641 } else {
5642 // If we have either PHIs or Selects to speculate, add them to those
5643 // worklists and re-queue the new alloca so that we promote in on the
5644 // next iteration.
5645 SpeculatablePHIs.insert_range(R&: PHIUsers);
5646 SelectsToRewrite.reserve(NumEntries: SelectsToRewrite.size() +
5647 NewSelectsToRewrite.size());
5648 for (auto &&KV : llvm::make_range(
5649 x: std::make_move_iterator(i: NewSelectsToRewrite.begin()),
5650 y: std::make_move_iterator(i: NewSelectsToRewrite.end())))
5651 SelectsToRewrite.insert(KV: std::move(KV));
5652 Worklist.insert(X: NewAI);
5653 }
5654 } else {
5655 // Drop any post-promotion work items if promotion didn't happen.
5656 while (PostPromotionWorklist.size() > PPWOldSize)
5657 PostPromotionWorklist.pop_back();
5658
5659 // We couldn't promote and we didn't create a new partition, nothing
5660 // happened.
5661 if (NewAI == &AI)
5662 return {nullptr, 0};
5663
5664 // If we can't promote the alloca, iterate on it to check for new
5665 // refinements exposed by splitting the current alloca. Don't iterate on an
5666 // alloca which didn't actually change and didn't get promoted.
5667 Worklist.insert(X: NewAI);
5668 }
5669
5670 return {NewAI, DL.getTypeSizeInBits(Ty: PartitionTy).getFixedValue()};
5671}
5672
5673// There isn't a shared interface to get the "address" parts out of a
5674// dbg.declare and dbg.assign, so provide some wrappers.
5675bool isKillAddress(const DbgVariableRecord *DVR) {
5676 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5677 return DVR->isKillAddress();
5678 return DVR->isKillLocation();
5679}
5680
5681const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5682 if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5683 return DVR->getAddressExpression();
5684 return DVR->getExpression();
5685}
5686
5687/// Create or replace an existing fragment in a DIExpression with \p Frag.
5688/// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5689/// operation, add \p BitExtractOffset to the offset part.
5690///
5691/// Returns the new expression, or nullptr if this fails (see details below).
5692///
5693/// This function is similar to DIExpression::createFragmentExpression except
5694/// for 3 important distinctions:
5695/// 1. The new fragment isn't relative to an existing fragment.
5696/// 2. It assumes the computed location is a memory location. This means we
5697/// don't need to perform checks that creating the fragment preserves the
5698/// expression semantics.
5699/// 3. Existing extract_bits are modified independently of fragment changes
5700/// using \p BitExtractOffset. A change to the fragment offset or size
5701/// may affect a bit extract. But a bit extract offset can change
5702/// independently of the fragment dimensions.
5703///
5704/// Returns the new expression, or nullptr if one couldn't be created.
5705/// Ideally this is only used to signal that a bit-extract has become
5706/// zero-sized (and thus the new debug record has no size and can be
5707/// dropped), however, it fails for other reasons too - see the FIXME below.
5708///
5709/// FIXME: To keep the change that introduces this function NFC it bails
5710/// in some situations unecessarily, e.g. when fragment and bit extract
5711/// sizes differ.
5712static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5713 DIExpression::FragmentInfo Frag,
5714 int64_t BitExtractOffset) {
5715 SmallVector<uint64_t, 8> Ops;
5716 bool HasFragment = false;
5717 bool HasBitExtract = false;
5718
5719 for (auto &Op : Expr->expr_ops()) {
5720 if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5721 HasFragment = true;
5722 continue;
5723 }
5724 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5725 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5726 HasBitExtract = true;
5727 int64_t ExtractOffsetInBits = Op.getArg(I: 0);
5728 int64_t ExtractSizeInBits = Op.getArg(I: 1);
5729
5730 // DIExpression::createFragmentExpression doesn't know how to handle
5731 // a fragment that is smaller than the extract. Copy the behaviour
5732 // (bail) to avoid non-NFC changes.
5733 // FIXME: Don't do this.
5734 if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5735 return nullptr;
5736
5737 assert(BitExtractOffset <= 0);
5738 int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5739
5740 // DIExpression::createFragmentExpression doesn't know what to do
5741 // if the new extract starts "outside" the existing one. Copy the
5742 // behaviour (bail) to avoid non-NFC changes.
5743 // FIXME: Don't do this.
5744 if (AdjustedOffset < 0)
5745 return nullptr;
5746
5747 Ops.push_back(Elt: Op.getOp());
5748 Ops.push_back(Elt: std::max<int64_t>(a: 0, b: AdjustedOffset));
5749 Ops.push_back(Elt: ExtractSizeInBits);
5750 continue;
5751 }
5752 Op.appendToVector(V&: Ops);
5753 }
5754
5755 // Unsupported by createFragmentExpression, so don't support it here yet to
5756 // preserve NFC-ness.
5757 if (HasFragment && HasBitExtract)
5758 return nullptr;
5759
5760 if (!HasBitExtract) {
5761 Ops.push_back(Elt: dwarf::DW_OP_LLVM_fragment);
5762 Ops.push_back(Elt: Frag.OffsetInBits);
5763 Ops.push_back(Elt: Frag.SizeInBits);
5764 }
5765 return DIExpression::get(Context&: Expr->getContext(), Elements: Ops);
5766}
5767
5768/// Insert a new DbgRecord.
5769/// \p Orig Original to copy record type, debug loc and variable from, and
5770/// additionally value and value expression for dbg_assign records.
5771/// \p NewAddr Location's new base address.
5772/// \p NewAddrExpr New expression to apply to address.
5773/// \p BeforeInst Insert position.
5774/// \p NewFragment New fragment (absolute, non-relative).
5775/// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5776static void
5777insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5778 DIExpression *NewAddrExpr, Instruction *BeforeInst,
5779 std::optional<DIExpression::FragmentInfo> NewFragment,
5780 int64_t BitExtractAdjustment) {
5781 (void)DIB;
5782
5783 // A dbg_assign puts fragment info in the value expression only. The address
5784 // expression has already been built: NewAddrExpr. A dbg_declare puts the
5785 // new fragment info into NewAddrExpr (as it only has one expression).
5786 DIExpression *NewFragmentExpr =
5787 Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5788 if (NewFragment)
5789 NewFragmentExpr = createOrReplaceFragment(Expr: NewFragmentExpr, Frag: *NewFragment,
5790 BitExtractOffset: BitExtractAdjustment);
5791 if (!NewFragmentExpr)
5792 return;
5793
5794 if (Orig->isDbgDeclare()) {
5795 DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5796 Address: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
5797 BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
5798 Here: BeforeInst->getIterator());
5799 return;
5800 }
5801
5802 if (Orig->isDbgValue()) {
5803 DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5804 Location: NewAddr, DV: Orig->getVariable(), Expr: NewFragmentExpr, DI: Orig->getDebugLoc());
5805 // Drop debug information if the expression doesn't start with a
5806 // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5807 // describes the address of alloca rather than the value inside the alloca.
5808 if (!NewFragmentExpr->startsWithDeref())
5809 DVR->setKillAddress();
5810 BeforeInst->getParent()->insertDbgRecordBefore(DR: DVR,
5811 Here: BeforeInst->getIterator());
5812 return;
5813 }
5814
5815 // Apply a DIAssignID to the store if it doesn't already have it.
5816 if (!NewAddr->hasMetadata(KindID: LLVMContext::MD_DIAssignID)) {
5817 NewAddr->setMetadata(KindID: LLVMContext::MD_DIAssignID,
5818 Node: DIAssignID::getDistinct(Context&: NewAddr->getContext()));
5819 }
5820
5821 DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5822 LinkedInstr: NewAddr, Val: Orig->getValue(), Variable: Orig->getVariable(), Expression: NewFragmentExpr, Address: NewAddr,
5823 AddressExpression: NewAddrExpr, DI: Orig->getDebugLoc());
5824 LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5825 (void)NewAssign;
5826}
5827
5828/// Walks the slices of an alloca and form partitions based on them,
5829/// rewriting each of their uses.
5830bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5831 if (AS.begin() == AS.end())
5832 return false;
5833
5834 unsigned NumPartitions = 0;
5835 bool Changed = false;
5836 const DataLayout &DL = AI.getModule()->getDataLayout();
5837
5838 // First try to pre-split loads and stores.
5839 Changed |= presplitLoadsAndStores(AI, AS);
5840
5841 // Now that we have identified any pre-splitting opportunities,
5842 // mark loads and stores unsplittable except for the following case.
5843 // We leave a slice splittable if all other slices are disjoint or fully
5844 // included in the slice, such as whole-alloca loads and stores.
5845 // If we fail to split these during pre-splitting, we want to force them
5846 // to be rewritten into a partition.
5847 bool IsSorted = true;
5848
5849 uint64_t AllocaSize = AI.getAllocationSize(DL)->getFixedValue();
5850 const uint64_t MaxBitVectorSize = 1024;
5851 if (AllocaSize <= MaxBitVectorSize) {
5852 // If a byte boundary is included in any load or store, a slice starting or
5853 // ending at the boundary is not splittable.
5854 SmallBitVector SplittableOffset(AllocaSize + 1, true);
5855 for (Slice &S : AS)
5856 for (unsigned O = S.beginOffset() + 1;
5857 O < S.endOffset() && O < AllocaSize; O++)
5858 SplittableOffset.reset(Idx: O);
5859
5860 for (Slice &S : AS) {
5861 if (!S.isSplittable())
5862 continue;
5863
5864 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5865 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5866 continue;
5867
5868 if (isa<LoadInst>(Val: S.getUse()->getUser()) ||
5869 isa<StoreInst>(Val: S.getUse()->getUser())) {
5870 S.makeUnsplittable();
5871 IsSorted = false;
5872 }
5873 }
5874 } else {
5875 // We only allow whole-alloca splittable loads and stores
5876 // for a large alloca to avoid creating too large BitVector.
5877 for (Slice &S : AS) {
5878 if (!S.isSplittable())
5879 continue;
5880
5881 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5882 continue;
5883
5884 if (isa<LoadInst>(Val: S.getUse()->getUser()) ||
5885 isa<StoreInst>(Val: S.getUse()->getUser())) {
5886 S.makeUnsplittable();
5887 IsSorted = false;
5888 }
5889 }
5890 }
5891
5892 if (!IsSorted)
5893 llvm::stable_sort(Range&: AS);
5894
5895 /// Describes the allocas introduced by rewritePartition in order to migrate
5896 /// the debug info.
5897 struct Fragment {
5898 AllocaInst *Alloca;
5899 uint64_t Offset;
5900 uint64_t Size;
5901 Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5902 : Alloca(AI), Offset(O), Size(S) {}
5903 };
5904 SmallVector<Fragment, 4> Fragments;
5905
5906 // Rewrite each partition.
5907 for (auto &P : AS.partitions()) {
5908 auto [NewAI, ActiveBits] = rewritePartition(AI, AS, P);
5909 if (NewAI) {
5910 Changed = true;
5911 if (NewAI != &AI) {
5912 uint64_t SizeOfByte = 8;
5913 // Don't include any padding.
5914 uint64_t Size = std::min(a: ActiveBits, b: P.size() * SizeOfByte);
5915 Fragments.push_back(
5916 Elt: Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5917 }
5918 }
5919 ++NumPartitions;
5920 }
5921
5922 NumAllocaPartitions += NumPartitions;
5923 MaxPartitionsPerAlloca.updateMax(V: NumPartitions);
5924
5925 // Migrate debug information from the old alloca to the new alloca(s)
5926 // and the individual partitions.
5927 auto MigrateOne = [&](DbgVariableRecord *DbgVariable) {
5928 // Can't overlap with undef memory.
5929 if (isKillAddress(DVR: DbgVariable))
5930 return;
5931
5932 const Value *DbgPtr = DbgVariable->getAddress();
5933 DIExpression::FragmentInfo VarFrag =
5934 DbgVariable->getFragmentOrEntireVariable();
5935 // Get the address expression constant offset if one exists and the ops
5936 // that come after it.
5937 int64_t CurrentExprOffsetInBytes = 0;
5938 SmallVector<uint64_t> PostOffsetOps;
5939 if (!getAddressExpression(DVR: DbgVariable)
5940 ->extractLeadingOffset(OffsetInBytes&: CurrentExprOffsetInBytes, RemainingOps&: PostOffsetOps))
5941 return; // Couldn't interpret this DIExpression - drop the var.
5942
5943 // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5944 int64_t ExtractOffsetInBits = 0;
5945 for (auto Op : getAddressExpression(DVR: DbgVariable)->expr_ops()) {
5946 if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5947 Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5948 ExtractOffsetInBits = Op.getArg(I: 0);
5949 break;
5950 }
5951 }
5952
5953 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5954 for (auto Fragment : Fragments) {
5955 int64_t OffsetFromLocationInBits;
5956 std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5957 // Find the variable fragment that the new alloca slice covers.
5958 // Drop debug info for this variable fragment if we can't compute an
5959 // intersect between it and the alloca slice.
5960 if (!DIExpression::calculateFragmentIntersect(
5961 DL, SliceStart: &AI, SliceOffsetInBits: Fragment.Offset, SliceSizeInBits: Fragment.Size, DbgPtr,
5962 DbgPtrOffsetInBits: CurrentExprOffsetInBytes * 8, DbgExtractOffsetInBits: ExtractOffsetInBits, VarFrag,
5963 Result&: NewDbgFragment, OffsetFromLocationInBits))
5964 continue; // Do not migrate this fragment to this slice.
5965
5966 // Zero sized fragment indicates there's no intersect between the variable
5967 // fragment and the alloca slice. Skip this slice for this variable
5968 // fragment.
5969 if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5970 continue; // Do not migrate this fragment to this slice.
5971
5972 // No fragment indicates DbgVariable's variable or fragment exactly
5973 // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5974 if (!NewDbgFragment)
5975 NewDbgFragment = DbgVariable->getFragment();
5976
5977 // Reduce the new expression offset by the bit-extract offset since
5978 // we'll be keeping that.
5979 int64_t OffestFromNewAllocaInBits =
5980 OffsetFromLocationInBits - ExtractOffsetInBits;
5981 // We need to adjust an existing bit extract if the offset expression
5982 // can't eat the slack (i.e., if the new offset would be negative).
5983 int64_t BitExtractOffset =
5984 std::min<int64_t>(a: 0, b: OffestFromNewAllocaInBits);
5985 // The magnitude of a negative value indicates the number of bits into
5986 // the existing variable fragment that the memory region begins. The new
5987 // variable fragment already excludes those bits - the new DbgPtr offset
5988 // only needs to be applied if it's positive.
5989 OffestFromNewAllocaInBits =
5990 std::max(a: int64_t(0), b: OffestFromNewAllocaInBits);
5991
5992 // Rebuild the expression:
5993 // {Offset(OffestFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5994 // Add NewDbgFragment later, because dbg.assigns don't want it in the
5995 // address expression but the value expression instead.
5996 DIExpression *NewExpr = DIExpression::get(Context&: AI.getContext(), Elements: PostOffsetOps);
5997 if (OffestFromNewAllocaInBits > 0) {
5998 int64_t OffsetInBytes = (OffestFromNewAllocaInBits + 7) / 8;
5999 NewExpr = DIExpression::prepend(Expr: NewExpr, /*flags=*/Flags: 0, Offset: OffsetInBytes);
6000 }
6001
6002 // Remove any existing intrinsics on the new alloca describing
6003 // the variable fragment.
6004 auto RemoveOne = [DbgVariable](auto *OldDII) {
6005 auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
6006 return LHS->getVariable() == RHS->getVariable() &&
6007 LHS->getDebugLoc()->getInlinedAt() ==
6008 RHS->getDebugLoc()->getInlinedAt();
6009 };
6010 if (SameVariableFragment(OldDII, DbgVariable))
6011 OldDII->eraseFromParent();
6012 };
6013 for_each(Range: findDVRDeclares(V: Fragment.Alloca), F: RemoveOne);
6014 for_each(Range: findDVRValues(V: Fragment.Alloca), F: RemoveOne);
6015 insertNewDbgInst(DIB, Orig: DbgVariable, NewAddr: Fragment.Alloca, NewAddrExpr: NewExpr, BeforeInst: &AI,
6016 NewFragment: NewDbgFragment, BitExtractAdjustment: BitExtractOffset);
6017 }
6018 };
6019
6020 // Migrate debug information from the old alloca to the new alloca(s)
6021 // and the individual partitions.
6022 for_each(Range: findDVRDeclares(V: &AI), F: MigrateOne);
6023 for_each(Range: findDVRValues(V: &AI), F: MigrateOne);
6024 for_each(Range: at::getDVRAssignmentMarkers(Inst: &AI), F: MigrateOne);
6025
6026 return Changed;
6027}
6028
6029/// Clobber a use with poison, deleting the used value if it becomes dead.
6030void SROA::clobberUse(Use &U) {
6031 Value *OldV = U;
6032 // Replace the use with an poison value.
6033 U = PoisonValue::get(T: OldV->getType());
6034
6035 // Check for this making an instruction dead. We have to garbage collect
6036 // all the dead instructions to ensure the uses of any alloca end up being
6037 // minimal.
6038 if (Instruction *OldI = dyn_cast<Instruction>(Val: OldV))
6039 if (isInstructionTriviallyDead(I: OldI)) {
6040 DeadInsts.push_back(Elt: OldI);
6041 }
6042}
6043
6044/// A basic LoadAndStorePromoter that does not remove store nodes.
6045class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
6046public:
6047 BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
6048 Type *ZeroType)
6049 : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
6050 bool shouldDelete(Instruction *I) const override {
6051 return !isa<StoreInst>(Val: I) && !isa<AllocaInst>(Val: I);
6052 }
6053
6054 Value *getValueToUseForAlloca(Instruction *I) const override {
6055 return UndefValue::get(T: ZeroType);
6056 }
6057
6058private:
6059 Type *ZeroType;
6060};
6061
6062bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
6063 // Look through each "partition", looking for slices with the same start/end
6064 // that do not overlap with any before them. The slices are sorted by
6065 // increasing beginOffset. We don't use AS.partitions(), as it will use a more
6066 // sophisticated algorithm that takes splittable slices into account.
6067 LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
6068 bool AllSameAndValid = true;
6069 Type *PartitionType = nullptr;
6070 SmallVector<Instruction *> Insts;
6071 uint64_t BeginOffset = 0;
6072 uint64_t EndOffset = 0;
6073
6074 auto Flush = [&]() {
6075 if (AllSameAndValid && !Insts.empty()) {
6076 LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
6077 << EndOffset << ")\n");
6078 SmallVector<PHINode *, 4> NewPHIs;
6079 SSAUpdater SSA(&NewPHIs);
6080 Insts.push_back(Elt: &AI);
6081 BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
6082 Promoter.run(Insts);
6083 }
6084 AllSameAndValid = true;
6085 PartitionType = nullptr;
6086 Insts.clear();
6087 };
6088
6089 for (Slice &S : AS) {
6090 auto *User = cast<Instruction>(Val: S.getUse()->getUser());
6091 if (isAssumeLikeIntrinsic(I: User)) {
6092 LLVM_DEBUG({
6093 dbgs() << "Ignoring slice: ";
6094 AS.print(dbgs(), &S);
6095 });
6096 continue;
6097 }
6098 if (S.beginOffset() >= EndOffset) {
6099 Flush();
6100 BeginOffset = S.beginOffset();
6101 EndOffset = S.endOffset();
6102 } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
6103 if (AllSameAndValid) {
6104 LLVM_DEBUG({
6105 dbgs() << "Slice does not match range [" << BeginOffset << ", "
6106 << EndOffset << ")";
6107 AS.print(dbgs(), &S);
6108 });
6109 AllSameAndValid = false;
6110 }
6111 EndOffset = std::max(a: EndOffset, b: S.endOffset());
6112 continue;
6113 }
6114
6115 if (auto *LI = dyn_cast<LoadInst>(Val: User)) {
6116 Type *UserTy = LI->getType();
6117 // LoadAndStorePromoter requires all the types to be the same.
6118 if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
6119 AllSameAndValid = false;
6120 PartitionType = UserTy;
6121 Insts.push_back(Elt: User);
6122 } else if (auto *SI = dyn_cast<StoreInst>(Val: User)) {
6123 Type *UserTy = SI->getValueOperand()->getType();
6124 if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
6125 AllSameAndValid = false;
6126 PartitionType = UserTy;
6127 Insts.push_back(Elt: User);
6128 } else {
6129 AllSameAndValid = false;
6130 }
6131 }
6132
6133 Flush();
6134 return true;
6135}
6136
6137/// Analyze an alloca for SROA.
6138///
6139/// This analyzes the alloca to ensure we can reason about it, builds
6140/// the slices of the alloca, and then hands it off to be split and
6141/// rewritten as needed.
6142std::pair<bool /*Changed*/, bool /*CFGChanged*/>
6143SROA::runOnAlloca(AllocaInst &AI) {
6144 bool Changed = false;
6145 bool CFGChanged = false;
6146
6147 LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
6148 ++NumAllocasAnalyzed;
6149
6150 // Special case dead allocas, as they're trivial.
6151 if (AI.use_empty()) {
6152 AI.eraseFromParent();
6153 Changed = true;
6154 return {Changed, CFGChanged};
6155 }
6156 const DataLayout &DL = AI.getDataLayout();
6157
6158 // Skip alloca forms that this analysis can't handle.
6159 std::optional<TypeSize> Size = AI.getAllocationSize(DL);
6160 if (AI.isArrayAllocation() || !Size || Size->isScalable() || Size->isZero())
6161 return {Changed, CFGChanged};
6162
6163 // First, split any FCA loads and stores touching this alloca to promote
6164 // better splitting and promotion opportunities.
6165 IRBuilderTy IRB(&AI);
6166 AggLoadStoreRewriter AggRewriter(DL, IRB);
6167 Changed |= AggRewriter.rewrite(I&: AI);
6168
6169 // Build the slices using a recursive instruction-visiting builder.
6170 AllocaSlices AS(DL, AI);
6171 LLVM_DEBUG(AS.print(dbgs()));
6172 if (AS.isEscaped())
6173 return {Changed, CFGChanged};
6174
6175 if (AS.isEscapedReadOnly()) {
6176 Changed |= propagateStoredValuesToLoads(AI, AS);
6177 return {Changed, CFGChanged};
6178 }
6179
6180 // Delete all the dead users of this alloca before splitting and rewriting it.
6181 for (Instruction *DeadUser : AS.getDeadUsers()) {
6182 // Free up everything used by this instruction.
6183 for (Use &DeadOp : DeadUser->operands())
6184 clobberUse(U&: DeadOp);
6185
6186 // Now replace the uses of this instruction.
6187 DeadUser->replaceAllUsesWith(V: PoisonValue::get(T: DeadUser->getType()));
6188
6189 // And mark it for deletion.
6190 DeadInsts.push_back(Elt: DeadUser);
6191 Changed = true;
6192 }
6193 for (Use *DeadOp : AS.getDeadOperands()) {
6194 clobberUse(U&: *DeadOp);
6195 Changed = true;
6196 }
6197
6198 // No slices to split. Leave the dead alloca for a later pass to clean up.
6199 if (AS.begin() == AS.end())
6200 return {Changed, CFGChanged};
6201
6202 Changed |= splitAlloca(AI, AS);
6203
6204 LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
6205 while (!SpeculatablePHIs.empty())
6206 speculatePHINodeLoads(IRB, PN&: *SpeculatablePHIs.pop_back_val());
6207
6208 LLVM_DEBUG(dbgs() << " Rewriting Selects\n");
6209 auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
6210 while (!RemainingSelectsToRewrite.empty()) {
6211 const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
6212 CFGChanged |=
6213 rewriteSelectInstMemOps(SI&: *K, Ops: V, IRB, DTU: PreserveCFG ? nullptr : DTU);
6214 }
6215
6216 return {Changed, CFGChanged};
6217}
6218
6219/// Delete the dead instructions accumulated in this run.
6220///
6221/// Recursively deletes the dead instructions we've accumulated. This is done
6222/// at the very end to maximize locality of the recursive delete and to
6223/// minimize the problems of invalidated instruction pointers as such pointers
6224/// are used heavily in the intermediate stages of the algorithm.
6225///
6226/// We also record the alloca instructions deleted here so that they aren't
6227/// subsequently handed to mem2reg to promote.
6228bool SROA::deleteDeadInstructions(
6229 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
6230 bool Changed = false;
6231 while (!DeadInsts.empty()) {
6232 Instruction *I = dyn_cast_or_null<Instruction>(Val: DeadInsts.pop_back_val());
6233 if (!I)
6234 continue;
6235 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
6236
6237 // If the instruction is an alloca, find the possible dbg.declare connected
6238 // to it, and remove it too. We must do this before calling RAUW or we will
6239 // not be able to find it.
6240 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: I)) {
6241 DeletedAllocas.insert(Ptr: AI);
6242 for (DbgVariableRecord *OldDII : findDVRDeclares(V: AI))
6243 OldDII->eraseFromParent();
6244 }
6245
6246 at::deleteAssignmentMarkers(Inst: I);
6247 I->replaceAllUsesWith(V: UndefValue::get(T: I->getType()));
6248
6249 for (Use &Operand : I->operands())
6250 if (Instruction *U = dyn_cast<Instruction>(Val&: Operand)) {
6251 // Zero out the operand and see if it becomes trivially dead.
6252 Operand = nullptr;
6253 if (isInstructionTriviallyDead(I: U))
6254 DeadInsts.push_back(Elt: U);
6255 }
6256
6257 ++NumDeleted;
6258 I->eraseFromParent();
6259 Changed = true;
6260 }
6261 return Changed;
6262}
6263/// Promote the allocas, using the best available technique.
6264///
6265/// This attempts to promote whatever allocas have been identified as viable in
6266/// the PromotableAllocas list. If that list is empty, there is nothing to do.
6267/// This function returns whether any promotion occurred.
6268bool SROA::promoteAllocas() {
6269 if (PromotableAllocas.empty())
6270 return false;
6271
6272 if (SROASkipMem2Reg) {
6273 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
6274 } else {
6275 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
6276 NumPromoted += PromotableAllocas.size();
6277 PromoteMemToReg(Allocas: PromotableAllocas.getArrayRef(), DT&: DTU->getDomTree(), AC);
6278 }
6279
6280 PromotableAllocas.clear();
6281 return true;
6282}
6283
6284std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
6285 LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
6286
6287 const DataLayout &DL = F.getDataLayout();
6288 BasicBlock &EntryBB = F.getEntryBlock();
6289 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(x: EntryBB.end());
6290 I != E; ++I) {
6291 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val&: I)) {
6292 std::optional<TypeSize> Size = AI->getAllocationSize(DL);
6293 if (Size && Size->isScalable() && isAllocaPromotable(AI))
6294 PromotableAllocas.insert(X: AI);
6295 else
6296 Worklist.insert(X: AI);
6297 }
6298 }
6299
6300 bool Changed = false;
6301 bool CFGChanged = false;
6302 // A set of deleted alloca instruction pointers which should be removed from
6303 // the list of promotable allocas.
6304 SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
6305
6306 do {
6307 while (!Worklist.empty()) {
6308 auto [IterationChanged, IterationCFGChanged] =
6309 runOnAlloca(AI&: *Worklist.pop_back_val());
6310 Changed |= IterationChanged;
6311 CFGChanged |= IterationCFGChanged;
6312
6313 Changed |= deleteDeadInstructions(DeletedAllocas);
6314
6315 // Remove the deleted allocas from various lists so that we don't try to
6316 // continue processing them.
6317 if (!DeletedAllocas.empty()) {
6318 Worklist.set_subtract(DeletedAllocas);
6319 PostPromotionWorklist.set_subtract(DeletedAllocas);
6320 PromotableAllocas.set_subtract(DeletedAllocas);
6321 DeletedAllocas.clear();
6322 }
6323 }
6324
6325 Changed |= promoteAllocas();
6326
6327 Worklist = PostPromotionWorklist;
6328 PostPromotionWorklist.clear();
6329 } while (!Worklist.empty());
6330
6331 assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
6332 assert((!CFGChanged || !PreserveCFG) &&
6333 "Should not have modified the CFG when told to preserve it.");
6334
6335 if (Changed && isAssignmentTrackingEnabled(M: *F.getParent())) {
6336 for (auto &BB : F) {
6337 RemoveRedundantDbgInstrs(BB: &BB);
6338 }
6339 }
6340
6341 return {Changed, CFGChanged};
6342}
6343
6344PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
6345 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
6346 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
6347 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6348 auto [Changed, CFGChanged] =
6349 SROA(&F.getContext(), &DTU, &AC, Options).runSROA(F);
6350 if (!Changed)
6351 return PreservedAnalyses::all();
6352 PreservedAnalyses PA;
6353 if (!CFGChanged)
6354 PA.preserveSet<CFGAnalyses>();
6355 PA.preserve<DominatorTreeAnalysis>();
6356 return PA;
6357}
6358
6359void SROAPass::printPipeline(
6360 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
6361 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
6362 OS, MapClassName2PassName);
6363 OS << '<'
6364 << (Options.CFG == SROAOptions::PreserveCFG ? "preserve-cfg"
6365 : "modify-cfg");
6366 if (Options.AggregateToVector)
6367 OS << ";aggregate-to-vector";
6368 OS << '>';
6369}
6370
6371SROAPass::SROAPass(SROAOptions Options) : Options(Options) {}
6372
6373namespace {
6374
6375/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
6376class SROALegacyPass : public FunctionPass {
6377 SROAOptions Options;
6378
6379public:
6380 static char ID;
6381
6382 SROALegacyPass(SROAOptions Options = SROAOptions::PreserveCFG)
6383 : FunctionPass(ID), Options(Options) {
6384 initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
6385 }
6386
6387 bool runOnFunction(Function &F) override {
6388 if (skipFunction(F))
6389 return false;
6390
6391 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
6392 AssumptionCache &AC =
6393 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
6394 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
6395 auto [Changed, _] = SROA(&F.getContext(), &DTU, &AC, Options).runSROA(F);
6396 return Changed;
6397 }
6398
6399 void getAnalysisUsage(AnalysisUsage &AU) const override {
6400 AU.addRequired<AssumptionCacheTracker>();
6401 AU.addRequired<DominatorTreeWrapperPass>();
6402 AU.addPreserved<GlobalsAAWrapperPass>();
6403 AU.addPreserved<DominatorTreeWrapperPass>();
6404 }
6405
6406 StringRef getPassName() const override { return "SROA"; }
6407};
6408
6409} // end anonymous namespace
6410
6411char SROALegacyPass::ID = 0;
6412
6413FunctionPass *llvm::createSROAPass(bool PreserveCFG, bool AggregateToVector) {
6414 return new SROALegacyPass(SROAOptions(PreserveCFG ? SROAOptions::PreserveCFG
6415 : SROAOptions::ModifyCFG,
6416 AggregateToVector));
6417}
6418
6419INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
6420 "Scalar Replacement Of Aggregates", false, false)
6421INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6422INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6423INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
6424 false, false)
6425