1//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass munges the code in the input function to better prepare it for
// SelectionDAG-based code generation. This works around limitations in its
11// basic-block-at-a-time approach. It should eventually be removed.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/CodeGen/CodeGenPrepare.h"
16#include "llvm/ADT/APInt.h"
17#include "llvm/ADT/ArrayRef.h"
18#include "llvm/ADT/DenseMap.h"
19#include "llvm/ADT/MapVector.h"
20#include "llvm/ADT/PointerIntPair.h"
21#include "llvm/ADT/STLExtras.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/BlockFrequencyInfo.h"
26#include "llvm/Analysis/BranchProbabilityInfo.h"
27#include "llvm/Analysis/DomTreeUpdater.h"
28#include "llvm/Analysis/FloatingPointPredicateUtils.h"
29#include "llvm/Analysis/InstructionSimplify.h"
30#include "llvm/Analysis/LoopInfo.h"
31#include "llvm/Analysis/ProfileSummaryInfo.h"
32#include "llvm/Analysis/ScalarEvolutionExpressions.h"
33#include "llvm/Analysis/TargetLibraryInfo.h"
34#include "llvm/Analysis/TargetTransformInfo.h"
35#include "llvm/Analysis/ValueTracking.h"
36#include "llvm/Analysis/VectorUtils.h"
37#include "llvm/CodeGen/Analysis.h"
38#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h"
39#include "llvm/CodeGen/ISDOpcodes.h"
40#include "llvm/CodeGen/SelectionDAGNodes.h"
41#include "llvm/CodeGen/TargetLowering.h"
42#include "llvm/CodeGen/TargetPassConfig.h"
43#include "llvm/CodeGen/TargetSubtargetInfo.h"
44#include "llvm/CodeGen/ValueTypes.h"
45#include "llvm/CodeGenTypes/MachineValueType.h"
46#include "llvm/Config/llvm-config.h"
47#include "llvm/IR/Argument.h"
48#include "llvm/IR/Attributes.h"
49#include "llvm/IR/BasicBlock.h"
50#include "llvm/IR/CFG.h"
51#include "llvm/IR/Constant.h"
52#include "llvm/IR/Constants.h"
53#include "llvm/IR/DataLayout.h"
54#include "llvm/IR/DebugInfo.h"
55#include "llvm/IR/DerivedTypes.h"
56#include "llvm/IR/Dominators.h"
57#include "llvm/IR/Function.h"
58#include "llvm/IR/GetElementPtrTypeIterator.h"
59#include "llvm/IR/GlobalValue.h"
60#include "llvm/IR/GlobalVariable.h"
61#include "llvm/IR/IRBuilder.h"
62#include "llvm/IR/InlineAsm.h"
63#include "llvm/IR/InstrTypes.h"
64#include "llvm/IR/Instruction.h"
65#include "llvm/IR/Instructions.h"
66#include "llvm/IR/IntrinsicInst.h"
67#include "llvm/IR/Intrinsics.h"
68#include "llvm/IR/IntrinsicsAArch64.h"
69#include "llvm/IR/LLVMContext.h"
70#include "llvm/IR/MDBuilder.h"
71#include "llvm/IR/Module.h"
72#include "llvm/IR/Operator.h"
73#include "llvm/IR/PatternMatch.h"
74#include "llvm/IR/ProfDataUtils.h"
75#include "llvm/IR/Statepoint.h"
76#include "llvm/IR/Type.h"
77#include "llvm/IR/Use.h"
78#include "llvm/IR/User.h"
79#include "llvm/IR/Value.h"
80#include "llvm/IR/ValueHandle.h"
81#include "llvm/IR/ValueMap.h"
82#include "llvm/InitializePasses.h"
83#include "llvm/Pass.h"
84#include "llvm/Support/BlockFrequency.h"
85#include "llvm/Support/BranchProbability.h"
86#include "llvm/Support/Casting.h"
87#include "llvm/Support/CommandLine.h"
88#include "llvm/Support/Compiler.h"
89#include "llvm/Support/Debug.h"
90#include "llvm/Support/ErrorHandling.h"
91#include "llvm/Support/raw_ostream.h"
92#include "llvm/Target/TargetMachine.h"
93#include "llvm/Target/TargetOptions.h"
94#include "llvm/Transforms/Utils/BasicBlockUtils.h"
95#include "llvm/Transforms/Utils/BypassSlowDivision.h"
96#include "llvm/Transforms/Utils/Local.h"
97#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
98#include "llvm/Transforms/Utils/SizeOpts.h"
99#include <algorithm>
100#include <cassert>
101#include <cstdint>
102#include <iterator>
103#include <limits>
104#include <memory>
105#include <optional>
106#include <utility>
107#include <vector>
108
109using namespace llvm;
110using namespace llvm::PatternMatch;
111
112#define DEBUG_TYPE "codegenprepare"
113
114STATISTIC(NumBlocksElim, "Number of blocks eliminated");
115STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated");
116STATISTIC(NumGEPsElim, "Number of GEPs converted to casts");
117STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
118 "sunken Cmps");
119STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
120 "of sunken Casts");
121STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
122 "computations were sunk");
123STATISTIC(NumMemoryInstsPhiCreated,
124 "Number of phis created when address "
125 "computations were sunk to memory instructions");
126STATISTIC(NumMemoryInstsSelectCreated,
127 "Number of select created when address "
128 "computations were sunk to memory instructions");
129STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
130STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
131STATISTIC(NumAndsAdded,
132 "Number of and mask instructions added to form ext loads");
133STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
134STATISTIC(NumRetsDup, "Number of return instructions duplicated");
135STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
136STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
137STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
138
139static cl::opt<bool> DisableBranchOpts(
140 "disable-cgp-branch-opts", cl::Hidden, cl::init(Val: false),
141 cl::desc("Disable branch optimizations in CodeGenPrepare"));
142
143static cl::opt<bool>
144 DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(Val: false),
145 cl::desc("Disable GC optimizations in CodeGenPrepare"));
146
147static cl::opt<bool>
148 DisableSelectToBranch("disable-cgp-select2branch", cl::Hidden,
149 cl::init(Val: false),
150 cl::desc("Disable select to branch conversion."));
151
152static cl::opt<bool>
153 AddrSinkUsingGEPs("addr-sink-using-gep", cl::Hidden, cl::init(Val: true),
154 cl::desc("Address sinking in CGP using GEPs."));
155
156static cl::opt<bool>
157 EnableAndCmpSinking("enable-andcmp-sinking", cl::Hidden, cl::init(Val: true),
158 cl::desc("Enable sinking and/cmp into branches."));
159
160static cl::opt<bool> DisableStoreExtract(
161 "disable-cgp-store-extract", cl::Hidden, cl::init(Val: false),
162 cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));
163
164static cl::opt<bool> StressStoreExtract(
165 "stress-cgp-store-extract", cl::Hidden, cl::init(Val: false),
166 cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
167
168static cl::opt<bool> DisableExtLdPromotion(
169 "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(Val: false),
170 cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
171 "CodeGenPrepare"));
172
173static cl::opt<bool> StressExtLdPromotion(
174 "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(Val: false),
175 cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
176 "optimization in CodeGenPrepare"));
177
178static cl::opt<bool> DisablePreheaderProtect(
179 "disable-preheader-prot", cl::Hidden, cl::init(Val: false),
180 cl::desc("Disable protection against removing loop preheaders"));
181
182static cl::opt<bool> ProfileGuidedSectionPrefix(
183 "profile-guided-section-prefix", cl::Hidden, cl::init(Val: true),
184 cl::desc("Use profile info to add section prefix for hot/cold functions"));
185
186static cl::opt<bool> ProfileUnknownInSpecialSection(
187 "profile-unknown-in-special-section", cl::Hidden,
188 cl::desc("In profiling mode like sampleFDO, if a function doesn't have "
189 "profile, we cannot tell the function is cold for sure because "
190 "it may be a function newly added without ever being sampled. "
191 "With the flag enabled, compiler can put such profile unknown "
192 "functions into a special section, so runtime system can choose "
193 "to handle it in a different way than .text section, to save "
194 "RAM for example. "));
195
196static cl::opt<bool> BBSectionsGuidedSectionPrefix(
197 "bbsections-guided-section-prefix", cl::Hidden, cl::init(Val: true),
198 cl::desc("Use the basic-block-sections profile to determine the text "
199 "section prefix for hot functions. Functions with "
200 "basic-block-sections profile will be placed in `.text.hot` "
201 "regardless of their FDO profile info. Other functions won't be "
202 "impacted, i.e., their prefixes will be decided by FDO/sampleFDO "
203 "profiles."));
204
205static cl::opt<uint64_t> FreqRatioToSkipMerge(
206 "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(Val: 2),
207 cl::desc("Skip merging empty blocks if (frequency of empty block) / "
208 "(frequency of destination block) is greater than this ratio"));
209
210static cl::opt<bool> ForceSplitStore(
211 "force-split-store", cl::Hidden, cl::init(Val: false),
212 cl::desc("Force store splitting no matter what the target query says."));
213
214static cl::opt<bool> EnableTypePromotionMerge(
215 "cgp-type-promotion-merge", cl::Hidden,
216 cl::desc("Enable merging of redundant sexts when one is dominating"
217 " the other."),
218 cl::init(Val: true));
219
220static cl::opt<bool> DisableComplexAddrModes(
221 "disable-complex-addr-modes", cl::Hidden, cl::init(Val: false),
222 cl::desc("Disables combining addressing modes with different parts "
223 "in optimizeMemoryInst."));
224
225static cl::opt<bool>
226 AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(Val: false),
227 cl::desc("Allow creation of Phis in Address sinking."));
228
229static cl::opt<bool> AddrSinkNewSelects(
230 "addr-sink-new-select", cl::Hidden, cl::init(Val: true),
231 cl::desc("Allow creation of selects in Address sinking."));
232
233static cl::opt<bool> AddrSinkCombineBaseReg(
234 "addr-sink-combine-base-reg", cl::Hidden, cl::init(Val: true),
235 cl::desc("Allow combining of BaseReg field in Address sinking."));
236
237static cl::opt<bool> AddrSinkCombineBaseGV(
238 "addr-sink-combine-base-gv", cl::Hidden, cl::init(Val: true),
239 cl::desc("Allow combining of BaseGV field in Address sinking."));
240
241static cl::opt<bool> AddrSinkCombineBaseOffs(
242 "addr-sink-combine-base-offs", cl::Hidden, cl::init(Val: true),
243 cl::desc("Allow combining of BaseOffs field in Address sinking."));
244
245static cl::opt<bool> AddrSinkCombineScaledReg(
246 "addr-sink-combine-scaled-reg", cl::Hidden, cl::init(Val: true),
247 cl::desc("Allow combining of ScaledReg field in Address sinking."));
248
249static cl::opt<bool>
250 EnableGEPOffsetSplit("cgp-split-large-offset-gep", cl::Hidden,
251 cl::init(Val: true),
252 cl::desc("Enable splitting large offset of GEP."));
253
254static cl::opt<bool> EnableICMP_EQToICMP_ST(
255 "cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(Val: false),
256 cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion."));
257
258static cl::opt<bool>
259 VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(Val: false),
260 cl::desc("Enable BFI update verification for "
261 "CodeGenPrepare."));
262
263static cl::opt<bool>
264 OptimizePhiTypes("cgp-optimize-phi-types", cl::Hidden, cl::init(Val: true),
265 cl::desc("Enable converting phi types in CodeGenPrepare"));
266
267static cl::opt<unsigned>
268 HugeFuncThresholdInCGPP("cgpp-huge-func", cl::init(Val: 10000), cl::Hidden,
269 cl::desc("Least BB number of huge function."));
270
271static cl::opt<unsigned>
272 MaxAddressUsersToScan("cgp-max-address-users-to-scan", cl::init(Val: 100),
273 cl::Hidden,
274 cl::desc("Max number of address users to look at"));
275
276static cl::opt<bool>
277 DisableDeletePHIs("disable-cgp-delete-phis", cl::Hidden, cl::init(Val: false),
278 cl::desc("Disable elimination of dead PHI nodes."));
279
280namespace {
281
/// Kind(s) of extension that have been seen; stored alongside a type in
/// TypeIsSExt.
enum ExtType {
  ZeroExtension, // Zero extension has been seen.
  SignExtension, // Sign extension has been seen.
  BothExtension  // This extension type is used if we saw sext after
                 // ZeroExtension had been set, or if we saw zext after
                 // SignExtension had been set. It makes the type
                 // information of a promoted instruction invalid.
};
290
/// Reported by the optimize* routines through their ModifyDT out-parameter to
/// tell the caller what kind of structure a transform changed.
enum ModifyDT {
  NotModifyDT, // Not Modify any DT.
  ModifyBBDT,  // Modify the Basic Block Dominator Tree.
  ModifyInstDT // Modify the Instruction Dominator in a Basic Block,
               // This usually means we move/delete/insert instruction
               // in a Basic Block. So we should re-iterate instructions
               // in such Basic Block.
};
299
// Small set of instruction pointers.
using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
// A type pointer packed together with a 2-bit ExtType.
using TypeIsSExt = PointerIntPair<Type *, 2, ExtType>;
// Maps an instruction to its recorded (type, extension kind) pair.
using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>;
// List of instructions; used as the mapped type of ValueToSExts.
using SExts = SmallVector<Instruction *, 16>;
// Insertion-ordered map from a value to its list of instructions.
using ValueToSExts = MapVector<Value *, SExts>;

// Forward declaration so that CodeGenPrepare member signatures can refer to
// the transaction type.
class TypePromotionTransaction;
307
308class CodeGenPrepare {
309 friend class CodeGenPrepareLegacyPass;
310 const TargetMachine *TM = nullptr;
311 const TargetSubtargetInfo *SubtargetInfo = nullptr;
312 const TargetLowering *TLI = nullptr;
313 const TargetRegisterInfo *TRI = nullptr;
314 const TargetTransformInfo *TTI = nullptr;
315 const BasicBlockSectionsProfileReader *BBSectionsProfileReader = nullptr;
316 const TargetLibraryInfo *TLInfo = nullptr;
317 DomTreeUpdater *DTU = nullptr;
318 LoopInfo *LI = nullptr;
319 BlockFrequencyInfo *BFI;
320 BranchProbabilityInfo *BPI;
321 ProfileSummaryInfo *PSI = nullptr;
322
323 /// As we scan instructions optimizing them, this is the next instruction
324 /// to optimize. Transforms that can invalidate this should update it.
325 BasicBlock::iterator CurInstIterator;
326
327 /// Keeps track of non-local addresses that have been sunk into a block.
328 /// This allows us to avoid inserting duplicate code for blocks with
329 /// multiple load/stores of the same address. The usage of WeakTrackingVH
330 /// enables SunkAddrs to be treated as a cache whose entries can be
331 /// invalidated if a sunken address computation has been erased.
332 ValueMap<Value *, WeakTrackingVH> SunkAddrs;
333
334 /// Keeps track of all instructions inserted for the current function.
335 SetOfInstrs InsertedInsts;
336
337 /// Keeps track of the type of the related instruction before their
338 /// promotion for the current function.
339 InstrToOrigTy PromotedInsts;
340
341 /// Keep track of instructions removed during promotion.
342 SetOfInstrs RemovedInsts;
343
344 /// Keep track of sext chains based on their initial value.
345 DenseMap<Value *, Instruction *> SeenChainsForSExt;
346
347 /// Keep track of GEPs accessing the same data structures such as structs or
348 /// arrays that are candidates to be split later because of their large
349 /// size.
350 MapVector<AssertingVH<Value>,
351 SmallVector<std::pair<AssertingVH<GetElementPtrInst>, int64_t>, 32>>
352 LargeOffsetGEPMap;
353
354 /// Keep track of new GEP base after splitting the GEPs having large offset.
355 SmallSet<AssertingVH<Value>, 2> NewGEPBases;
356
357 /// Map serial numbers to Large offset GEPs.
358 DenseMap<AssertingVH<GetElementPtrInst>, int> LargeOffsetGEPID;
359
360 /// Keep track of SExt promoted.
361 ValueToSExts ValToSExtendedUses;
362
363 /// True if the function has the OptSize attribute.
364 bool OptSize;
365
366 /// DataLayout for the Function being processed.
367 const DataLayout *DL = nullptr;
368
369public:
370 CodeGenPrepare() = default;
371 CodeGenPrepare(const TargetMachine *TM) : TM(TM){};
372 /// If encounter huge function, we need to limit the build time.
373 bool IsHugeFunc = false;
374
375 /// FreshBBs is like worklist, it collected the updated BBs which need
376 /// to be optimized again.
377 /// Note: Consider building time in this pass, when a BB updated, we need
378 /// to insert such BB into FreshBBs for huge function.
379 SmallPtrSet<BasicBlock *, 32> FreshBBs;
380
381 void releaseMemory() {
382 // Clear per function information.
383 InsertedInsts.clear();
384 PromotedInsts.clear();
385 FreshBBs.clear();
386 }
387
388 bool run(Function &F, FunctionAnalysisManager &AM);
389
390private:
391 template <typename F>
392 void resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) {
393 // Substituting can cause recursive simplifications, which can invalidate
394 // our iterator. Use a WeakTrackingVH to hold onto it in case this
395 // happens.
396 Value *CurValue = &*CurInstIterator;
397 WeakTrackingVH IterHandle(CurValue);
398
399 f();
400
401 // If the iterator instruction was recursively deleted, start over at the
402 // start of the block.
403 if (IterHandle != CurValue) {
404 CurInstIterator = BB->begin();
405 SunkAddrs.clear();
406 }
407 }
408
409 // Get the DominatorTree, updating it if necessary.
410 DominatorTree &getDT() { return DTU->getDomTree(); }
411
412 void removeAllAssertingVHReferences(Value *V);
413 bool eliminateAssumptions(Function &F);
414 bool eliminateFallThrough(Function &F);
415 bool eliminateMostlyEmptyBlocks(Function &F, bool &ResetLI);
416 BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
417 bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
418 bool eliminateMostlyEmptyBlock(BasicBlock *BB);
419 bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
420 bool isPreheader);
421 bool makeBitReverse(Instruction &I);
422 bool optimizeBlock(BasicBlock &BB, ModifyDT &ModifiedDT);
423 bool optimizeInst(Instruction *I, ModifyDT &ModifiedDT);
424 bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy,
425 unsigned AddrSpace);
426 bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
427 bool optimizeMulWithOverflow(Instruction *I, bool IsSigned,
428 ModifyDT &ModifiedDT);
429 bool optimizeInlineAsmInst(CallInst *CS);
430 bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT);
431 bool optimizeExt(Instruction *&I);
432 bool optimizeExtUses(Instruction *I);
433 bool optimizeLoadExt(LoadInst *Load);
434 bool optimizeShiftInst(BinaryOperator *BO);
435 bool optimizeFunnelShift(IntrinsicInst *Fsh);
436 bool optimizeSelectInst(SelectInst *SI);
437 bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
438 bool optimizeSwitchType(SwitchInst *SI);
439 bool optimizeSwitchPhiConstants(SwitchInst *SI);
440 bool optimizeSwitchInst(SwitchInst *SI);
441 bool optimizeExtractElementInst(Instruction *Inst);
442 bool dupRetToEnableTailCallOpts(BasicBlock *BB, ModifyDT &ModifiedDT);
443 bool fixupDbgVariableRecord(DbgVariableRecord &I);
444 bool fixupDbgVariableRecordsOnInst(Instruction &I);
445 bool placeDbgValues(Function &F);
446 bool placePseudoProbes(Function &F);
447 bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
448 LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
449 bool tryToPromoteExts(TypePromotionTransaction &TPT,
450 const SmallVectorImpl<Instruction *> &Exts,
451 SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
452 unsigned CreatedInstsCost = 0);
453 bool mergeSExts(Function &F);
454 bool splitLargeGEPOffsets();
455 bool optimizePhiType(PHINode *Inst, SmallPtrSetImpl<PHINode *> &Visited,
456 SmallPtrSetImpl<Instruction *> &DeletedInstrs);
457 bool optimizePhiTypes(Function &F);
458 bool performAddressTypePromotion(
459 Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
460 bool HasPromoted, TypePromotionTransaction &TPT,
461 SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
462 bool splitBranchCondition(Function &F);
463 bool simplifyOffsetableRelocate(GCStatepointInst &I);
464
465 bool tryToSinkFreeOperands(Instruction *I);
466 bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, Value *Arg1,
467 CmpInst *Cmp, Intrinsic::ID IID);
468 bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT);
469 bool optimizeURem(Instruction *Rem);
470 bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
471 bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
472 bool unfoldPowerOf2Test(CmpInst *Cmp);
473 void verifyBFIUpdates(Function &F);
474 bool _run(Function &F);
475};
476
477class CodeGenPrepareLegacyPass : public FunctionPass {
478public:
479 static char ID; // Pass identification, replacement for typeid
480
481 CodeGenPrepareLegacyPass() : FunctionPass(ID) {}
482
483 bool runOnFunction(Function &F) override;
484
485 StringRef getPassName() const override { return "CodeGen Prepare"; }
486
487 void getAnalysisUsage(AnalysisUsage &AU) const override {
488 // FIXME: When we can selectively preserve passes, preserve the domtree.
489 AU.addRequired<ProfileSummaryInfoWrapperPass>();
490 AU.addRequired<TargetLibraryInfoWrapperPass>();
491 AU.addRequired<TargetPassConfig>();
492 AU.addRequired<TargetTransformInfoWrapperPass>();
493 AU.addRequired<DominatorTreeWrapperPass>();
494 AU.addRequired<LoopInfoWrapperPass>();
495 AU.addRequired<BranchProbabilityInfoWrapperPass>();
496 AU.addRequired<BlockFrequencyInfoWrapperPass>();
497 AU.addUsedIfAvailable<BasicBlockSectionsProfileReaderWrapperPass>();
498 }
499};
500
501} // end anonymous namespace
502
503char CodeGenPrepareLegacyPass::ID = 0;
504
505bool CodeGenPrepareLegacyPass::runOnFunction(Function &F) {
506 if (skipFunction(F))
507 return false;
508 auto TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
509 CodeGenPrepare CGP(TM);
510 CGP.DL = &F.getDataLayout();
511 CGP.SubtargetInfo = TM->getSubtargetImpl(F);
512 CGP.TLI = CGP.SubtargetInfo->getTargetLowering();
513 CGP.TRI = CGP.SubtargetInfo->getRegisterInfo();
514 CGP.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
515 CGP.TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
516 CGP.LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
517 CGP.BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
518 CGP.BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
519 CGP.PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
520 auto BBSPRWP =
521 getAnalysisIfAvailable<BasicBlockSectionsProfileReaderWrapperPass>();
522 CGP.BBSectionsProfileReader = BBSPRWP ? &BBSPRWP->getBBSPR() : nullptr;
523 DomTreeUpdater DTUpdater(
524 &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
525 DomTreeUpdater::UpdateStrategy::Lazy);
526 CGP.DTU = &DTUpdater;
527
528 return CGP._run(F);
529}
530
// Legacy pass registration for CodeGenPrepareLegacyPass under the name
// DEBUG_TYPE, together with the wrapper passes it lists as dependencies.
INITIALIZE_PASS_BEGIN(CodeGenPrepareLegacyPass, DEBUG_TYPE,
                      "Optimize for code generation", false, false)
INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReaderWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(CodeGenPrepareLegacyPass, DEBUG_TYPE,
                    "Optimize for code generation", false, false)
542
543FunctionPass *llvm::createCodeGenPrepareLegacyPass() {
544 return new CodeGenPrepareLegacyPass();
545}
546
547PreservedAnalyses CodeGenPreparePass::run(Function &F,
548 FunctionAnalysisManager &AM) {
549 CodeGenPrepare CGP(TM);
550
551 bool Changed = CGP.run(F, AM);
552 if (!Changed)
553 return PreservedAnalyses::all();
554
555 PreservedAnalyses PA;
556 PA.preserve<TargetLibraryAnalysis>();
557 PA.preserve<TargetIRAnalysis>();
558 return PA;
559}
560
561bool CodeGenPrepare::run(Function &F, FunctionAnalysisManager &AM) {
562 DL = &F.getDataLayout();
563 SubtargetInfo = TM->getSubtargetImpl(F);
564 TLI = SubtargetInfo->getTargetLowering();
565 TRI = SubtargetInfo->getRegisterInfo();
566 TLInfo = &AM.getResult<TargetLibraryAnalysis>(IR&: F);
567 TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
568 LI = &AM.getResult<LoopAnalysis>(IR&: F);
569 BPI = &AM.getResult<BranchProbabilityAnalysis>(IR&: F);
570 BFI = &AM.getResult<BlockFrequencyAnalysis>(IR&: F);
571 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
572 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
573 if (!PSI)
574 reportFatalUsageError(reason: "this pass requires the profile-summary module "
575 "analysis to be available");
576 BBSectionsProfileReader =
577 AM.getCachedResult<BasicBlockSectionsProfileReaderAnalysis>(IR&: F);
578 DomTreeUpdater DTUpdater(&AM.getResult<DominatorTreeAnalysis>(IR&: F),
579 DomTreeUpdater::UpdateStrategy::Lazy);
580 DTU = &DTUpdater;
581 return _run(F);
582}
583
584bool CodeGenPrepare::_run(Function &F) {
585 bool EverMadeChange = false;
586
587 OptSize = F.hasOptSize();
588 // Use the basic-block-sections profile to promote hot functions to .text.hot
589 // if requested.
590 if (BBSectionsGuidedSectionPrefix && BBSectionsProfileReader &&
591 BBSectionsProfileReader->isFunctionHot(FuncName: F.getName())) {
592 (void)F.setSectionPrefix("hot");
593 } else if (ProfileGuidedSectionPrefix) {
594 // The hot attribute overwrites profile count based hotness while profile
595 // counts based hotness overwrite the cold attribute.
596 // This is a conservative behabvior.
597 if (F.hasFnAttribute(Kind: Attribute::Hot) ||
598 PSI->isFunctionHotInCallGraph(F: &F, BFI&: *BFI))
599 (void)F.setSectionPrefix("hot");
600 // If PSI shows this function is not hot, we will placed the function
601 // into unlikely section if (1) PSI shows this is a cold function, or
602 // (2) the function has a attribute of cold.
603 else if (PSI->isFunctionColdInCallGraph(F: &F, BFI&: *BFI) ||
604 F.hasFnAttribute(Kind: Attribute::Cold))
605 (void)F.setSectionPrefix("unlikely");
606 else if (ProfileUnknownInSpecialSection && PSI->hasPartialSampleProfile() &&
607 PSI->isFunctionHotnessUnknown(F))
608 (void)F.setSectionPrefix("unknown");
609 }
610
611 /// This optimization identifies DIV instructions that can be
612 /// profitably bypassed and carried out with a shorter, faster divide.
613 if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI->isSlowDivBypassed()) {
614 const DenseMap<unsigned int, unsigned int> &BypassWidths =
615 TLI->getBypassSlowDivWidths();
616 BasicBlock *BB = &*F.begin();
617 while (BB != nullptr) {
618 // bypassSlowDivision may create new BBs, but we don't want to reapply the
619 // optimization to those blocks.
620 BasicBlock *Next = BB->getNextNode();
621 if (!llvm::shouldOptimizeForSize(BB, PSI, BFI))
622 EverMadeChange |= bypassSlowDivision(BB, BypassWidth: BypassWidths, DTU, LI);
623 BB = Next;
624 }
625 }
626
627 // Get rid of @llvm.assume builtins before attempting to eliminate empty
628 // blocks, since there might be blocks that only contain @llvm.assume calls
629 // (plus arguments that we can get rid of).
630 EverMadeChange |= eliminateAssumptions(F);
631
632 auto resetLoopInfo = [this]() {
633 LI->releaseMemory();
634 LI->analyze(DomTree: DTU->getDomTree());
635 };
636
637 // Eliminate blocks that contain only PHI nodes and an
638 // unconditional branch.
639 bool ResetLI = false;
640 EverMadeChange |= eliminateMostlyEmptyBlocks(F, ResetLI);
641 if (ResetLI)
642 resetLoopInfo();
643
644 if (!DisableBranchOpts)
645 EverMadeChange |= splitBranchCondition(F);
646
647 // Split some critical edges where one of the sources is an indirect branch,
648 // to help generate sane code for PHIs involving such edges.
649 bool Split = SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/true,
650 BPI, BFI, DTU);
651 EverMadeChange |= Split;
652 if (Split)
653 resetLoopInfo();
654
655#ifndef NDEBUG
656 if (VerifyDomInfo)
657 assert(getDT().verify(DominatorTree::VerificationLevel::Fast) &&
658 "Incorrect DominatorTree updates in CGP");
659
660 if (VerifyLoopInfo)
661 LI->verify(getDT());
662#endif
663
664 // If we are optimzing huge function, we need to consider the build time.
665 // Because the basic algorithm's complex is near O(N!).
666 IsHugeFunc = F.size() > HugeFuncThresholdInCGPP;
667
668 bool MadeChange = true;
669 bool FuncIterated = false;
670 while (MadeChange) {
671 MadeChange = false;
672
673 // This is required because optimizeBlock() calls getDT() inside the loop
674 // below, which flushes pending updates and may delete dead blocks, leading
675 // to iterator invalidation.
676 DTU->flush();
677
678 for (BasicBlock &BB : llvm::make_early_inc_range(Range&: F)) {
679 if (FuncIterated && !FreshBBs.contains(Ptr: &BB))
680 continue;
681
682 ModifyDT ModifiedDTOnIteration = ModifyDT::NotModifyDT;
683 bool Changed = optimizeBlock(BB, ModifiedDT&: ModifiedDTOnIteration);
684
685 MadeChange |= Changed;
686 if (IsHugeFunc) {
687 // If the BB is updated, it may still has chance to be optimized.
688 // This usually happen at sink optimization.
689 // For example:
690 //
691 // bb0:
692 // %and = and i32 %a, 4
693 // %cmp = icmp eq i32 %and, 0
694 //
695 // If the %cmp sink to other BB, the %and will has chance to sink.
696 if (Changed)
697 FreshBBs.insert(Ptr: &BB);
698 else if (FuncIterated)
699 FreshBBs.erase(Ptr: &BB);
700 } else {
701 // For small/normal functions, we restart BB iteration if the dominator
702 // tree of the Function was changed.
703 if (ModifiedDTOnIteration != ModifyDT::NotModifyDT)
704 break;
705 }
706 }
707 // We have iterated all the BB in the (only work for huge) function.
708 FuncIterated = IsHugeFunc;
709
710 if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
711 MadeChange |= mergeSExts(F);
712 if (!LargeOffsetGEPMap.empty())
713 MadeChange |= splitLargeGEPOffsets();
714 MadeChange |= optimizePhiTypes(F);
715
716 if (MadeChange)
717 eliminateFallThrough(F);
718
719#ifndef NDEBUG
720 if (VerifyDomInfo)
721 assert(getDT().verify(DominatorTree::VerificationLevel::Fast) &&
722 "Incorrect DominatorTree updates in CGP");
723
724 if (VerifyLoopInfo)
725 LI->verify(getDT());
726#endif
727
728 // Really free removed instructions during promotion.
729 for (Instruction *I : RemovedInsts)
730 I->deleteValue();
731
732 EverMadeChange |= MadeChange;
733 SeenChainsForSExt.clear();
734 ValToSExtendedUses.clear();
735 RemovedInsts.clear();
736 LargeOffsetGEPMap.clear();
737 LargeOffsetGEPID.clear();
738 }
739
740 NewGEPBases.clear();
741 SunkAddrs.clear();
742
743 // LoopInfo is not needed anymore and ConstantFoldTerminator can break it.
744 LI = nullptr;
745
746 if (!DisableBranchOpts) {
747 MadeChange = false;
748 // Use a set vector to get deterministic iteration order. The order the
749 // blocks are removed may affect whether or not PHI nodes in successors
750 // are removed.
751 SmallSetVector<BasicBlock *, 8> WorkList;
752 for (BasicBlock &BB : F) {
753 SmallVector<BasicBlock *, 2> Successors(successors(BB: &BB));
754 MadeChange |= ConstantFoldTerminator(BB: &BB, DeleteDeadConditions: true, TLI: nullptr, DTU);
755 if (!MadeChange)
756 continue;
757
758 for (BasicBlock *Succ : Successors)
759 if (pred_empty(BB: Succ))
760 WorkList.insert(X: Succ);
761 }
762
763 // Delete the dead blocks and any of their dead successors.
764 MadeChange |= !WorkList.empty();
765 while (!WorkList.empty()) {
766 BasicBlock *BB = WorkList.pop_back_val();
767 SmallVector<BasicBlock *, 2> Successors(successors(BB));
768
769 DeleteDeadBlock(BB, DTU);
770
771 for (BasicBlock *Succ : Successors)
772 if (pred_empty(BB: Succ))
773 WorkList.insert(X: Succ);
774 }
775
776 // Flush pending DT updates in order to finalise deletion of dead blocks.
777 DTU->flush();
778
779 // Merge pairs of basic blocks with unconditional branches, connected by
780 // a single edge.
781 if (EverMadeChange || MadeChange)
782 MadeChange |= eliminateFallThrough(F);
783
784 EverMadeChange |= MadeChange;
785 }
786
787 if (!DisableGCOpts) {
788 SmallVector<GCStatepointInst *, 2> Statepoints;
789 for (BasicBlock &BB : F)
790 for (Instruction &I : BB)
791 if (auto *SP = dyn_cast<GCStatepointInst>(Val: &I))
792 Statepoints.push_back(Elt: SP);
793 for (auto &I : Statepoints)
794 EverMadeChange |= simplifyOffsetableRelocate(I&: *I);
795 }
796
797 // Do this last to clean up use-before-def scenarios introduced by other
798 // preparatory transforms.
799 EverMadeChange |= placeDbgValues(F);
800 EverMadeChange |= placePseudoProbes(F);
801
802#ifndef NDEBUG
803 if (VerifyBFIUpdates)
804 verifyBFIUpdates(F);
805#endif
806
807 return EverMadeChange;
808}
809
810bool CodeGenPrepare::eliminateAssumptions(Function &F) {
811 bool MadeChange = false;
812 for (BasicBlock &BB : F) {
813 CurInstIterator = BB.begin();
814 while (CurInstIterator != BB.end()) {
815 Instruction *I = &*(CurInstIterator++);
816 if (auto *Assume = dyn_cast<AssumeInst>(Val: I)) {
817 MadeChange = true;
818 Value *Operand = Assume->getOperand(i_nocapture: 0);
819 Assume->eraseFromParent();
820
821 resetIteratorIfInvalidatedWhileCalling(BB: &BB, f: [&]() {
822 RecursivelyDeleteTriviallyDeadInstructions(V: Operand, TLI: TLInfo, MSSAU: nullptr);
823 });
824 }
825 }
826 }
827 return MadeChange;
828}
829
830/// An instruction is about to be deleted, so remove all references to it in our
831/// GEP-tracking data strcutures.
832void CodeGenPrepare::removeAllAssertingVHReferences(Value *V) {
833 LargeOffsetGEPMap.erase(Key: V);
834 NewGEPBases.erase(V);
835
836 auto GEP = dyn_cast<GetElementPtrInst>(Val: V);
837 if (!GEP)
838 return;
839
840 LargeOffsetGEPID.erase(Val: GEP);
841
842 auto VecI = LargeOffsetGEPMap.find(Key: GEP->getPointerOperand());
843 if (VecI == LargeOffsetGEPMap.end())
844 return;
845
846 auto &GEPVector = VecI->second;
847 llvm::erase_if(C&: GEPVector, P: [=](auto &Elt) { return Elt.first == GEP; });
848
849 if (GEPVector.empty())
850 LargeOffsetGEPMap.erase(Iterator: VecI);
851}
852
853// Verify BFI has been updated correctly by recomputing BFI and comparing them.
854[[maybe_unused]] void CodeGenPrepare::verifyBFIUpdates(Function &F) {
855 DominatorTree NewDT(F);
856 LoopInfo NewLI(NewDT);
857 BranchProbabilityInfo NewBPI(F, NewLI, TLInfo);
858 BlockFrequencyInfo NewBFI(F, NewBPI, NewLI);
859 NewBFI.verifyMatch(Other&: *BFI);
860}
861
862/// Merge basic blocks which are connected by a single edge, where one of the
863/// basic blocks has a single successor pointing to the other basic block,
864/// which has a single predecessor.
865bool CodeGenPrepare::eliminateFallThrough(Function &F) {
866 bool Changed = false;
867 SmallPtrSet<BasicBlock *, 8> Preds;
868 // Scan all of the blocks in the function, except for the entry block.
869 for (auto &Block : llvm::drop_begin(RangeOrContainer&: F)) {
870 auto *BB = &Block;
871 if (DTU->isBBPendingDeletion(DelBB: BB))
872 continue;
873 // If the destination block has a single pred, then this is a trivial
874 // edge, just collapse it.
875 BasicBlock *SinglePred = BB->getSinglePredecessor();
876
877 // Don't merge if BB's address is taken.
878 if (!SinglePred || SinglePred == BB || BB->hasAddressTaken())
879 continue;
880
881 if (isa<UncondBrInst>(Val: SinglePred->getTerminator())) {
882 Changed = true;
883 LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n");
884
885 // Merge BB into SinglePred and delete it.
886 MergeBlockIntoPredecessor(BB, DTU, LI);
887 Preds.insert(Ptr: SinglePred);
888
889 if (IsHugeFunc) {
890 // Update FreshBBs to optimize the merged BB.
891 FreshBBs.insert(Ptr: SinglePred);
892 FreshBBs.erase(Ptr: BB);
893 }
894 }
895 }
896
897 // (Repeatedly) merging blocks into their predecessors can create redundant
898 // debug intrinsics.
899 for (auto *Pred : Preds)
900 if (!DTU->isBBPendingDeletion(DelBB: Pred))
901 RemoveRedundantDbgInstrs(BB: Pred);
902
903 return Changed;
904}
905
906/// Find a destination block from BB if BB is mergeable empty block.
907BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
908 // If this block doesn't end with an uncond branch, ignore it.
909 UncondBrInst *BI = dyn_cast<UncondBrInst>(Val: BB->getTerminator());
910 if (!BI)
911 return nullptr;
912
913 // If the instruction before the branch (skipping debug info) isn't a phi
914 // node, then other stuff is happening here.
915 BasicBlock::iterator BBI = BI->getIterator();
916 if (BBI != BB->begin()) {
917 --BBI;
918 if (!isa<PHINode>(Val: BBI))
919 return nullptr;
920 }
921
922 // Do not break infinite loops.
923 BasicBlock *DestBB = BI->getSuccessor();
924 if (DestBB == BB)
925 return nullptr;
926
927 if (!canMergeBlocks(BB, DestBB))
928 DestBB = nullptr;
929
930 return DestBB;
931}
932
933/// Eliminate blocks that contain only PHI nodes, debug info directives, and an
934/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
935/// edges in ways that are non-optimal for isel. Start by eliminating these
936/// blocks so we can split them the way we want them.
937bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F, bool &ResetLI) {
938 SmallPtrSet<BasicBlock *, 16> Preheaders;
939 SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());
940 while (!LoopList.empty()) {
941 Loop *L = LoopList.pop_back_val();
942 llvm::append_range(C&: LoopList, R&: *L);
943 if (BasicBlock *Preheader = L->getLoopPreheader())
944 Preheaders.insert(Ptr: Preheader);
945 }
946
947 ResetLI = false;
948 bool MadeChange = false;
949 // Note that this intentionally skips the entry block.
950 for (auto &Block : llvm::drop_begin(RangeOrContainer&: F)) {
951 // Delete phi nodes that could block deleting other empty blocks.
952 if (!DisableDeletePHIs)
953 MadeChange |= DeleteDeadPHIs(BB: &Block, TLI: TLInfo);
954 }
955
956 for (auto &Block : llvm::drop_begin(RangeOrContainer&: F)) {
957 auto *BB = &Block;
958 if (DTU->isBBPendingDeletion(DelBB: BB))
959 continue;
960 BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);
961 if (!DestBB ||
962 !isMergingEmptyBlockProfitable(BB, DestBB, isPreheader: Preheaders.count(Ptr: BB)))
963 continue;
964
965 ResetLI |= eliminateMostlyEmptyBlock(BB);
966 MadeChange = true;
967 }
968 return MadeChange;
969}
970
971bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
972 BasicBlock *DestBB,
973 bool isPreheader) {
974 // Do not delete loop preheaders if doing so would create a critical edge.
975 // Loop preheaders can be good locations to spill registers. If the
976 // preheader is deleted and we create a critical edge, registers may be
977 // spilled in the loop body instead.
978 if (!DisablePreheaderProtect && isPreheader &&
979 !(BB->getSinglePredecessor() &&
980 BB->getSinglePredecessor()->getSingleSuccessor()))
981 return false;
982
983 // Skip merging if the block's successor is also a successor to any callbr
984 // that leads to this block.
985 // FIXME: Is this really needed? Is this a correctness issue?
986 for (BasicBlock *Pred : predecessors(BB)) {
987 if (isa<CallBrInst>(Val: Pred->getTerminator()) &&
988 llvm::is_contained(Range: successors(BB: Pred), Element: DestBB))
989 return false;
990 }
991
992 // Try to skip merging if the unique predecessor of BB is terminated by a
993 // switch or indirect branch instruction, and BB is used as an incoming block
994 // of PHIs in DestBB. In such case, merging BB and DestBB would cause ISel to
995 // add COPY instructions in the predecessor of BB instead of BB (if it is not
996 // merged). Note that the critical edge created by merging such blocks wont be
997 // split in MachineSink because the jump table is not analyzable. By keeping
998 // such empty block (BB), ISel will place COPY instructions in BB, not in the
999 // predecessor of BB.
1000 BasicBlock *Pred = BB->getUniquePredecessor();
1001 if (!Pred || !(isa<SwitchInst>(Val: Pred->getTerminator()) ||
1002 isa<IndirectBrInst>(Val: Pred->getTerminator())))
1003 return true;
1004
1005 if (BB->getTerminator() != &*BB->getFirstNonPHIOrDbg())
1006 return true;
1007
1008 // We use a simple cost heuristic which determine skipping merging is
1009 // profitable if the cost of skipping merging is less than the cost of
1010 // merging : Cost(skipping merging) < Cost(merging BB), where the
1011 // Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)), and
1012 // the Cost(merging BB) is Freq(Pred) * Cost(Copy).
1013 // Assuming Cost(Copy) == Cost(Branch), we could simplify it to :
1014 // Freq(Pred) / Freq(BB) > 2.
1015 // Note that if there are multiple empty blocks sharing the same incoming
1016 // value for the PHIs in the DestBB, we consider them together. In such
1017 // case, Cost(merging BB) will be the sum of their frequencies.
1018
1019 if (!isa<PHINode>(Val: DestBB->begin()))
1020 return true;
1021
1022 SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;
1023
1024 // Find all other incoming blocks from which incoming values of all PHIs in
1025 // DestBB are the same as the ones from BB.
1026 for (BasicBlock *DestBBPred : predecessors(BB: DestBB)) {
1027 if (DestBBPred == BB)
1028 continue;
1029
1030 if (llvm::all_of(Range: DestBB->phis(), P: [&](const PHINode &DestPN) {
1031 return DestPN.getIncomingValueForBlock(BB) ==
1032 DestPN.getIncomingValueForBlock(BB: DestBBPred);
1033 }))
1034 SameIncomingValueBBs.insert(Ptr: DestBBPred);
1035 }
1036
1037 // See if all BB's incoming values are same as the value from Pred. In this
1038 // case, no reason to skip merging because COPYs are expected to be place in
1039 // Pred already.
1040 if (SameIncomingValueBBs.count(Ptr: Pred))
1041 return true;
1042
1043 BlockFrequency PredFreq = BFI->getBlockFreq(BB: Pred);
1044 BlockFrequency BBFreq = BFI->getBlockFreq(BB);
1045
1046 for (auto *SameValueBB : SameIncomingValueBBs)
1047 if (SameValueBB->getUniquePredecessor() == Pred &&
1048 DestBB == findDestBlockOfMergeableEmptyBlock(BB: SameValueBB))
1049 BBFreq += BFI->getBlockFreq(BB: SameValueBB);
1050
1051 std::optional<BlockFrequency> Limit = BBFreq.mul(Factor: FreqRatioToSkipMerge);
1052 return !Limit || PredFreq <= *Limit;
1053}
1054
1055/// Return true if we can merge BB into DestBB if there is a single
1056/// unconditional branch between them, and BB contains no other non-phi
1057/// instructions.
1058bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
1059 const BasicBlock *DestBB) const {
1060 // We only want to eliminate blocks whose phi nodes are used by phi nodes in
1061 // the successor. If there are more complex condition (e.g. preheaders),
1062 // don't mess around with them.
1063 for (const PHINode &PN : BB->phis()) {
1064 for (const User *U : PN.users()) {
1065 const Instruction *UI = cast<Instruction>(Val: U);
1066 if (UI->getParent() != DestBB || !isa<PHINode>(Val: UI))
1067 return false;
1068 // If User is inside DestBB block and it is a PHINode then check
1069 // incoming value. If incoming value is not from BB then this is
1070 // a complex condition (e.g. preheaders) we want to avoid here.
1071 if (UI->getParent() == DestBB) {
1072 if (const PHINode *UPN = dyn_cast<PHINode>(Val: UI))
1073 for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
1074 Instruction *Insn = dyn_cast<Instruction>(Val: UPN->getIncomingValue(i: I));
1075 if (Insn && Insn->getParent() == BB &&
1076 Insn->getParent() != UPN->getIncomingBlock(i: I))
1077 return false;
1078 }
1079 }
1080 }
1081 }
1082
1083 // If BB and DestBB contain any common predecessors, then the phi nodes in BB
1084 // and DestBB may have conflicting incoming values for the block. If so, we
1085 // can't merge the block.
1086 const PHINode *DestBBPN = dyn_cast<PHINode>(Val: DestBB->begin());
1087 if (!DestBBPN)
1088 return true; // no conflict.
1089
1090 // Collect the preds of BB.
1091 SmallPtrSet<const BasicBlock *, 16> BBPreds;
1092 if (const PHINode *BBPN = dyn_cast<PHINode>(Val: BB->begin())) {
1093 // It is faster to get preds from a PHI than with pred_iterator.
1094 for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
1095 BBPreds.insert(Ptr: BBPN->getIncomingBlock(i));
1096 } else {
1097 BBPreds.insert_range(R: predecessors(BB));
1098 }
1099
1100 // Walk the preds of DestBB.
1101 for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
1102 BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
1103 if (BBPreds.count(Ptr: Pred)) { // Common predecessor?
1104 for (const PHINode &PN : DestBB->phis()) {
1105 const Value *V1 = PN.getIncomingValueForBlock(BB: Pred);
1106 const Value *V2 = PN.getIncomingValueForBlock(BB);
1107
1108 // If V2 is a phi node in BB, look up what the mapped value will be.
1109 if (const PHINode *V2PN = dyn_cast<PHINode>(Val: V2))
1110 if (V2PN->getParent() == BB)
1111 V2 = V2PN->getIncomingValueForBlock(BB: Pred);
1112
1113 // If there is a conflict, bail out.
1114 if (V1 != V2)
1115 return false;
1116 }
1117 }
1118 }
1119
1120 return true;
1121}
1122
1123/// Replace all old uses with new ones, and push the updated BBs into FreshBBs.
1124static void replaceAllUsesWith(Value *Old, Value *New,
1125 SmallPtrSet<BasicBlock *, 32> &FreshBBs,
1126 bool IsHuge) {
1127 auto *OldI = dyn_cast<Instruction>(Val: Old);
1128 if (OldI) {
1129 for (Value::user_iterator UI = OldI->user_begin(), E = OldI->user_end();
1130 UI != E; ++UI) {
1131 Instruction *User = cast<Instruction>(Val: *UI);
1132 if (IsHuge)
1133 FreshBBs.insert(Ptr: User->getParent());
1134 }
1135 }
1136 Old->replaceAllUsesWith(V: New);
1137}
1138
1139/// Eliminate a basic block that has only phi's and an unconditional branch in
1140/// it.
1141/// Indicate that the LoopInfo was modified only if it wasn't updated.
1142bool CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
1143 UncondBrInst *BI = cast<UncondBrInst>(Val: BB->getTerminator());
1144 BasicBlock *DestBB = BI->getSuccessor();
1145
1146 LLVM_DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n"
1147 << *BB << *DestBB);
1148
1149 // If the destination block has a single pred, then this is a trivial edge,
1150 // just collapse it.
1151 if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
1152 if (SinglePred != DestBB) {
1153 assert(SinglePred == BB &&
1154 "Single predecessor not the same as predecessor");
1155 // Merge DestBB into SinglePred/BB and delete it.
1156 MergeBlockIntoPredecessor(BB: DestBB, DTU, LI);
1157 // Note: BB(=SinglePred) will not be deleted on this path.
1158 // DestBB(=its single successor) is the one that was deleted.
1159 LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n");
1160
1161 if (IsHugeFunc) {
1162 // Update FreshBBs to optimize the merged BB.
1163 FreshBBs.insert(Ptr: SinglePred);
1164 FreshBBs.erase(Ptr: DestBB);
1165 }
1166 return false;
1167 }
1168 }
1169
1170 // Otherwise, we have multiple predecessors of BB. Update the PHIs in DestBB
1171 // to handle the new incoming edges it is about to have.
1172 for (PHINode &PN : DestBB->phis()) {
1173 // Remove the incoming value for BB, and remember it.
1174 Value *InVal = PN.removeIncomingValue(BB, DeletePHIIfEmpty: false);
1175
1176 // Two options: either the InVal is a phi node defined in BB or it is some
1177 // value that dominates BB.
1178 PHINode *InValPhi = dyn_cast<PHINode>(Val: InVal);
1179 if (InValPhi && InValPhi->getParent() == BB) {
1180 // Add all of the input values of the input PHI as inputs of this phi.
1181 for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
1182 PN.addIncoming(V: InValPhi->getIncomingValue(i),
1183 BB: InValPhi->getIncomingBlock(i));
1184 } else {
1185 // Otherwise, add one instance of the dominating value for each edge that
1186 // we will be adding.
1187 if (PHINode *BBPN = dyn_cast<PHINode>(Val: BB->begin())) {
1188 for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
1189 PN.addIncoming(V: InVal, BB: BBPN->getIncomingBlock(i));
1190 } else {
1191 for (BasicBlock *Pred : predecessors(BB))
1192 PN.addIncoming(V: InVal, BB: Pred);
1193 }
1194 }
1195 }
1196
1197 // Preserve loop Metadata.
1198 if (BI->hasMetadata(KindID: LLVMContext::MD_loop)) {
1199 for (auto *Pred : predecessors(BB))
1200 Pred->getTerminator()->copyMetadata(SrcInst: *BI, WL: LLVMContext::MD_loop);
1201 }
1202
1203 // The PHIs are now updated, change everything that refers to BB to use
1204 // DestBB and remove BB.
1205 SmallVector<DominatorTree::UpdateType, 8> DTUpdates;
1206 SmallPtrSet<BasicBlock *, 8> SeenPreds;
1207 SmallPtrSet<BasicBlock *, 8> PredOfDestBB(llvm::from_range,
1208 predecessors(BB: DestBB));
1209 for (auto *Pred : predecessors(BB)) {
1210 if (!PredOfDestBB.contains(Ptr: Pred)) {
1211 if (SeenPreds.insert(Ptr: Pred).second)
1212 DTUpdates.push_back(Elt: {DominatorTree::Insert, Pred, DestBB});
1213 }
1214 }
1215 SeenPreds.clear();
1216 for (auto *Pred : predecessors(BB)) {
1217 if (SeenPreds.insert(Ptr: Pred).second)
1218 DTUpdates.push_back(Elt: {DominatorTree::Delete, Pred, BB});
1219 }
1220 DTUpdates.push_back(Elt: {DominatorTree::Delete, BB, DestBB});
1221 BB->replaceAllUsesWith(V: DestBB);
1222 DTU->applyUpdates(Updates: DTUpdates);
1223 DTU->deleteBB(DelBB: BB);
1224 ++NumBlocksElim;
1225
1226 LLVM_DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
1227 return true;
1228}
1229
1230// Computes a map of base pointer relocation instructions to corresponding
1231// derived pointer relocation instructions given a vector of all relocate calls
1232static void computeBaseDerivedRelocateMap(
1233 const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
1234 MapVector<GCRelocateInst *, SmallVector<GCRelocateInst *, 0>>
1235 &RelocateInstMap) {
1236 // Collect information in two maps: one primarily for locating the base object
1237 // while filling the second map; the second map is the final structure holding
1238 // a mapping between Base and corresponding Derived relocate calls
1239 MapVector<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
1240 for (auto *ThisRelocate : AllRelocateCalls) {
1241 auto K = std::make_pair(x: ThisRelocate->getBasePtrIndex(),
1242 y: ThisRelocate->getDerivedPtrIndex());
1243 RelocateIdxMap.insert(KV: std::make_pair(x&: K, y&: ThisRelocate));
1244 }
1245 for (auto &Item : RelocateIdxMap) {
1246 std::pair<unsigned, unsigned> Key = Item.first;
1247 if (Key.first == Key.second)
1248 // Base relocation: nothing to insert
1249 continue;
1250
1251 GCRelocateInst *I = Item.second;
1252 auto BaseKey = std::make_pair(x&: Key.first, y&: Key.first);
1253
1254 // We're iterating over RelocateIdxMap so we cannot modify it.
1255 auto MaybeBase = RelocateIdxMap.find(Key: BaseKey);
1256 if (MaybeBase == RelocateIdxMap.end())
1257 // TODO: We might want to insert a new base object relocate and gep off
1258 // that, if there are enough derived object relocates.
1259 continue;
1260
1261 RelocateInstMap[MaybeBase->second].push_back(Elt: I);
1262 }
1263}
1264
1265// Accepts a GEP and extracts the operands into a vector provided they're all
1266// small integer constants
1267static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
1268 SmallVectorImpl<Value *> &OffsetV) {
1269 for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
1270 // Only accept small constant integer operands
1271 auto *Op = dyn_cast<ConstantInt>(Val: GEP->getOperand(i_nocapture: i));
1272 if (!Op || Op->getZExtValue() > 20)
1273 return false;
1274 }
1275
1276 for (unsigned i = 1; i < GEP->getNumOperands(); i++)
1277 OffsetV.push_back(Elt: GEP->getOperand(i_nocapture: i));
1278 return true;
1279}
1280
1281// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
1282// replace, computes a replacement, and affects it.
1283static bool
1284simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
1285 const SmallVectorImpl<GCRelocateInst *> &Targets) {
1286 bool MadeChange = false;
1287 // We must ensure the relocation of derived pointer is defined after
1288 // relocation of base pointer. If we find a relocation corresponding to base
1289 // defined earlier than relocation of base then we move relocation of base
1290 // right before found relocation. We consider only relocation in the same
1291 // basic block as relocation of base. Relocations from other basic block will
1292 // be skipped by optimization and we do not care about them.
1293 for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();
1294 &*R != RelocatedBase; ++R)
1295 if (auto *RI = dyn_cast<GCRelocateInst>(Val&: R))
1296 if (RI->getStatepoint() == RelocatedBase->getStatepoint())
1297 if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {
1298 RelocatedBase->moveBefore(InsertPos: RI->getIterator());
1299 MadeChange = true;
1300 break;
1301 }
1302
1303 for (GCRelocateInst *ToReplace : Targets) {
1304 assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
1305 "Not relocating a derived object of the original base object");
1306 if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
1307 // A duplicate relocate call. TODO: coalesce duplicates.
1308 continue;
1309 }
1310
1311 if (RelocatedBase->getParent() != ToReplace->getParent()) {
1312 // Base and derived relocates are in different basic blocks.
1313 // In this case transform is only valid when base dominates derived
1314 // relocate. However it would be too expensive to check dominance
1315 // for each such relocate, so we skip the whole transformation.
1316 continue;
1317 }
1318
1319 Value *Base = ToReplace->getBasePtr();
1320 auto *Derived = dyn_cast<GetElementPtrInst>(Val: ToReplace->getDerivedPtr());
1321 if (!Derived || Derived->getPointerOperand() != Base)
1322 continue;
1323
1324 SmallVector<Value *, 2> OffsetV;
1325 if (!getGEPSmallConstantIntOffsetV(GEP: Derived, OffsetV))
1326 continue;
1327
1328 // Create a Builder and replace the target callsite with a gep
1329 assert(RelocatedBase->getNextNode() &&
1330 "Should always have one since it's not a terminator");
1331
1332 // Insert after RelocatedBase
1333 IRBuilder<> Builder(RelocatedBase->getNextNode());
1334 Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
1335
1336 // If gc_relocate does not match the actual type, cast it to the right type.
1337 // In theory, there must be a bitcast after gc_relocate if the type does not
1338 // match, and we should reuse it to get the derived pointer. But it could be
1339 // cases like this:
1340 // bb1:
1341 // ...
1342 // %g1 = call coldcc i8 addrspace(1)*
1343 // @llvm.experimental.gc.relocate.p1i8(...) br label %merge
1344 //
1345 // bb2:
1346 // ...
1347 // %g2 = call coldcc i8 addrspace(1)*
1348 // @llvm.experimental.gc.relocate.p1i8(...) br label %merge
1349 //
1350 // merge:
1351 // %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
1352 // %cast = bitcast i8 addrspace(1)* %p1 in to i32 addrspace(1)*
1353 //
1354 // In this case, we can not find the bitcast any more. So we insert a new
1355 // bitcast no matter there is already one or not. In this way, we can handle
1356 // all cases, and the extra bitcast should be optimized away in later
1357 // passes.
1358 Value *ActualRelocatedBase = RelocatedBase;
1359 if (RelocatedBase->getType() != Base->getType()) {
1360 ActualRelocatedBase =
1361 Builder.CreateBitCast(V: RelocatedBase, DestTy: Base->getType());
1362 }
1363 Value *Replacement =
1364 Builder.CreateGEP(Ty: Derived->getSourceElementType(), Ptr: ActualRelocatedBase,
1365 IdxList: ArrayRef(OffsetV));
1366 Replacement->takeName(V: ToReplace);
1367 // If the newly generated derived pointer's type does not match the original
1368 // derived pointer's type, cast the new derived pointer to match it. Same
1369 // reasoning as above.
1370 Value *ActualReplacement = Replacement;
1371 if (Replacement->getType() != ToReplace->getType()) {
1372 ActualReplacement =
1373 Builder.CreateBitCast(V: Replacement, DestTy: ToReplace->getType());
1374 }
1375 ToReplace->replaceAllUsesWith(V: ActualReplacement);
1376 ToReplace->eraseFromParent();
1377
1378 MadeChange = true;
1379 }
1380 return MadeChange;
1381}
1382
1383// Turns this:
1384//
1385// %base = ...
1386// %ptr = gep %base + 15
1387// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
1388// %base' = relocate(%tok, i32 4, i32 4)
1389// %ptr' = relocate(%tok, i32 4, i32 5)
1390// %val = load %ptr'
1391//
1392// into this:
1393//
1394// %base = ...
1395// %ptr = gep %base + 15
1396// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
1397// %base' = gc.relocate(%tok, i32 4, i32 4)
1398// %ptr' = gep %base' + 15
1399// %val = load %ptr'
1400bool CodeGenPrepare::simplifyOffsetableRelocate(GCStatepointInst &I) {
1401 bool MadeChange = false;
1402 SmallVector<GCRelocateInst *, 2> AllRelocateCalls;
1403 for (auto *U : I.users())
1404 if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(Val: U))
1405 // Collect all the relocate calls associated with a statepoint
1406 AllRelocateCalls.push_back(Elt: Relocate);
1407
1408 // We need at least one base pointer relocation + one derived pointer
1409 // relocation to mangle
1410 if (AllRelocateCalls.size() < 2)
1411 return false;
1412
1413 // RelocateInstMap is a mapping from the base relocate instruction to the
1414 // corresponding derived relocate instructions
1415 MapVector<GCRelocateInst *, SmallVector<GCRelocateInst *, 0>> RelocateInstMap;
1416 computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
1417 if (RelocateInstMap.empty())
1418 return false;
1419
1420 for (auto &Item : RelocateInstMap)
1421 // Item.first is the RelocatedBase to offset against
1422 // Item.second is the vector of Targets to replace
1423 MadeChange = simplifyRelocatesOffABase(RelocatedBase: Item.first, Targets: Item.second);
1424 return MadeChange;
1425}
1426
1427/// Sink the specified cast instruction into its user blocks.
1428static bool SinkCast(CastInst *CI) {
1429 BasicBlock *DefBB = CI->getParent();
1430
1431 /// InsertedCasts - Only insert a cast in each block once.
1432 DenseMap<BasicBlock *, CastInst *> InsertedCasts;
1433
1434 bool MadeChange = false;
1435 for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
1436 UI != E;) {
1437 Use &TheUse = UI.getUse();
1438 Instruction *User = cast<Instruction>(Val: *UI);
1439
1440 // Figure out which BB this cast is used in. For PHI's this is the
1441 // appropriate predecessor block.
1442 BasicBlock *UserBB = User->getParent();
1443 if (PHINode *PN = dyn_cast<PHINode>(Val: User)) {
1444 UserBB = PN->getIncomingBlock(U: TheUse);
1445 }
1446
1447 // Preincrement use iterator so we don't invalidate it.
1448 ++UI;
1449
1450 // The first insertion point of a block containing an EH pad is after the
1451 // pad. If the pad is the user, we cannot sink the cast past the pad.
1452 if (User->isEHPad())
1453 continue;
1454
1455 // If the block selected to receive the cast is an EH pad that does not
1456 // allow non-PHI instructions before the terminator, we can't sink the
1457 // cast.
1458 if (UserBB->getTerminator()->isEHPad())
1459 continue;
1460
1461 // If this user is in the same block as the cast, don't change the cast.
1462 if (UserBB == DefBB)
1463 continue;
1464
1465 // If we have already inserted a cast into this block, use it.
1466 CastInst *&InsertedCast = InsertedCasts[UserBB];
1467
1468 if (!InsertedCast) {
1469 BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1470 assert(InsertPt != UserBB->end());
1471 InsertedCast = cast<CastInst>(Val: CI->clone());
1472 InsertedCast->insertBefore(BB&: *UserBB, InsertPos: InsertPt);
1473 }
1474
1475 // Replace a use of the cast with a use of the new cast.
1476 TheUse = InsertedCast;
1477 MadeChange = true;
1478 ++NumCastUses;
1479 }
1480
1481 // If we removed all uses, nuke the cast.
1482 if (CI->use_empty()) {
1483 salvageDebugInfo(I&: *CI);
1484 CI->eraseFromParent();
1485 MadeChange = true;
1486 }
1487
1488 return MadeChange;
1489}
1490
1491/// If the specified cast instruction is a noop copy (e.g. it's casting from
1492/// one pointer type to another, i32->i8 on PPC), sink it into user blocks to
1493/// reduce the number of virtual registers that must be created and coalesced.
1494///
1495/// Return true if any changes are made.
1496static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
1497 const DataLayout &DL) {
1498 // Sink only "cheap" (or nop) address-space casts. This is a weaker condition
1499 // than sinking only nop casts, but is helpful on some platforms.
1500 if (auto *ASC = dyn_cast<AddrSpaceCastInst>(Val: CI)) {
1501 if (!TLI.isFreeAddrSpaceCast(SrcAS: ASC->getSrcAddressSpace(),
1502 DestAS: ASC->getDestAddressSpace()))
1503 return false;
1504 }
1505
1506 // If this is a noop copy,
1507 EVT SrcVT = TLI.getValueType(DL, Ty: CI->getOperand(i_nocapture: 0)->getType());
1508 EVT DstVT = TLI.getValueType(DL, Ty: CI->getType());
1509
1510 // This is an fp<->int conversion?
1511 if (SrcVT.isInteger() != DstVT.isInteger())
1512 return false;
1513
1514 // If this is an extension, it will be a zero or sign extension, which
1515 // isn't a noop.
1516 if (SrcVT.bitsLT(VT: DstVT))
1517 return false;
1518
1519 // If these values will be promoted, find out what they will be promoted
1520 // to. This helps us consider truncates on PPC as noop copies when they
1521 // are.
1522 if (TLI.getTypeAction(Context&: CI->getContext(), VT: SrcVT) ==
1523 TargetLowering::TypePromoteInteger)
1524 SrcVT = TLI.getTypeToTransformTo(Context&: CI->getContext(), VT: SrcVT);
1525 if (TLI.getTypeAction(Context&: CI->getContext(), VT: DstVT) ==
1526 TargetLowering::TypePromoteInteger)
1527 DstVT = TLI.getTypeToTransformTo(Context&: CI->getContext(), VT: DstVT);
1528
1529 // If, after promotion, these are the same types, this is a noop copy.
1530 if (SrcVT != DstVT)
1531 return false;
1532
1533 return SinkCast(CI);
1534}
1535
1536// Match a simple increment by constant operation. Note that if a sub is
1537// matched, the step is negated (as if the step had been canonicalized to
1538// an add, even though we leave the instruction alone.)
1539static bool matchIncrement(const Instruction *IVInc, Instruction *&LHS,
1540 Constant *&Step) {
1541 if (match(V: IVInc, P: m_Add(L: m_Instruction(I&: LHS), R: m_Constant(C&: Step))) ||
1542 match(V: IVInc, P: m_ExtractValue<0>(V: m_Intrinsic<Intrinsic::uadd_with_overflow>(
1543 Op0: m_Instruction(I&: LHS), Op1: m_Constant(C&: Step)))))
1544 return true;
1545 if (match(V: IVInc, P: m_Sub(L: m_Instruction(I&: LHS), R: m_Constant(C&: Step))) ||
1546 match(V: IVInc, P: m_ExtractValue<0>(V: m_Intrinsic<Intrinsic::usub_with_overflow>(
1547 Op0: m_Instruction(I&: LHS), Op1: m_Constant(C&: Step))))) {
1548 Step = ConstantExpr::getNeg(C: Step);
1549 return true;
1550 }
1551 return false;
1552}
1553
1554/// If given \p PN is an inductive variable with value IVInc coming from the
1555/// backedge, and on each iteration it gets increased by Step, return pair
1556/// <IVInc, Step>. Otherwise, return std::nullopt.
1557static std::optional<std::pair<Instruction *, Constant *>>
1558getIVIncrement(const PHINode *PN, const LoopInfo *LI) {
1559 const Loop *L = LI->getLoopFor(BB: PN->getParent());
1560 if (!L || L->getHeader() != PN->getParent() || !L->getLoopLatch())
1561 return std::nullopt;
1562 auto *IVInc =
1563 dyn_cast<Instruction>(Val: PN->getIncomingValueForBlock(BB: L->getLoopLatch()));
1564 if (!IVInc || LI->getLoopFor(BB: IVInc->getParent()) != L)
1565 return std::nullopt;
1566 Instruction *LHS = nullptr;
1567 Constant *Step = nullptr;
1568 if (matchIncrement(IVInc, LHS, Step) && LHS == PN)
1569 return std::make_pair(x&: IVInc, y&: Step);
1570 return std::nullopt;
1571}
1572
1573static bool isIVIncrement(const Value *V, const LoopInfo *LI) {
1574 auto *I = dyn_cast<Instruction>(Val: V);
1575 if (!I)
1576 return false;
1577 Instruction *LHS = nullptr;
1578 Constant *Step = nullptr;
1579 if (!matchIncrement(IVInc: I, LHS, Step))
1580 return false;
1581 if (auto *PN = dyn_cast<PHINode>(Val: LHS))
1582 if (auto IVInc = getIVIncrement(PN, LI))
1583 return IVInc->first == I;
1584 return false;
1585}
1586
1587bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
1588 Value *Arg0, Value *Arg1,
1589 CmpInst *Cmp,
1590 Intrinsic::ID IID) {
1591 auto IsReplacableIVIncrement = [this, &Cmp](BinaryOperator *BO) {
1592 if (!isIVIncrement(V: BO, LI))
1593 return false;
1594 const Loop *L = LI->getLoopFor(BB: BO->getParent());
1595 assert(L && "L should not be null after isIVIncrement()");
1596 // Do not risk on moving increment into a child loop.
1597 if (LI->getLoopFor(BB: Cmp->getParent()) != L)
1598 return false;
1599
1600 // Finally, we need to ensure that the insert point will dominate all
1601 // existing uses of the increment.
1602
1603 auto &DT = getDT();
1604 if (DT.dominates(A: Cmp->getParent(), B: BO->getParent()))
1605 // If we're moving up the dom tree, all uses are trivially dominated.
1606 // (This is the common case for code produced by LSR.)
1607 return true;
1608
1609 // Otherwise, special case the single use in the phi recurrence.
1610 return BO->hasOneUse() && DT.dominates(A: Cmp->getParent(), B: L->getLoopLatch());
1611 };
1612 if (BO->getParent() != Cmp->getParent() && !IsReplacableIVIncrement(BO)) {
1613 // We used to use a dominator tree here to allow multi-block optimization.
1614 // But that was problematic because:
1615 // 1. It could cause a perf regression by hoisting the math op into the
1616 // critical path.
1617 // 2. It could cause a perf regression by creating a value that was live
1618 // across multiple blocks and increasing register pressure.
1619 // 3. Use of a dominator tree could cause large compile-time regression.
1620 // This is because we recompute the DT on every change in the main CGP
1621 // run-loop. The recomputing is probably unnecessary in many cases, so if
1622 // that was fixed, using a DT here would be ok.
1623 //
1624 // There is one important particular case we still want to handle: if BO is
1625 // the IV increment. Important properties that make it profitable:
1626 // - We can speculate IV increment anywhere in the loop (as long as the
1627 // indvar Phi is its only user);
1628 // - Upon computing Cmp, we effectively compute something equivalent to the
1629 // IV increment (despite it loops differently in the IR). So moving it up
1630 // to the cmp point does not really increase register pressure.
1631 return false;
1632 }
1633
1634 // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
1635 if (BO->getOpcode() == Instruction::Add &&
1636 IID == Intrinsic::usub_with_overflow) {
1637 assert(isa<Constant>(Arg1) && "Unexpected input for usubo");
1638 Arg1 = ConstantExpr::getNeg(C: cast<Constant>(Val: Arg1));
1639 }
1640
1641 // Insert at the first instruction of the pair.
1642 Instruction *InsertPt = nullptr;
1643 for (Instruction &Iter : *Cmp->getParent()) {
1644 // If BO is an XOR, it is not guaranteed that it comes after both inputs to
1645 // the overflow intrinsic are defined.
1646 if ((BO->getOpcode() != Instruction::Xor && &Iter == BO) || &Iter == Cmp) {
1647 InsertPt = &Iter;
1648 break;
1649 }
1650 }
1651 assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");
1652
1653 IRBuilder<> Builder(InsertPt);
1654 Value *MathOV = Builder.CreateBinaryIntrinsic(ID: IID, LHS: Arg0, RHS: Arg1);
1655 if (BO->getOpcode() != Instruction::Xor) {
1656 Value *Math = Builder.CreateExtractValue(Agg: MathOV, Idxs: 0, Name: "math");
1657 replaceAllUsesWith(Old: BO, New: Math, FreshBBs, IsHuge: IsHugeFunc);
1658 } else
1659 assert(BO->hasOneUse() &&
1660 "Patterns with XOr should use the BO only in the compare");
1661 Value *OV = Builder.CreateExtractValue(Agg: MathOV, Idxs: 1, Name: "ov");
1662 replaceAllUsesWith(Old: Cmp, New: OV, FreshBBs, IsHuge: IsHugeFunc);
1663 Cmp->eraseFromParent();
1664 BO->eraseFromParent();
1665 return true;
1666}
1667
1668/// Match special-case patterns that check for unsigned add overflow.
1669static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
1670 BinaryOperator *&Add) {
1671 // Add = add A, 1; Cmp = icmp eq A,-1 (overflow if A is max val)
1672 // Add = add A,-1; Cmp = icmp ne A, 0 (overflow if A is non-zero)
1673 Value *A = Cmp->getOperand(i_nocapture: 0), *B = Cmp->getOperand(i_nocapture: 1);
1674
1675 // We are not expecting non-canonical/degenerate code. Just bail out.
1676 if (isa<Constant>(Val: A))
1677 return false;
1678
1679 ICmpInst::Predicate Pred = Cmp->getPredicate();
1680 if (Pred == ICmpInst::ICMP_EQ && match(V: B, P: m_AllOnes()))
1681 B = ConstantInt::get(Ty: B->getType(), V: 1);
1682 else if (Pred == ICmpInst::ICMP_NE && match(V: B, P: m_ZeroInt()))
1683 B = Constant::getAllOnesValue(Ty: B->getType());
1684 else
1685 return false;
1686
1687 // Check the users of the variable operand of the compare looking for an add
1688 // with the adjusted constant.
1689 for (User *U : A->users()) {
1690 if (match(V: U, P: m_Add(L: m_Specific(V: A), R: m_Specific(V: B)))) {
1691 Add = cast<BinaryOperator>(Val: U);
1692 return true;
1693 }
1694 }
1695 return false;
1696}
1697
1698/// Try to combine the compare into a call to the llvm.uadd.with.overflow
1699/// intrinsic. Return true if any changes were made.
1700bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
1701 ModifyDT &ModifiedDT) {
1702 bool EdgeCase = false;
1703 Value *A, *B;
1704 BinaryOperator *Add;
1705 if (!match(V: Cmp, P: m_UAddWithOverflow(L: m_Value(V&: A), R: m_Value(V&: B), S: m_BinOp(I&: Add)))) {
1706 if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add))
1707 return false;
1708 // Set A and B in case we match matchUAddWithOverflowConstantEdgeCases.
1709 A = Add->getOperand(i_nocapture: 0);
1710 B = Add->getOperand(i_nocapture: 1);
1711 EdgeCase = true;
1712 }
1713
1714 if (!TLI->shouldFormOverflowOp(Opcode: ISD::UADDO,
1715 VT: TLI->getValueType(DL: *DL, Ty: Add->getType()),
1716 MathUsed: Add->hasNUsesOrMore(N: EdgeCase ? 1 : 2)))
1717 return false;
1718
1719 // We don't want to move around uses of condition values this late, so we
1720 // check if it is legal to create the call to the intrinsic in the basic
1721 // block containing the icmp.
1722 if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse())
1723 return false;
1724
1725 if (!replaceMathCmpWithIntrinsic(BO: Add, Arg0: A, Arg1: B, Cmp,
1726 IID: Intrinsic::uadd_with_overflow))
1727 return false;
1728
1729 // Reset callers - do not crash by iterating over a dead instruction.
1730 ModifiedDT = ModifyDT::ModifyInstDT;
1731 return true;
1732}
1733
1734bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
1735 ModifyDT &ModifiedDT) {
1736 // We are not expecting non-canonical/degenerate code. Just bail out.
1737 Value *A = Cmp->getOperand(i_nocapture: 0), *B = Cmp->getOperand(i_nocapture: 1);
1738 if (isa<Constant>(Val: A) && isa<Constant>(Val: B))
1739 return false;
1740
1741 // Convert (A u> B) to (A u< B) to simplify pattern matching.
1742 ICmpInst::Predicate Pred = Cmp->getPredicate();
1743 if (Pred == ICmpInst::ICMP_UGT) {
1744 std::swap(a&: A, b&: B);
1745 Pred = ICmpInst::ICMP_ULT;
1746 }
1747 // Convert special-case: (A == 0) is the same as (A u< 1).
1748 if (Pred == ICmpInst::ICMP_EQ && match(V: B, P: m_ZeroInt())) {
1749 B = ConstantInt::get(Ty: B->getType(), V: 1);
1750 Pred = ICmpInst::ICMP_ULT;
1751 }
1752 // Convert special-case: (A != 0) is the same as (0 u< A).
1753 if (Pred == ICmpInst::ICMP_NE && match(V: B, P: m_ZeroInt())) {
1754 std::swap(a&: A, b&: B);
1755 Pred = ICmpInst::ICMP_ULT;
1756 }
1757 if (Pred != ICmpInst::ICMP_ULT)
1758 return false;
1759
1760 // Walk the users of a variable operand of a compare looking for a subtract or
1761 // add with that same operand. Also match the 2nd operand of the compare to
1762 // the add/sub, but that may be a negated constant operand of an add.
1763 Value *CmpVariableOperand = isa<Constant>(Val: A) ? B : A;
1764 BinaryOperator *Sub = nullptr;
1765 for (User *U : CmpVariableOperand->users()) {
1766 // A - B, A u< B --> usubo(A, B)
1767 if (match(V: U, P: m_Sub(L: m_Specific(V: A), R: m_Specific(V: B)))) {
1768 Sub = cast<BinaryOperator>(Val: U);
1769 break;
1770 }
1771
1772 // A + (-C), A u< C (canonicalized form of (sub A, C))
1773 const APInt *CmpC, *AddC;
1774 if (match(V: U, P: m_Add(L: m_Specific(V: A), R: m_APInt(Res&: AddC))) &&
1775 match(V: B, P: m_APInt(Res&: CmpC)) && *AddC == -(*CmpC)) {
1776 Sub = cast<BinaryOperator>(Val: U);
1777 break;
1778 }
1779 }
1780 if (!Sub)
1781 return false;
1782
1783 if (!TLI->shouldFormOverflowOp(Opcode: ISD::USUBO,
1784 VT: TLI->getValueType(DL: *DL, Ty: Sub->getType()),
1785 MathUsed: Sub->hasNUsesOrMore(N: 1)))
1786 return false;
1787
1788 // We don't want to move around uses of condition values this late, so we
1789 // check if it is legal to create the call to the intrinsic in the basic
1790 // block containing the icmp.
1791 if (Sub->getParent() != Cmp->getParent() && !Sub->hasOneUse())
1792 return false;
1793
1794 if (!replaceMathCmpWithIntrinsic(BO: Sub, Arg0: Sub->getOperand(i_nocapture: 0), Arg1: Sub->getOperand(i_nocapture: 1),
1795 Cmp, IID: Intrinsic::usub_with_overflow))
1796 return false;
1797
1798 // Reset callers - do not crash by iterating over a dead instruction.
1799 ModifiedDT = ModifyDT::ModifyInstDT;
1800 return true;
1801}
1802
1803// Decanonicalizes icmp+ctpop power-of-two test if ctpop is slow.
1804// The same transformation exists in DAG combiner, but we repeat it here because
1805// DAG builder can break the pattern by moving icmp into a successor block.
1806bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) {
1807 CmpPredicate Pred;
1808 Value *X;
1809 const APInt *C;
1810
1811 // (icmp (ctpop x), c)
1812 if (!match(V: Cmp, P: m_ICmp(Pred, L: m_Ctpop(Op0: m_Value(V&: X)), R: m_APIntAllowPoison(Res&: C))))
1813 return false;
1814
1815 // We're only interested in "is power of 2 [or zero]" patterns.
1816 bool IsStrictlyPowerOf2Test = ICmpInst::isEquality(P: Pred) && *C == 1;
1817 bool IsPowerOf2OrZeroTest = (Pred == CmpInst::ICMP_ULT && *C == 2) ||
1818 (Pred == CmpInst::ICMP_UGT && *C == 1);
1819 if (!IsStrictlyPowerOf2Test && !IsPowerOf2OrZeroTest)
1820 return false;
1821
1822 // Some targets have better codegen for `ctpop(x) u</u>= 2/1`than for
1823 // `ctpop(x) ==/!= 1`. If ctpop is fast, only try changing the comparison,
1824 // and otherwise expand ctpop into a few simple instructions.
1825 Type *OpTy = X->getType();
1826 if (TLI->isCtpopFast(VT: TLI->getValueType(DL: *DL, Ty: OpTy))) {
1827 // Look for `ctpop(x) ==/!= 1`, where `ctpop(x)` is known to be non-zero.
1828 if (!IsStrictlyPowerOf2Test || !isKnownNonZero(V: Cmp->getOperand(i_nocapture: 0), Q: *DL))
1829 return false;
1830
1831 // ctpop(x) == 1 -> ctpop(x) u< 2
1832 // ctpop(x) != 1 -> ctpop(x) u> 1
1833 if (Pred == ICmpInst::ICMP_EQ) {
1834 Cmp->setOperand(i_nocapture: 1, Val_nocapture: ConstantInt::get(Ty: OpTy, V: 2));
1835 Cmp->setPredicate(ICmpInst::ICMP_ULT);
1836 } else {
1837 Cmp->setPredicate(ICmpInst::ICMP_UGT);
1838 }
1839 return true;
1840 }
1841
1842 Value *NewCmp;
1843 if (IsPowerOf2OrZeroTest ||
1844 (IsStrictlyPowerOf2Test && isKnownNonZero(V: Cmp->getOperand(i_nocapture: 0), Q: *DL))) {
1845 // ctpop(x) u< 2 -> (x & (x - 1)) == 0
1846 // ctpop(x) u> 1 -> (x & (x - 1)) != 0
1847 IRBuilder<> Builder(Cmp);
1848 Value *Sub = Builder.CreateAdd(LHS: X, RHS: Constant::getAllOnesValue(Ty: OpTy));
1849 Value *And = Builder.CreateAnd(LHS: X, RHS: Sub);
1850 CmpInst::Predicate NewPred =
1851 (Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_EQ)
1852 ? CmpInst::ICMP_EQ
1853 : CmpInst::ICMP_NE;
1854 NewCmp = Builder.CreateICmp(P: NewPred, LHS: And, RHS: ConstantInt::getNullValue(Ty: OpTy));
1855 } else {
1856 // ctpop(x) == 1 -> (x ^ (x - 1)) u> (x - 1)
1857 // ctpop(x) != 1 -> (x ^ (x - 1)) u<= (x - 1)
1858 IRBuilder<> Builder(Cmp);
1859 Value *Sub = Builder.CreateAdd(LHS: X, RHS: Constant::getAllOnesValue(Ty: OpTy));
1860 Value *Xor = Builder.CreateXor(LHS: X, RHS: Sub);
1861 CmpInst::Predicate NewPred =
1862 Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT : CmpInst::ICMP_ULE;
1863 NewCmp = Builder.CreateICmp(P: NewPred, LHS: Xor, RHS: Sub);
1864 }
1865
1866 Cmp->replaceAllUsesWith(V: NewCmp);
1867 RecursivelyDeleteTriviallyDeadInstructions(V: Cmp);
1868 return true;
1869}
1870
1871/// Sink the given CmpInst into user blocks to reduce the number of virtual
1872/// registers that must be created and coalesced. This is a clear win except on
1873/// targets with multiple condition code registers (PowerPC), where it might
1874/// lose; some adjustment may be wanted there.
1875///
1876/// Return true if any changes are made.
1877static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI,
1878 const DataLayout &DL) {
1879 if (TLI.hasMultipleConditionRegisters(VT: EVT::getEVT(Ty: Cmp->getType())))
1880 return false;
1881
1882 // Avoid sinking soft-FP comparisons, since this can move them into a loop.
1883 if (TLI.useSoftFloat() && isa<FCmpInst>(Val: Cmp))
1884 return false;
1885
1886 bool UsedInPhiOrCurrentBlock = any_of(Range: Cmp->users(), P: [Cmp](User *U) {
1887 return isa<PHINode>(Val: U) ||
1888 cast<Instruction>(Val: U)->getParent() == Cmp->getParent();
1889 });
1890
1891 // Avoid sinking larger than legal integer comparisons unless its ONLY used in
1892 // another BB.
1893 if (UsedInPhiOrCurrentBlock && Cmp->getOperand(i_nocapture: 0)->getType()->isIntegerTy() &&
1894 Cmp->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() >
1895 DL.getLargestLegalIntTypeSizeInBits())
1896 return false;
1897
1898 // Only insert a cmp in each block once.
1899 DenseMap<BasicBlock *, CmpInst *> InsertedCmps;
1900
1901 bool MadeChange = false;
1902 for (Value::user_iterator UI = Cmp->user_begin(), E = Cmp->user_end();
1903 UI != E;) {
1904 Use &TheUse = UI.getUse();
1905 Instruction *User = cast<Instruction>(Val: *UI);
1906
1907 // Preincrement use iterator so we don't invalidate it.
1908 ++UI;
1909
1910 // Don't bother for PHI nodes.
1911 if (isa<PHINode>(Val: User))
1912 continue;
1913
1914 // Figure out which BB this cmp is used in.
1915 BasicBlock *UserBB = User->getParent();
1916 BasicBlock *DefBB = Cmp->getParent();
1917
1918 // If this user is in the same block as the cmp, don't change the cmp.
1919 if (UserBB == DefBB)
1920 continue;
1921
1922 // If we have already inserted a cmp into this block, use it.
1923 CmpInst *&InsertedCmp = InsertedCmps[UserBB];
1924
1925 if (!InsertedCmp) {
1926 BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
1927 assert(InsertPt != UserBB->end());
1928 InsertedCmp = CmpInst::Create(Op: Cmp->getOpcode(), Pred: Cmp->getPredicate(),
1929 S1: Cmp->getOperand(i_nocapture: 0), S2: Cmp->getOperand(i_nocapture: 1), Name: "");
1930 InsertedCmp->insertBefore(BB&: *UserBB, InsertPos: InsertPt);
1931 // Propagate the debug info.
1932 InsertedCmp->setDebugLoc(Cmp->getDebugLoc());
1933 }
1934
1935 // Replace a use of the cmp with a use of the new cmp.
1936 TheUse = InsertedCmp;
1937 MadeChange = true;
1938 ++NumCmpUses;
1939 }
1940
1941 // If we removed all uses, nuke the cmp.
1942 if (Cmp->use_empty()) {
1943 Cmp->eraseFromParent();
1944 MadeChange = true;
1945 }
1946
1947 return MadeChange;
1948}
1949
1950/// For pattern like:
1951///
1952/// DomCond = icmp sgt/slt CmpOp0, CmpOp1 (might not be in DomBB)
1953/// ...
1954/// DomBB:
1955/// ...
1956/// br DomCond, TrueBB, CmpBB
1957/// CmpBB: (with DomBB being the single predecessor)
1958/// ...
1959/// Cmp = icmp eq CmpOp0, CmpOp1
1960/// ...
1961///
1962/// It would use two comparison on targets that lowering of icmp sgt/slt is
1963/// different from lowering of icmp eq (PowerPC). This function try to convert
1964/// 'Cmp = icmp eq CmpOp0, CmpOp1' to ' Cmp = icmp slt/sgt CmpOp0, CmpOp1'.
1965/// After that, DomCond and Cmp can use the same comparison so reduce one
1966/// comparison.
1967///
1968/// Return true if any changes are made.
1969static bool foldICmpWithDominatingICmp(CmpInst *Cmp,
1970 const TargetLowering &TLI) {
1971 if (!EnableICMP_EQToICMP_ST && TLI.isEqualityCmpFoldedWithSignedCmp())
1972 return false;
1973
1974 ICmpInst::Predicate Pred = Cmp->getPredicate();
1975 if (Pred != ICmpInst::ICMP_EQ)
1976 return false;
1977
1978 // If icmp eq has users other than CondBrInst and SelectInst, converting it to
1979 // icmp slt/sgt would introduce more redundant LLVM IR.
1980 for (User *U : Cmp->users()) {
1981 if (isa<CondBrInst>(Val: U))
1982 continue;
1983 if (isa<SelectInst>(Val: U) && cast<SelectInst>(Val: U)->getCondition() == Cmp)
1984 continue;
1985 return false;
1986 }
1987
1988 // This is a cheap/incomplete check for dominance - just match a single
1989 // predecessor with a conditional branch.
1990 BasicBlock *CmpBB = Cmp->getParent();
1991 BasicBlock *DomBB = CmpBB->getSinglePredecessor();
1992 if (!DomBB)
1993 return false;
1994
1995 // We want to ensure that the only way control gets to the comparison of
1996 // interest is that a less/greater than comparison on the same operands is
1997 // false.
1998 Value *DomCond;
1999 BasicBlock *TrueBB, *FalseBB;
2000 if (!match(V: DomBB->getTerminator(), P: m_Br(C: m_Value(V&: DomCond), T&: TrueBB, F&: FalseBB)))
2001 return false;
2002 if (CmpBB != FalseBB)
2003 return false;
2004
2005 Value *CmpOp0 = Cmp->getOperand(i_nocapture: 0), *CmpOp1 = Cmp->getOperand(i_nocapture: 1);
2006 CmpPredicate DomPred;
2007 if (!match(V: DomCond, P: m_ICmp(Pred&: DomPred, L: m_Specific(V: CmpOp0), R: m_Specific(V: CmpOp1))))
2008 return false;
2009 if (DomPred != ICmpInst::ICMP_SGT && DomPred != ICmpInst::ICMP_SLT)
2010 return false;
2011
2012 // Convert the equality comparison to the opposite of the dominating
2013 // comparison and swap the direction for all branch/select users.
2014 // We have conceptually converted:
2015 // Res = (a < b) ? <LT_RES> : (a == b) ? <EQ_RES> : <GT_RES>;
2016 // to
2017 // Res = (a < b) ? <LT_RES> : (a > b) ? <GT_RES> : <EQ_RES>;
2018 // And similarly for branches.
2019 for (User *U : Cmp->users()) {
2020 if (auto *BI = dyn_cast<CondBrInst>(Val: U)) {
2021 BI->swapSuccessors();
2022 continue;
2023 }
2024 if (auto *SI = dyn_cast<SelectInst>(Val: U)) {
2025 // Swap operands
2026 SI->swapValues();
2027 SI->swapProfMetadata();
2028 continue;
2029 }
2030 llvm_unreachable("Must be a branch or a select");
2031 }
2032 Cmp->setPredicate(CmpInst::getSwappedPredicate(pred: DomPred));
2033 return true;
2034}
2035
2036/// Many architectures use the same instruction for both subtract and cmp. Try
2037/// to swap cmp operands to match subtract operations to allow for CSE.
2038static bool swapICmpOperandsToExposeCSEOpportunities(CmpInst *Cmp) {
2039 Value *Op0 = Cmp->getOperand(i_nocapture: 0);
2040 Value *Op1 = Cmp->getOperand(i_nocapture: 1);
2041 if (!Op0->getType()->isIntegerTy() || isa<Constant>(Val: Op0) ||
2042 isa<Constant>(Val: Op1) || Op0 == Op1)
2043 return false;
2044
2045 // If a subtract already has the same operands as a compare, swapping would be
2046 // bad. If a subtract has the same operands as a compare but in reverse order,
2047 // then swapping is good.
2048 int GoodToSwap = 0;
2049 unsigned NumInspected = 0;
2050 for (const User *U : Op0->users()) {
2051 // Avoid walking many users.
2052 if (++NumInspected > 128)
2053 return false;
2054 if (match(V: U, P: m_Sub(L: m_Specific(V: Op1), R: m_Specific(V: Op0))))
2055 GoodToSwap++;
2056 else if (match(V: U, P: m_Sub(L: m_Specific(V: Op0), R: m_Specific(V: Op1))))
2057 GoodToSwap--;
2058 }
2059
2060 if (GoodToSwap > 0) {
2061 Cmp->swapOperands();
2062 return true;
2063 }
2064 return false;
2065}
2066
2067static bool foldFCmpToFPClassTest(CmpInst *Cmp, const TargetLowering &TLI,
2068 const DataLayout &DL) {
2069 FCmpInst *FCmp = dyn_cast<FCmpInst>(Val: Cmp);
2070 if (!FCmp)
2071 return false;
2072
2073 // Don't fold if the target offers free fabs and the predicate is legal.
2074 EVT VT = TLI.getValueType(DL, Ty: Cmp->getOperand(i_nocapture: 0)->getType());
2075 if (TLI.isFAbsFree(VT) &&
2076 TLI.isCondCodeLegal(CC: getFCmpCondCode(Pred: FCmp->getPredicate()),
2077 VT: VT.getSimpleVT()))
2078 return false;
2079
2080 // Reverse the canonicalization if it is a FP class test
2081 auto ShouldReverseTransform = [](FPClassTest ClassTest) {
2082 return ClassTest == fcInf || ClassTest == (fcInf | fcNan);
2083 };
2084 auto [ClassVal, ClassTest] =
2085 fcmpToClassTest(Pred: FCmp->getPredicate(), F: *FCmp->getParent()->getParent(),
2086 LHS: FCmp->getOperand(i_nocapture: 0), RHS: FCmp->getOperand(i_nocapture: 1));
2087 if (!ClassVal)
2088 return false;
2089
2090 if (!ShouldReverseTransform(ClassTest) && !ShouldReverseTransform(~ClassTest))
2091 return false;
2092
2093 IRBuilder<> Builder(Cmp);
2094 Value *IsFPClass = Builder.createIsFPClass(FPNum: ClassVal, Test: ClassTest);
2095 Cmp->replaceAllUsesWith(V: IsFPClass);
2096 RecursivelyDeleteTriviallyDeadInstructions(V: Cmp);
2097 return true;
2098}
2099
2100static bool isRemOfLoopIncrementWithLoopInvariant(
2101 Instruction *Rem, const LoopInfo *LI, Value *&RemAmtOut, Value *&AddInstOut,
2102 Value *&AddOffsetOut, PHINode *&LoopIncrPNOut) {
2103 Value *Incr, *RemAmt;
2104 // NB: If RemAmt is a power of 2 it *should* have been transformed by now.
2105 if (!match(V: Rem, P: m_URem(L: m_Value(V&: Incr), R: m_Value(V&: RemAmt))))
2106 return false;
2107
2108 Value *AddInst, *AddOffset;
2109 // Find out loop increment PHI.
2110 auto *PN = dyn_cast<PHINode>(Val: Incr);
2111 if (PN != nullptr) {
2112 AddInst = nullptr;
2113 AddOffset = nullptr;
2114 } else {
2115 // Search through a NUW add on top of the loop increment.
2116 Value *V0, *V1;
2117 if (!match(V: Incr, P: m_NUWAdd(L: m_Value(V&: V0), R: m_Value(V&: V1))))
2118 return false;
2119
2120 AddInst = Incr;
2121 PN = dyn_cast<PHINode>(Val: V0);
2122 if (PN != nullptr) {
2123 AddOffset = V1;
2124 } else {
2125 PN = dyn_cast<PHINode>(Val: V1);
2126 AddOffset = V0;
2127 }
2128 }
2129
2130 if (!PN)
2131 return false;
2132
2133 // This isn't strictly necessary, what we really need is one increment and any
2134 // amount of initial values all being the same.
2135 if (PN->getNumIncomingValues() != 2)
2136 return false;
2137
2138 // Only trivially analyzable loops.
2139 Loop *L = LI->getLoopFor(BB: PN->getParent());
2140 if (!L || !L->getLoopPreheader() || !L->getLoopLatch())
2141 return false;
2142
2143 // Req that the remainder is in the loop
2144 if (!L->contains(Inst: Rem))
2145 return false;
2146
2147 // Only works if the remainder amount is a loop invaraint
2148 if (!L->isLoopInvariant(V: RemAmt))
2149 return false;
2150
2151 // Only works if the AddOffset is a loop invaraint
2152 if (AddOffset && !L->isLoopInvariant(V: AddOffset))
2153 return false;
2154
2155 // Is the PHI a loop increment?
2156 auto LoopIncrInfo = getIVIncrement(PN, LI);
2157 if (!LoopIncrInfo)
2158 return false;
2159
2160 // We need remainder_amount % increment_amount to be zero. Increment of one
2161 // satisfies that without any special logic and is overwhelmingly the common
2162 // case.
2163 if (!match(V: LoopIncrInfo->second, P: m_One()))
2164 return false;
2165
2166 // Need the increment to not overflow.
2167 if (!match(V: LoopIncrInfo->first, P: m_c_NUWAdd(L: m_Specific(V: PN), R: m_Value())))
2168 return false;
2169
2170 // Set output variables.
2171 RemAmtOut = RemAmt;
2172 LoopIncrPNOut = PN;
2173 AddInstOut = AddInst;
2174 AddOffsetOut = AddOffset;
2175
2176 return true;
2177}
2178
2179// Try to transform:
2180//
2181// for(i = Start; i < End; ++i)
2182// Rem = (i nuw+ IncrLoopInvariant) u% RemAmtLoopInvariant;
2183//
2184// ->
2185//
2186// Rem = (Start nuw+ IncrLoopInvariant) % RemAmtLoopInvariant;
2187// for(i = Start; i < End; ++i, ++rem)
2188// Rem = rem == RemAmtLoopInvariant ? 0 : Rem;
2189static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL,
2190 const LoopInfo *LI,
2191 SmallPtrSet<BasicBlock *, 32> &FreshBBs,
2192 bool IsHuge) {
2193 Value *AddOffset, *RemAmt, *AddInst;
2194 PHINode *LoopIncrPN;
2195 if (!isRemOfLoopIncrementWithLoopInvariant(Rem, LI, RemAmtOut&: RemAmt, AddInstOut&: AddInst,
2196 AddOffsetOut&: AddOffset, LoopIncrPNOut&: LoopIncrPN))
2197 return false;
2198
2199 // Only non-constant remainder as the extra IV is probably not profitable
2200 // in that case.
2201 //
2202 // Potential TODO(1): `urem` of a const ends up as `mul` + `shift` + `add`. If
2203 // we can rule out register pressure and ensure this `urem` is executed each
2204 // iteration, its probably profitable to handle the const case as well.
2205 //
2206 // Potential TODO(2): Should we have a check for how "nested" this remainder
2207 // operation is? The new code runs every iteration so if the remainder is
2208 // guarded behind unlikely conditions this might not be worth it.
2209 if (match(V: RemAmt, P: m_ImmConstant()))
2210 return false;
2211
2212 Loop *L = LI->getLoopFor(BB: LoopIncrPN->getParent());
2213 Value *Start = LoopIncrPN->getIncomingValueForBlock(BB: L->getLoopPreheader());
2214 // If we have add create initial value for remainder.
2215 // The logic here is:
2216 // (urem (add nuw Start, IncrLoopInvariant), RemAmtLoopInvariant
2217 //
2218 // Only proceed if the expression simplifies (otherwise we can't fully
2219 // optimize out the urem).
2220 if (AddInst) {
2221 assert(AddOffset && "We found an add but missing values");
2222 // Without dom-condition/assumption cache we aren't likely to get much out
2223 // of a context instruction.
2224 Start = simplifyAddInst(LHS: Start, RHS: AddOffset,
2225 IsNSW: match(V: AddInst, P: m_NSWAdd(L: m_Value(), R: m_Value())),
2226 /*IsNUW=*/true, Q: *DL);
2227 if (!Start)
2228 return false;
2229 }
2230
2231 // If we can't fully optimize out the `rem`, skip this transform.
2232 Start = simplifyURemInst(LHS: Start, RHS: RemAmt, Q: *DL);
2233 if (!Start)
2234 return false;
2235
2236 // Create new remainder with induction variable.
2237 Type *Ty = Rem->getType();
2238 IRBuilder<> Builder(Rem->getContext());
2239
2240 Builder.SetInsertPoint(LoopIncrPN);
2241 PHINode *NewRem = Builder.CreatePHI(Ty, NumReservedValues: 2);
2242
2243 Builder.SetInsertPoint(cast<Instruction>(
2244 Val: LoopIncrPN->getIncomingValueForBlock(BB: L->getLoopLatch())));
2245 // `(add (urem x, y), 1)` is always nuw.
2246 Value *RemAdd = Builder.CreateNUWAdd(LHS: NewRem, RHS: ConstantInt::get(Ty, V: 1));
2247 Value *RemCmp = Builder.CreateICmp(P: ICmpInst::ICMP_EQ, LHS: RemAdd, RHS: RemAmt);
2248 Value *RemSel =
2249 Builder.CreateSelect(C: RemCmp, True: Constant::getNullValue(Ty), False: RemAdd);
2250
2251 NewRem->addIncoming(V: Start, BB: L->getLoopPreheader());
2252 NewRem->addIncoming(V: RemSel, BB: L->getLoopLatch());
2253
2254 // Insert all touched BBs.
2255 FreshBBs.insert(Ptr: LoopIncrPN->getParent());
2256 FreshBBs.insert(Ptr: L->getLoopLatch());
2257 FreshBBs.insert(Ptr: Rem->getParent());
2258 if (AddInst)
2259 FreshBBs.insert(Ptr: cast<Instruction>(Val: AddInst)->getParent());
2260 replaceAllUsesWith(Old: Rem, New: NewRem, FreshBBs, IsHuge);
2261 Rem->eraseFromParent();
2262 if (AddInst && AddInst->use_empty())
2263 cast<Instruction>(Val: AddInst)->eraseFromParent();
2264 return true;
2265}
2266
2267bool CodeGenPrepare::optimizeURem(Instruction *Rem) {
2268 if (foldURemOfLoopIncrement(Rem, DL, LI, FreshBBs, IsHuge: IsHugeFunc))
2269 return true;
2270 return false;
2271}
2272
2273bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
2274 if (sinkCmpExpression(Cmp, TLI: *TLI, DL: *DL))
2275 return true;
2276
2277 if (combineToUAddWithOverflow(Cmp, ModifiedDT))
2278 return true;
2279
2280 if (combineToUSubWithOverflow(Cmp, ModifiedDT))
2281 return true;
2282
2283 if (unfoldPowerOf2Test(Cmp))
2284 return true;
2285
2286 if (foldICmpWithDominatingICmp(Cmp, TLI: *TLI))
2287 return true;
2288
2289 if (swapICmpOperandsToExposeCSEOpportunities(Cmp))
2290 return true;
2291
2292 if (foldFCmpToFPClassTest(Cmp, TLI: *TLI, DL: *DL))
2293 return true;
2294
2295 return false;
2296}
2297
2298/// Duplicate and sink the given 'and' instruction into user blocks where it is
2299/// used in a compare to allow isel to generate better code for targets where
2300/// this operation can be combined.
2301///
2302/// Return true if any changes are made.
2303static bool sinkAndCmp0Expression(Instruction *AndI, const TargetLowering &TLI,
2304 SetOfInstrs &InsertedInsts) {
2305 // Double-check that we're not trying to optimize an instruction that was
2306 // already optimized by some other part of this pass.
2307 assert(!InsertedInsts.count(AndI) &&
2308 "Attempting to optimize already optimized and instruction");
2309 (void)InsertedInsts;
2310
2311 // Nothing to do for single use in same basic block.
2312 if (AndI->hasOneUse() &&
2313 AndI->getParent() == cast<Instruction>(Val: *AndI->user_begin())->getParent())
2314 return false;
2315
2316 // Try to avoid cases where sinking/duplicating is likely to increase register
2317 // pressure.
2318 if (!isa<ConstantInt>(Val: AndI->getOperand(i: 0)) &&
2319 !isa<ConstantInt>(Val: AndI->getOperand(i: 1)) &&
2320 AndI->getOperand(i: 0)->hasOneUse() && AndI->getOperand(i: 1)->hasOneUse())
2321 return false;
2322
2323 for (auto *U : AndI->users()) {
2324 Instruction *User = cast<Instruction>(Val: U);
2325
2326 // Only sink 'and' feeding icmp with 0.
2327 if (!isa<ICmpInst>(Val: User))
2328 return false;
2329
2330 auto *CmpC = dyn_cast<ConstantInt>(Val: User->getOperand(i: 1));
2331 if (!CmpC || !CmpC->isZero())
2332 return false;
2333 }
2334
2335 if (!TLI.isMaskAndCmp0FoldingBeneficial(AndI: *AndI))
2336 return false;
2337
2338 LLVM_DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
2339 LLVM_DEBUG(AndI->getParent()->dump());
2340
2341 // Push the 'and' into the same block as the icmp 0. There should only be
2342 // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
2343 // others, so we don't need to keep track of which BBs we insert into.
2344 for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
2345 UI != E;) {
2346 Use &TheUse = UI.getUse();
2347 Instruction *User = cast<Instruction>(Val: *UI);
2348
2349 // Preincrement use iterator so we don't invalidate it.
2350 ++UI;
2351
2352 LLVM_DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
2353
2354 // Keep the 'and' in the same place if the use is already in the same block.
2355 Instruction *InsertPt =
2356 User->getParent() == AndI->getParent() ? AndI : User;
2357 Instruction *InsertedAnd = BinaryOperator::Create(
2358 Op: Instruction::And, S1: AndI->getOperand(i: 0), S2: AndI->getOperand(i: 1), Name: "",
2359 InsertBefore: InsertPt->getIterator());
2360 // Propagate the debug info.
2361 InsertedAnd->setDebugLoc(AndI->getDebugLoc());
2362
2363 // Replace a use of the 'and' with a use of the new 'and'.
2364 TheUse = InsertedAnd;
2365 ++NumAndUses;
2366 LLVM_DEBUG(User->getParent()->dump());
2367 }
2368
2369 // We removed all uses, nuke the and.
2370 AndI->eraseFromParent();
2371 return true;
2372}
2373
2374/// Check if the candidates could be combined with a shift instruction, which
2375/// includes:
2376/// 1. Truncate instruction
2377/// 2. And instruction and the imm is a mask of the low bits:
2378/// imm & (imm+1) == 0
2379static bool isExtractBitsCandidateUse(Instruction *User) {
2380 if (!isa<TruncInst>(Val: User)) {
2381 if (User->getOpcode() != Instruction::And ||
2382 !isa<ConstantInt>(Val: User->getOperand(i: 1)))
2383 return false;
2384
2385 const APInt &Cimm = cast<ConstantInt>(Val: User->getOperand(i: 1))->getValue();
2386
2387 if ((Cimm & (Cimm + 1)).getBoolValue())
2388 return false;
2389 }
2390 return true;
2391}
2392
2393/// Sink both shift and truncate instruction to the use of truncate's BB.
2394static bool
2395SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
2396 DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
2397 const TargetLowering &TLI, const DataLayout &DL) {
2398 BasicBlock *UserBB = User->getParent();
2399 DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
2400 auto *TruncI = cast<TruncInst>(Val: User);
2401 bool MadeChange = false;
2402
2403 for (Value::user_iterator TruncUI = TruncI->user_begin(),
2404 TruncE = TruncI->user_end();
2405 TruncUI != TruncE;) {
2406
2407 Use &TruncTheUse = TruncUI.getUse();
2408 Instruction *TruncUser = cast<Instruction>(Val: *TruncUI);
2409 // Preincrement use iterator so we don't invalidate it.
2410
2411 ++TruncUI;
2412
2413 int ISDOpcode = TLI.InstructionOpcodeToISD(Opcode: TruncUser->getOpcode());
2414 if (!ISDOpcode)
2415 continue;
2416
2417 // If the use is actually a legal node, there will not be an
2418 // implicit truncate.
2419 // FIXME: always querying the result type is just an
2420 // approximation; some nodes' legality is determined by the
2421 // operand or other means. There's no good way to find out though.
2422 if (TLI.isOperationLegalOrCustom(
2423 Op: ISDOpcode, VT: TLI.getValueType(DL, Ty: TruncUser->getType(), AllowUnknown: true)))
2424 continue;
2425
2426 // Don't bother for PHI nodes.
2427 if (isa<PHINode>(Val: TruncUser))
2428 continue;
2429
2430 BasicBlock *TruncUserBB = TruncUser->getParent();
2431
2432 if (UserBB == TruncUserBB)
2433 continue;
2434
2435 BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
2436 CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
2437
2438 if (!InsertedShift && !InsertedTrunc) {
2439 BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
2440 assert(InsertPt != TruncUserBB->end());
2441 // Sink the shift
2442 if (ShiftI->getOpcode() == Instruction::AShr)
2443 InsertedShift =
2444 BinaryOperator::CreateAShr(V1: ShiftI->getOperand(i_nocapture: 0), V2: CI, Name: "");
2445 else
2446 InsertedShift =
2447 BinaryOperator::CreateLShr(V1: ShiftI->getOperand(i_nocapture: 0), V2: CI, Name: "");
2448 InsertedShift->setDebugLoc(ShiftI->getDebugLoc());
2449 InsertedShift->insertBefore(BB&: *TruncUserBB, InsertPos: InsertPt);
2450
2451 // Sink the trunc
2452 BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
2453 TruncInsertPt++;
2454 // It will go ahead of any debug-info.
2455 TruncInsertPt.setHeadBit(true);
2456 assert(TruncInsertPt != TruncUserBB->end());
2457
2458 InsertedTrunc = CastInst::Create(TruncI->getOpcode(), S: InsertedShift,
2459 Ty: TruncI->getType(), Name: "");
2460 InsertedTrunc->insertBefore(BB&: *TruncUserBB, InsertPos: TruncInsertPt);
2461 InsertedTrunc->setDebugLoc(TruncI->getDebugLoc());
2462
2463 MadeChange = true;
2464
2465 TruncTheUse = InsertedTrunc;
2466 }
2467 }
2468 return MadeChange;
2469}
2470
2471/// Sink the shift *right* instruction into user blocks if the uses could
2472/// potentially be combined with this shift instruction and generate BitExtract
2473/// instruction. It will only be applied if the architecture supports BitExtract
2474/// instruction. Here is an example:
2475/// BB1:
2476/// %x.extract.shift = lshr i64 %arg1, 32
2477/// BB2:
2478/// %x.extract.trunc = trunc i64 %x.extract.shift to i16
2479/// ==>
2480///
2481/// BB2:
2482/// %x.extract.shift.1 = lshr i64 %arg1, 32
2483/// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
2484///
2485/// CodeGen will recognize the pattern in BB2 and generate BitExtract
2486/// instruction.
2487/// Return true if any changes are made.
2488static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
2489 const TargetLowering &TLI,
2490 const DataLayout &DL) {
2491 BasicBlock *DefBB = ShiftI->getParent();
2492
2493 /// Only insert instructions in each block once.
2494 DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
2495
2496 bool shiftIsLegal = TLI.isTypeLegal(VT: TLI.getValueType(DL, Ty: ShiftI->getType()));
2497
2498 bool MadeChange = false;
2499 for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
2500 UI != E;) {
2501 Use &TheUse = UI.getUse();
2502 Instruction *User = cast<Instruction>(Val: *UI);
2503 // Preincrement use iterator so we don't invalidate it.
2504 ++UI;
2505
2506 // Don't bother for PHI nodes.
2507 if (isa<PHINode>(Val: User))
2508 continue;
2509
2510 if (!isExtractBitsCandidateUse(User))
2511 continue;
2512
2513 BasicBlock *UserBB = User->getParent();
2514
2515 if (UserBB == DefBB) {
2516 // If the shift and truncate instruction are in the same BB. The use of
2517 // the truncate(TruncUse) may still introduce another truncate if not
2518 // legal. In this case, we would like to sink both shift and truncate
2519 // instruction to the BB of TruncUse.
2520 // for example:
2521 // BB1:
2522 // i64 shift.result = lshr i64 opnd, imm
2523 // trunc.result = trunc shift.result to i16
2524 //
2525 // BB2:
2526 // ----> We will have an implicit truncate here if the architecture does
2527 // not have i16 compare.
2528 // cmp i16 trunc.result, opnd2
2529 //
2530 if (isa<TruncInst>(Val: User) &&
2531 shiftIsLegal
2532 // If the type of the truncate is legal, no truncate will be
2533 // introduced in other basic blocks.
2534 && (!TLI.isTypeLegal(VT: TLI.getValueType(DL, Ty: User->getType()))))
2535 MadeChange =
2536 SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);
2537
2538 continue;
2539 }
2540 // If we have already inserted a shift into this block, use it.
2541 BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
2542
2543 if (!InsertedShift) {
2544 BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
2545 assert(InsertPt != UserBB->end());
2546
2547 if (ShiftI->getOpcode() == Instruction::AShr)
2548 InsertedShift =
2549 BinaryOperator::CreateAShr(V1: ShiftI->getOperand(i_nocapture: 0), V2: CI, Name: "");
2550 else
2551 InsertedShift =
2552 BinaryOperator::CreateLShr(V1: ShiftI->getOperand(i_nocapture: 0), V2: CI, Name: "");
2553 InsertedShift->insertBefore(BB&: *UserBB, InsertPos: InsertPt);
2554 InsertedShift->setDebugLoc(ShiftI->getDebugLoc());
2555
2556 MadeChange = true;
2557 }
2558
2559 // Replace a use of the shift with a use of the new shift.
2560 TheUse = InsertedShift;
2561 }
2562
2563 // If we removed all uses, or there are none, nuke the shift.
2564 if (ShiftI->use_empty()) {
2565 salvageDebugInfo(I&: *ShiftI);
2566 ShiftI->eraseFromParent();
2567 MadeChange = true;
2568 }
2569
2570 return MadeChange;
2571}
2572
2573/// If counting leading or trailing zeros is an expensive operation and a zero
2574/// input is defined, add a check for zero to avoid calling the intrinsic.
2575///
2576/// We want to transform:
2577/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
2578///
2579/// into:
2580/// entry:
2581/// %cmpz = icmp eq i64 %A, 0
2582/// br i1 %cmpz, label %cond.end, label %cond.false
2583/// cond.false:
2584/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
2585/// br label %cond.end
2586/// cond.end:
2587/// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
2588///
2589/// If the transform is performed, return true and set ModifiedDT to true.
2590static bool despeculateCountZeros(IntrinsicInst *CountZeros,
2591 DomTreeUpdater *DTU, LoopInfo *LI,
2592 const TargetLowering *TLI,
2593 const DataLayout *DL, ModifyDT &ModifiedDT,
2594 SmallPtrSet<BasicBlock *, 32> &FreshBBs,
2595 bool IsHugeFunc) {
2596 // If a zero input is undefined, it doesn't make sense to despeculate that.
2597 if (match(V: CountZeros->getOperand(i_nocapture: 1), P: m_One()))
2598 return false;
2599
2600 // If it's cheap to speculate, there's nothing to do.
2601 Type *Ty = CountZeros->getType();
2602 auto IntrinsicID = CountZeros->getIntrinsicID();
2603 if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz(Ty)) ||
2604 (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty)))
2605 return false;
2606
2607 // Only handle scalar cases. Anything else requires too much work.
2608 unsigned SizeInBits = Ty->getScalarSizeInBits();
2609 if (Ty->isVectorTy())
2610 return false;
2611
2612 // Bail if the value is never zero.
2613 Use &Op = CountZeros->getOperandUse(i: 0);
2614 if (isKnownNonZero(V: Op, Q: *DL))
2615 return false;
2616
2617 // The intrinsic will be sunk behind a compare against zero and branch.
2618 BasicBlock *StartBlock = CountZeros->getParent();
2619 BasicBlock *CallBlock = SplitBlock(Old: StartBlock, SplitPt: CountZeros, DTU, LI,
2620 /* MSSAU */ nullptr, BBName: "cond.false");
2621 if (IsHugeFunc)
2622 FreshBBs.insert(Ptr: CallBlock);
2623
2624 // Create another block after the count zero intrinsic. A PHI will be added
2625 // in this block to select the result of the intrinsic or the bit-width
2626 // constant if the input to the intrinsic is zero.
2627 BasicBlock::iterator SplitPt = std::next(x: BasicBlock::iterator(CountZeros));
2628 // Any debug-info after CountZeros should not be included.
2629 SplitPt.setHeadBit(true);
2630 BasicBlock *EndBlock = SplitBlock(Old: CallBlock, SplitPt: &*SplitPt, DTU, LI,
2631 /* MSSAU */ nullptr, BBName: "cond.end");
2632 if (IsHugeFunc)
2633 FreshBBs.insert(Ptr: EndBlock);
2634
2635 // Set up a builder to create a compare, conditional branch, and PHI.
2636 IRBuilder<> Builder(CountZeros->getContext());
2637 Builder.SetInsertPoint(StartBlock->getTerminator());
2638 Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());
2639
2640 // Replace the unconditional branch that was created by the first split with
2641 // a compare against zero and a conditional branch.
2642 Value *Zero = Constant::getNullValue(Ty);
2643 // Avoid introducing branch on poison. This also replaces the ctz operand.
2644 if (!isGuaranteedNotToBeUndefOrPoison(V: Op))
2645 Op = Builder.CreateFreeze(V: Op, Name: Op->getName() + ".fr");
2646 Value *Cmp = Builder.CreateICmpEQ(LHS: Op, RHS: Zero, Name: "cmpz");
2647 Builder.CreateCondBr(Cond: Cmp, True: EndBlock, False: CallBlock);
2648 StartBlock->getTerminator()->eraseFromParent();
2649 DTU->applyUpdates(Updates: {{DominatorTree::Insert, StartBlock, EndBlock}});
2650
2651 // Create a PHI in the end block to select either the output of the intrinsic
2652 // or the bit width of the operand.
2653 Builder.SetInsertPoint(TheBB: EndBlock, IP: EndBlock->begin());
2654 PHINode *PN = Builder.CreatePHI(Ty, NumReservedValues: 2, Name: "ctz");
2655 replaceAllUsesWith(Old: CountZeros, New: PN, FreshBBs, IsHuge: IsHugeFunc);
2656 Value *BitWidth = Builder.getInt(AI: APInt(SizeInBits, SizeInBits));
2657 PN->addIncoming(V: BitWidth, BB: StartBlock);
2658 PN->addIncoming(V: CountZeros, BB: CallBlock);
2659
2660 // We are explicitly handling the zero case, so we can set the intrinsic's
2661 // undefined zero argument to 'true'. This will also prevent reprocessing the
2662 // intrinsic; we only despeculate when a zero input is defined.
2663 CountZeros->setArgOperand(i: 1, v: Builder.getTrue());
2664 ModifiedDT = ModifyDT::ModifyBBDT;
2665 return true;
2666}
2667
2668bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
2669 BasicBlock *BB = CI->getParent();
2670
2671 // Sink address computing for memory operands into the block.
2672 if (CI->isInlineAsm() && optimizeInlineAsmInst(CS: CI))
2673 return true;
2674
2675 // Align the pointer arguments to this call if the target thinks it's a good
2676 // idea
2677 unsigned MinSize;
2678 Align PrefAlign;
2679 if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
2680 for (auto &Arg : CI->args()) {
2681 // We want to align both objects whose address is used directly and
2682 // objects whose address is used in casts and GEPs, though it only makes
2683 // sense for GEPs if the offset is a multiple of the desired alignment and
2684 // if size - offset meets the size threshold.
2685 if (!Arg->getType()->isPointerTy())
2686 continue;
2687 APInt Offset(DL->getIndexSizeInBits(
2688 AS: cast<PointerType>(Val: Arg->getType())->getAddressSpace()),
2689 0);
2690 Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(DL: *DL, Offset);
2691 uint64_t Offset2 = Offset.getLimitedValue();
2692 if (!isAligned(Lhs: PrefAlign, SizeInBytes: Offset2))
2693 continue;
2694 AllocaInst *AI;
2695 if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlign() < PrefAlign) {
2696 std::optional<TypeSize> AllocaSize = AI->getAllocationSize(DL: *DL);
2697 if (AllocaSize && AllocaSize->getKnownMinValue() >= MinSize + Offset2)
2698 AI->setAlignment(PrefAlign);
2699 }
2700 // Global variables can only be aligned if they are defined in this
2701 // object (i.e. they are uniquely initialized in this object), and
2702 // over-aligning global variables that have an explicit section is
2703 // forbidden.
2704 GlobalVariable *GV;
2705 if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
2706 GV->getPointerAlignment(DL: *DL) < PrefAlign &&
2707 GV->getGlobalSize(DL: *DL) >= MinSize + Offset2)
2708 GV->setAlignment(PrefAlign);
2709 }
2710 }
2711 // If this is a memcpy (or similar) then we may be able to improve the
2712 // alignment.
2713 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: CI)) {
2714 Align DestAlign = getKnownAlignment(V: MI->getDest(), DL: *DL);
2715 MaybeAlign MIDestAlign = MI->getDestAlign();
2716 if (!MIDestAlign || DestAlign > *MIDestAlign)
2717 MI->setDestAlignment(DestAlign);
2718 if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Val: MI)) {
2719 MaybeAlign MTISrcAlign = MTI->getSourceAlign();
2720 Align SrcAlign = getKnownAlignment(V: MTI->getSource(), DL: *DL);
2721 if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
2722 MTI->setSourceAlignment(SrcAlign);
2723 }
2724 }
2725
2726 // If we have a cold call site, try to sink addressing computation into the
2727 // cold block. This interacts with our handling for loads and stores to
2728 // ensure that we can fold all uses of a potential addressing computation
2729 // into their uses. TODO: generalize this to work over profiling data
2730 if (CI->hasFnAttr(Kind: Attribute::Cold) &&
2731 !llvm::shouldOptimizeForSize(BB, PSI, BFI))
2732 for (auto &Arg : CI->args()) {
2733 if (!Arg->getType()->isPointerTy())
2734 continue;
2735 unsigned AS = Arg->getType()->getPointerAddressSpace();
2736 if (optimizeMemoryInst(MemoryInst: CI, Addr: Arg, AccessTy: Arg->getType(), AddrSpace: AS))
2737 return true;
2738 }
2739
2740 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI);
2741 if (II) {
2742 switch (II->getIntrinsicID()) {
2743 default:
2744 break;
2745 case Intrinsic::assume:
2746 llvm_unreachable("llvm.assume should have been removed already");
2747 case Intrinsic::allow_runtime_check:
2748 case Intrinsic::allow_ubsan_check:
2749 case Intrinsic::experimental_widenable_condition: {
2750 // Give up on future widening opportunities so that we can fold away dead
2751 // paths and merge blocks before going into block-local instruction
2752 // selection.
2753 if (II->use_empty()) {
2754 II->eraseFromParent();
2755 return true;
2756 }
2757 Constant *RetVal = ConstantInt::getTrue(Context&: II->getContext());
2758 resetIteratorIfInvalidatedWhileCalling(BB, f: [&]() {
2759 replaceAndRecursivelySimplify(I: CI, SimpleV: RetVal, TLI: TLInfo, DT: nullptr);
2760 });
2761 return true;
2762 }
2763 case Intrinsic::objectsize:
2764 llvm_unreachable("llvm.objectsize.* should have been lowered already");
2765 case Intrinsic::is_constant:
2766 llvm_unreachable("llvm.is.constant.* should have been lowered already");
2767 case Intrinsic::aarch64_stlxr:
2768 case Intrinsic::aarch64_stxr: {
2769 ZExtInst *ExtVal = dyn_cast<ZExtInst>(Val: CI->getArgOperand(i: 0));
2770 if (!ExtVal || !ExtVal->hasOneUse() ||
2771 ExtVal->getParent() == CI->getParent())
2772 return false;
2773 // Sink a zext feeding stlxr/stxr before it, so it can be folded into it.
2774 ExtVal->moveBefore(InsertPos: CI->getIterator());
2775 // Mark this instruction as "inserted by CGP", so that other
2776 // optimizations don't touch it.
2777 InsertedInsts.insert(Ptr: ExtVal);
2778 return true;
2779 }
2780
2781 case Intrinsic::launder_invariant_group:
2782 case Intrinsic::strip_invariant_group: {
2783 Value *ArgVal = II->getArgOperand(i: 0);
2784 auto it = LargeOffsetGEPMap.find(Key: II);
2785 if (it != LargeOffsetGEPMap.end()) {
2786 // Merge entries in LargeOffsetGEPMap to reflect the RAUW.
2787 // Make sure not to have to deal with iterator invalidation
2788 // after possibly adding ArgVal to LargeOffsetGEPMap.
2789 auto GEPs = std::move(it->second);
2790 LargeOffsetGEPMap[ArgVal].append(in_start: GEPs.begin(), in_end: GEPs.end());
2791 LargeOffsetGEPMap.erase(Key: II);
2792 }
2793
2794 replaceAllUsesWith(Old: II, New: ArgVal, FreshBBs, IsHuge: IsHugeFunc);
2795 II->eraseFromParent();
2796 return true;
2797 }
2798 case Intrinsic::cttz:
2799 case Intrinsic::ctlz:
2800 // If counting zeros is expensive, try to avoid it.
2801 return despeculateCountZeros(CountZeros: II, DTU, LI, TLI, DL, ModifiedDT, FreshBBs,
2802 IsHugeFunc);
2803 case Intrinsic::fshl:
2804 case Intrinsic::fshr:
2805 return optimizeFunnelShift(Fsh: II);
2806 case Intrinsic::masked_gather:
2807 return optimizeGatherScatterInst(MemoryInst: II, Ptr: II->getArgOperand(i: 0));
2808 case Intrinsic::masked_scatter:
2809 return optimizeGatherScatterInst(MemoryInst: II, Ptr: II->getArgOperand(i: 1));
2810 case Intrinsic::masked_load:
2811 // Treat v1X masked load as load X type.
2812 if (auto *VT = dyn_cast<FixedVectorType>(Val: II->getType())) {
2813 if (VT->getNumElements() == 1) {
2814 Value *PtrVal = II->getArgOperand(i: 0);
2815 unsigned AS = PtrVal->getType()->getPointerAddressSpace();
2816 if (optimizeMemoryInst(MemoryInst: II, Addr: PtrVal, AccessTy: VT->getElementType(), AddrSpace: AS))
2817 return true;
2818 }
2819 }
2820 return false;
2821 case Intrinsic::masked_store:
2822 // Treat v1X masked store as store X type.
2823 if (auto *VT =
2824 dyn_cast<FixedVectorType>(Val: II->getArgOperand(i: 0)->getType())) {
2825 if (VT->getNumElements() == 1) {
2826 Value *PtrVal = II->getArgOperand(i: 1);
2827 unsigned AS = PtrVal->getType()->getPointerAddressSpace();
2828 if (optimizeMemoryInst(MemoryInst: II, Addr: PtrVal, AccessTy: VT->getElementType(), AddrSpace: AS))
2829 return true;
2830 }
2831 }
2832 return false;
2833 case Intrinsic::umul_with_overflow:
2834 return optimizeMulWithOverflow(I: II, /*IsSigned=*/false, ModifiedDT);
2835 case Intrinsic::smul_with_overflow:
2836 return optimizeMulWithOverflow(I: II, /*IsSigned=*/true, ModifiedDT);
2837 }
2838
2839 SmallVector<Value *, 2> PtrOps;
2840 Type *AccessTy;
2841 if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
2842 while (!PtrOps.empty()) {
2843 Value *PtrVal = PtrOps.pop_back_val();
2844 unsigned AS = PtrVal->getType()->getPointerAddressSpace();
2845 if (optimizeMemoryInst(MemoryInst: II, Addr: PtrVal, AccessTy, AddrSpace: AS))
2846 return true;
2847 }
2848 }
2849
2850 // From here on out we're working with named functions.
2851 auto *Callee = CI->getCalledFunction();
2852 if (!Callee)
2853 return false;
2854
2855 // Lower all default uses of _chk calls. This is very similar
2856 // to what InstCombineCalls does, but here we are only lowering calls
2857 // to fortified library functions (e.g. __memcpy_chk) that have the default
2858 // "don't know" as the objectsize. Anything else should be left alone.
2859 FortifiedLibCallSimplifier Simplifier(TLInfo, true);
2860 IRBuilder<> Builder(CI);
2861 if (Value *V = Simplifier.optimizeCall(CI, B&: Builder)) {
2862 replaceAllUsesWith(Old: CI, New: V, FreshBBs, IsHuge: IsHugeFunc);
2863 CI->eraseFromParent();
2864 return true;
2865 }
2866
2867 // SCCP may have propagated, among other things, C++ static variables across
2868 // calls. If this happens to be the case, we may want to undo it in order to
2869 // avoid redundant pointer computation of the constant, as the function method
2870 // returning the constant needs to be executed anyways.
2871 auto GetUniformReturnValue = [](const Function *F) -> GlobalVariable * {
2872 if (!F->getReturnType()->isPointerTy())
2873 return nullptr;
2874
2875 GlobalVariable *UniformValue = nullptr;
2876 for (auto &BB : *F) {
2877 if (auto *RI = dyn_cast<ReturnInst>(Val: BB.getTerminator())) {
2878 if (auto *V = dyn_cast<GlobalVariable>(Val: RI->getReturnValue())) {
2879 if (!UniformValue)
2880 UniformValue = V;
2881 else if (V != UniformValue)
2882 return nullptr;
2883 } else {
2884 return nullptr;
2885 }
2886 }
2887 }
2888
2889 return UniformValue;
2890 };
2891
2892 if (Callee->hasExactDefinition()) {
2893 if (GlobalVariable *RV = GetUniformReturnValue(Callee)) {
2894 bool MadeChange = false;
2895 for (Use &U : make_early_inc_range(Range: RV->uses())) {
2896 auto *I = dyn_cast<Instruction>(Val: U.getUser());
2897 if (!I || I->getParent() != CI->getParent()) {
2898 // Limit to the same basic block to avoid extending the call-site live
2899 // range, which otherwise could increase register pressure.
2900 continue;
2901 }
2902 if (CI->comesBefore(Other: I)) {
2903 U.set(CI);
2904 MadeChange = true;
2905 }
2906 }
2907
2908 return MadeChange;
2909 }
2910 }
2911
2912 return false;
2913}
2914
2915static bool isIntrinsicOrLFToBeTailCalled(const TargetLibraryInfo *TLInfo,
2916 const CallInst *CI) {
2917 assert(CI && CI->use_empty());
2918
2919 if (const auto *II = dyn_cast<IntrinsicInst>(Val: CI))
2920 switch (II->getIntrinsicID()) {
2921 case Intrinsic::memset:
2922 case Intrinsic::memcpy:
2923 case Intrinsic::memmove:
2924 return true;
2925 default:
2926 return false;
2927 }
2928
2929 LibFunc LF;
2930 Function *Callee = CI->getCalledFunction();
2931 if (Callee && TLInfo && TLInfo->getLibFunc(FDecl: *Callee, F&: LF))
2932 switch (LF) {
2933 case LibFunc_strcpy:
2934 case LibFunc_strncpy:
2935 case LibFunc_strcat:
2936 case LibFunc_strncat:
2937 return true;
2938 default:
2939 return false;
2940 }
2941
2942 return false;
2943}
2944
2945/// Look for opportunities to duplicate return instructions to the predecessor
2946/// to enable tail call optimizations. The case it is currently looking for is
2947/// the following one. Known intrinsics or library function that may be tail
2948/// called are taken into account as well.
2949/// @code
2950/// bb0:
2951/// %tmp0 = tail call i32 @f0()
2952/// br label %return
2953/// bb1:
2954/// %tmp1 = tail call i32 @f1()
2955/// br label %return
2956/// bb2:
2957/// %tmp2 = tail call i32 @f2()
2958/// br label %return
2959/// return:
2960/// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
2961/// ret i32 %retval
2962/// @endcode
2963///
2964/// =>
2965///
2966/// @code
2967/// bb0:
2968/// %tmp0 = tail call i32 @f0()
2969/// ret i32 %tmp0
2970/// bb1:
2971/// %tmp1 = tail call i32 @f1()
2972/// ret i32 %tmp1
2973/// bb2:
2974/// %tmp2 = tail call i32 @f2()
2975/// ret i32 %tmp2
2976/// @endcode
2977bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,
2978 ModifyDT &ModifiedDT) {
2979 if (!BB->getTerminator())
2980 return false;
2981
2982 ReturnInst *RetI = dyn_cast<ReturnInst>(Val: BB->getTerminator());
2983 if (!RetI)
2984 return false;
2985
2986 assert(LI->getLoopFor(BB) == nullptr && "A return block cannot be in a loop");
2987
2988 PHINode *PN = nullptr;
2989 ExtractValueInst *EVI = nullptr;
2990 BitCastInst *BCI = nullptr;
2991 Value *V = RetI->getReturnValue();
2992 if (V) {
2993 BCI = dyn_cast<BitCastInst>(Val: V);
2994 if (BCI)
2995 V = BCI->getOperand(i_nocapture: 0);
2996
2997 EVI = dyn_cast<ExtractValueInst>(Val: V);
2998 if (EVI) {
2999 V = EVI->getOperand(i_nocapture: 0);
3000 if (!llvm::all_of(Range: EVI->indices(), P: equal_to(Arg: 0)))
3001 return false;
3002 }
3003
3004 PN = dyn_cast<PHINode>(Val: V);
3005 }
3006
3007 if (PN && PN->getParent() != BB)
3008 return false;
3009
3010 auto isLifetimeEndOrBitCastFor = [](const Instruction *Inst) {
3011 const BitCastInst *BC = dyn_cast<BitCastInst>(Val: Inst);
3012 if (BC && BC->hasOneUse())
3013 Inst = BC->user_back();
3014
3015 if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: Inst))
3016 return II->getIntrinsicID() == Intrinsic::lifetime_end;
3017 return false;
3018 };
3019
3020 SmallVector<const IntrinsicInst *, 4> FakeUses;
3021
3022 auto isFakeUse = [&FakeUses](const Instruction *Inst) {
3023 if (auto *II = dyn_cast<IntrinsicInst>(Val: Inst);
3024 II && II->getIntrinsicID() == Intrinsic::fake_use) {
3025 // Record the instruction so it can be preserved when the exit block is
3026 // removed. Do not preserve the fake use that uses the result of the
3027 // PHI instruction.
3028 // Do not copy fake uses that use the result of a PHI node.
3029 // FIXME: If we do want to copy the fake use into the return blocks, we
3030 // have to figure out which of the PHI node operands to use for each
3031 // copy.
3032 if (!isa<PHINode>(Val: II->getOperand(i_nocapture: 0))) {
3033 FakeUses.push_back(Elt: II);
3034 }
3035 return true;
3036 }
3037
3038 return false;
3039 };
3040
3041 // Make sure there are no instructions between the first instruction
3042 // and return.
3043 BasicBlock::const_iterator BI = BB->getFirstNonPHIIt();
3044 // Skip over pseudo-probes and the bitcast.
3045 while (&*BI == BCI || &*BI == EVI || isa<PseudoProbeInst>(Val: BI) ||
3046 isLifetimeEndOrBitCastFor(&*BI) || isFakeUse(&*BI))
3047 BI = std::next(x: BI);
3048 if (&*BI != RetI)
3049 return false;
3050
3051 // Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
3052 // call.
3053 auto MayBePermittedAsTailCall = [&](const auto *CI) {
3054 return TLI->mayBeEmittedAsTailCall(CI) &&
3055 attributesPermitTailCall(BB->getParent(), CI, RetI, *TLI);
3056 };
3057
3058 SmallVector<BasicBlock *, 4> TailCallBBs;
3059 // Record the call instructions so we can insert any fake uses
3060 // that need to be preserved before them.
3061 SmallVector<CallInst *, 4> CallInsts;
3062 if (PN) {
3063 for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
3064 // Look through bitcasts.
3065 Value *IncomingVal = PN->getIncomingValue(i: I)->stripPointerCasts();
3066 CallInst *CI = dyn_cast<CallInst>(Val: IncomingVal);
3067 BasicBlock *PredBB = PN->getIncomingBlock(i: I);
3068 // Make sure the phi value is indeed produced by the tail call.
3069 if (CI && CI->hasOneUse() && CI->getParent() == PredBB &&
3070 MayBePermittedAsTailCall(CI)) {
3071 TailCallBBs.push_back(Elt: PredBB);
3072 CallInsts.push_back(Elt: CI);
3073 } else {
3074 // Consider the cases in which the phi value is indirectly produced by
3075 // the tail call, for example when encountering memset(), memmove(),
3076 // strcpy(), whose return value may have been optimized out. In such
3077 // cases, the value needs to be the first function argument.
3078 //
3079 // bb0:
3080 // tail call void @llvm.memset.p0.i64(ptr %0, i8 0, i64 %1)
3081 // br label %return
3082 // return:
3083 // %phi = phi ptr [ %0, %bb0 ], [ %2, %entry ]
3084 if (PredBB && PredBB->getSingleSuccessor() == BB)
3085 CI = dyn_cast_or_null<CallInst>(
3086 Val: PredBB->getTerminator()->getPrevNode());
3087
3088 if (CI && CI->use_empty() &&
3089 isIntrinsicOrLFToBeTailCalled(TLInfo, CI) &&
3090 IncomingVal == CI->getArgOperand(i: 0) &&
3091 MayBePermittedAsTailCall(CI)) {
3092 TailCallBBs.push_back(Elt: PredBB);
3093 CallInsts.push_back(Elt: CI);
3094 }
3095 }
3096 }
3097 } else {
3098 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
3099 for (BasicBlock *Pred : predecessors(BB)) {
3100 if (!VisitedBBs.insert(Ptr: Pred).second)
3101 continue;
3102 if (Instruction *I = Pred->rbegin()->getPrevNode()) {
3103 CallInst *CI = dyn_cast<CallInst>(Val: I);
3104 if (CI && CI->use_empty() && MayBePermittedAsTailCall(CI)) {
3105 // Either we return void or the return value must be the first
3106 // argument of a known intrinsic or library function.
3107 if (!V || isa<UndefValue>(Val: V) ||
3108 (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) &&
3109 V == CI->getArgOperand(i: 0))) {
3110 TailCallBBs.push_back(Elt: Pred);
3111 CallInsts.push_back(Elt: CI);
3112 }
3113 }
3114 }
3115 }
3116 }
3117
3118 bool Changed = false;
3119 for (auto const &TailCallBB : TailCallBBs) {
3120 // Make sure the call instruction is followed by an unconditional branch to
3121 // the return block.
3122 UncondBrInst *BI = dyn_cast<UncondBrInst>(Val: TailCallBB->getTerminator());
3123 if (!BI || BI->getSuccessor() != BB)
3124 continue;
3125
3126 // Duplicate the return into TailCallBB.
3127 (void)FoldReturnIntoUncondBranch(RI: RetI, BB, Pred: TailCallBB, DTU);
3128 assert(!VerifyBFIUpdates ||
3129 BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB));
3130 BFI->setBlockFreq(BB,
3131 Freq: (BFI->getBlockFreq(BB) - BFI->getBlockFreq(BB: TailCallBB)));
3132 ModifiedDT = ModifyDT::ModifyBBDT;
3133 Changed = true;
3134 ++NumRetsDup;
3135 }
3136
3137 // If we eliminated all predecessors of the block, delete the block now.
3138 if (Changed && !BB->hasAddressTaken() && pred_empty(BB)) {
3139 // Copy the fake uses found in the original return block to all blocks
3140 // that contain tail calls.
3141 for (auto *CI : CallInsts) {
3142 for (auto const *FakeUse : FakeUses) {
3143 auto *ClonedInst = FakeUse->clone();
3144 ClonedInst->insertBefore(InsertPos: CI->getIterator());
3145 }
3146 }
3147 DTU->deleteBB(DelBB: BB);
3148 }
3149
3150 return Changed;
3151}
3152
3153//===----------------------------------------------------------------------===//
3154// Memory Optimization
3155//===----------------------------------------------------------------------===//
3156
3157namespace {
3158
3159/// This is an extended version of TargetLowering::AddrMode
3160/// which holds actual Value*'s for register values.
3161struct ExtAddrMode : public TargetLowering::AddrMode {
3162 Value *BaseReg = nullptr;
3163 Value *ScaledReg = nullptr;
3164 Value *OriginalValue = nullptr;
3165 bool InBounds = true;
3166
3167 enum FieldName {
3168 NoField = 0x00,
3169 BaseRegField = 0x01,
3170 BaseGVField = 0x02,
3171 BaseOffsField = 0x04,
3172 ScaledRegField = 0x08,
3173 ScaleField = 0x10,
3174 MultipleFields = 0xff
3175 };
3176
3177 ExtAddrMode() = default;
3178
3179 void print(raw_ostream &OS) const;
3180 void dump() const;
3181
3182 // Replace From in ExtAddrMode with To.
3183 // E.g., SExt insts may be promoted and deleted. We should replace them with
3184 // the promoted values.
3185 void replaceWith(Value *From, Value *To) {
3186 if (ScaledReg == From)
3187 ScaledReg = To;
3188 }
3189
3190 FieldName compare(const ExtAddrMode &other) {
3191 // First check that the types are the same on each field, as differing types
3192 // is something we can't cope with later on.
3193 if (BaseReg && other.BaseReg &&
3194 BaseReg->getType() != other.BaseReg->getType())
3195 return MultipleFields;
3196 if (BaseGV && other.BaseGV && BaseGV->getType() != other.BaseGV->getType())
3197 return MultipleFields;
3198 if (ScaledReg && other.ScaledReg &&
3199 ScaledReg->getType() != other.ScaledReg->getType())
3200 return MultipleFields;
3201
3202 // Conservatively reject 'inbounds' mismatches.
3203 if (InBounds != other.InBounds)
3204 return MultipleFields;
3205
3206 // Check each field to see if it differs.
3207 unsigned Result = NoField;
3208 if (BaseReg != other.BaseReg)
3209 Result |= BaseRegField;
3210 if (BaseGV != other.BaseGV)
3211 Result |= BaseGVField;
3212 if (BaseOffs != other.BaseOffs)
3213 Result |= BaseOffsField;
3214 if (ScaledReg != other.ScaledReg)
3215 Result |= ScaledRegField;
3216 // Don't count 0 as being a different scale, because that actually means
3217 // unscaled (which will already be counted by having no ScaledReg).
3218 if (Scale && other.Scale && Scale != other.Scale)
3219 Result |= ScaleField;
3220
3221 if (llvm::popcount(Value: Result) > 1)
3222 return MultipleFields;
3223 else
3224 return static_cast<FieldName>(Result);
3225 }
3226
3227 // An AddrMode is trivial if it involves no calculation i.e. it is just a base
3228 // with no offset.
3229 bool isTrivial() {
3230 // An AddrMode is (BaseGV + BaseReg + BaseOffs + ScaleReg * Scale) so it is
3231 // trivial if at most one of these terms is nonzero, except that BaseGV and
3232 // BaseReg both being zero actually means a null pointer value, which we
3233 // consider to be 'non-zero' here.
3234 return !BaseOffs && !Scale && !(BaseGV && BaseReg);
3235 }
3236
3237 Value *GetFieldAsValue(FieldName Field, Type *IntPtrTy) {
3238 switch (Field) {
3239 default:
3240 return nullptr;
3241 case BaseRegField:
3242 return BaseReg;
3243 case BaseGVField:
3244 return BaseGV;
3245 case ScaledRegField:
3246 return ScaledReg;
3247 case BaseOffsField:
3248 return ConstantInt::getSigned(Ty: IntPtrTy, V: BaseOffs);
3249 }
3250 }
3251
3252 void SetCombinedField(FieldName Field, Value *V,
3253 const SmallVectorImpl<ExtAddrMode> &AddrModes) {
3254 switch (Field) {
3255 default:
3256 llvm_unreachable("Unhandled fields are expected to be rejected earlier");
3257 break;
3258 case ExtAddrMode::BaseRegField:
3259 BaseReg = V;
3260 break;
3261 case ExtAddrMode::BaseGVField:
3262 // A combined BaseGV is an Instruction, not a GlobalValue, so it goes
3263 // in the BaseReg field.
3264 assert(BaseReg == nullptr);
3265 BaseReg = V;
3266 BaseGV = nullptr;
3267 break;
3268 case ExtAddrMode::ScaledRegField:
3269 ScaledReg = V;
3270 // If we have a mix of scaled and unscaled addrmodes then we want scale
3271 // to be the scale and not zero.
3272 if (!Scale)
3273 for (const ExtAddrMode &AM : AddrModes)
3274 if (AM.Scale) {
3275 Scale = AM.Scale;
3276 break;
3277 }
3278 break;
3279 case ExtAddrMode::BaseOffsField:
3280 // The offset is no longer a constant, so it goes in ScaledReg with a
3281 // scale of 1.
3282 assert(ScaledReg == nullptr);
3283 ScaledReg = V;
3284 Scale = 1;
3285 BaseOffs = 0;
3286 break;
3287 }
3288 }
3289};
3290
3291#ifndef NDEBUG
3292static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
3293 AM.print(OS);
3294 return OS;
3295}
3296#endif
3297
3298#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3299void ExtAddrMode::print(raw_ostream &OS) const {
3300 bool NeedPlus = false;
3301 OS << "[";
3302 if (InBounds)
3303 OS << "inbounds ";
3304 if (BaseGV) {
3305 OS << "GV:";
3306 BaseGV->printAsOperand(OS, /*PrintType=*/false);
3307 NeedPlus = true;
3308 }
3309
3310 if (BaseOffs) {
3311 OS << (NeedPlus ? " + " : "") << BaseOffs;
3312 NeedPlus = true;
3313 }
3314
3315 if (BaseReg) {
3316 OS << (NeedPlus ? " + " : "") << "Base:";
3317 BaseReg->printAsOperand(OS, /*PrintType=*/false);
3318 NeedPlus = true;
3319 }
3320 if (Scale) {
3321 OS << (NeedPlus ? " + " : "") << Scale << "*";
3322 ScaledReg->printAsOperand(OS, /*PrintType=*/false);
3323 }
3324
3325 OS << ']';
3326}
3327
3328LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
3329 print(dbgs());
3330 dbgs() << '\n';
3331}
3332#endif
3333
3334} // end anonymous namespace
3335
3336namespace {
3337
3338/// This class provides transaction based operation on the IR.
3339/// Every change made through this class is recorded in the internal state and
3340/// can be undone (rollback) until commit is called.
3341/// CGP does not check if instructions could be speculatively executed when
3342/// moved. Preserving the original location would pessimize the debugging
3343/// experience, as well as negatively impact the quality of sample PGO.
3344class TypePromotionTransaction {
3345 /// This represents the common interface of the individual transaction.
3346 /// Each class implements the logic for doing one specific modification on
3347 /// the IR via the TypePromotionTransaction.
3348 class TypePromotionAction {
3349 protected:
3350 /// The Instruction modified.
3351 Instruction *Inst;
3352
3353 public:
3354 /// Constructor of the action.
3355 /// The constructor performs the related action on the IR.
3356 TypePromotionAction(Instruction *Inst) : Inst(Inst) {}
3357
3358 virtual ~TypePromotionAction() = default;
3359
3360 /// Undo the modification done by this action.
3361 /// When this method is called, the IR must be in the same state as it was
3362 /// before this action was applied.
3363 /// \pre Undoing the action works if and only if the IR is in the exact same
3364 /// state as it was directly after this action was applied.
3365 virtual void undo() = 0;
3366
3367 /// Advocate every change made by this action.
3368 /// When the results on the IR of the action are to be kept, it is important
3369 /// to call this function, otherwise hidden information may be kept forever.
3370 virtual void commit() {
3371 // Nothing to be done, this action is not doing anything.
3372 }
3373 };
3374
3375 /// Utility to remember the position of an instruction.
3376 class InsertionHandler {
3377 /// Position of an instruction.
3378 /// Either an instruction:
3379 /// - Is the first in a basic block: BB is used.
3380 /// - Has a previous instruction: PrevInst is used.
3381 struct {
3382 BasicBlock::iterator PrevInst;
3383 BasicBlock *BB;
3384 } Point;
3385 std::optional<DbgRecord::self_iterator> BeforeDbgRecord = std::nullopt;
3386
3387 /// Remember whether or not the instruction had a previous instruction.
3388 bool HasPrevInstruction;
3389
3390 public:
3391 /// Record the position of \p Inst.
3392 InsertionHandler(Instruction *Inst) {
3393 HasPrevInstruction = (Inst != &*(Inst->getParent()->begin()));
3394 BasicBlock *BB = Inst->getParent();
3395
3396 // Record where we would have to re-insert the instruction in the sequence
3397 // of DbgRecords, if we ended up reinserting.
3398 BeforeDbgRecord = Inst->getDbgReinsertionPosition();
3399
3400 if (HasPrevInstruction) {
3401 Point.PrevInst = std::prev(x: Inst->getIterator());
3402 } else {
3403 Point.BB = BB;
3404 }
3405 }
3406
3407 /// Insert \p Inst at the recorded position.
3408 void insert(Instruction *Inst) {
3409 if (HasPrevInstruction) {
3410 if (Inst->getParent())
3411 Inst->removeFromParent();
3412 Inst->insertAfter(InsertPos: Point.PrevInst);
3413 } else {
3414 BasicBlock::iterator Position = Point.BB->getFirstInsertionPt();
3415 if (Inst->getParent())
3416 Inst->moveBefore(BB&: *Point.BB, I: Position);
3417 else
3418 Inst->insertBefore(BB&: *Point.BB, InsertPos: Position);
3419 }
3420
3421 Inst->getParent()->reinsertInstInDbgRecords(I: Inst, Pos: BeforeDbgRecord);
3422 }
3423 };
3424
3425 /// Move an instruction before another.
3426 class InstructionMoveBefore : public TypePromotionAction {
3427 /// Original position of the instruction.
3428 InsertionHandler Position;
3429
3430 public:
3431 /// Move \p Inst before \p Before.
3432 InstructionMoveBefore(Instruction *Inst, BasicBlock::iterator Before)
3433 : TypePromotionAction(Inst), Position(Inst) {
3434 LLVM_DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before
3435 << "\n");
3436 Inst->moveBefore(InsertPos: Before);
3437 }
3438
3439 /// Move the instruction back to its original position.
3440 void undo() override {
3441 LLVM_DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
3442 Position.insert(Inst);
3443 }
3444 };
3445
3446 /// Set the operand of an instruction with a new value.
3447 class OperandSetter : public TypePromotionAction {
3448 /// Original operand of the instruction.
3449 Value *Origin;
3450
3451 /// Index of the modified instruction.
3452 unsigned Idx;
3453
3454 public:
3455 /// Set \p Idx operand of \p Inst with \p NewVal.
3456 OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal)
3457 : TypePromotionAction(Inst), Idx(Idx) {
3458 LLVM_DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
3459 << "for:" << *Inst << "\n"
3460 << "with:" << *NewVal << "\n");
3461 Origin = Inst->getOperand(i: Idx);
3462 Inst->setOperand(i: Idx, Val: NewVal);
3463 }
3464
3465 /// Restore the original value of the instruction.
3466 void undo() override {
3467 LLVM_DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
3468 << "for: " << *Inst << "\n"
3469 << "with: " << *Origin << "\n");
3470 Inst->setOperand(i: Idx, Val: Origin);
3471 }
3472 };
3473
3474 /// Hide the operands of an instruction.
3475 /// Do as if this instruction was not using any of its operands.
3476 class OperandsHider : public TypePromotionAction {
3477 /// The list of original operands.
3478 SmallVector<Value *, 4> OriginalValues;
3479
3480 public:
3481 /// Remove \p Inst from the uses of the operands of \p Inst.
3482 OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {
3483 LLVM_DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
3484 unsigned NumOpnds = Inst->getNumOperands();
3485 OriginalValues.reserve(N: NumOpnds);
3486 for (unsigned It = 0; It < NumOpnds; ++It) {
3487 // Save the current operand.
3488 Value *Val = Inst->getOperand(i: It);
3489 OriginalValues.push_back(Elt: Val);
3490 // Set a dummy one.
3491 // We could use OperandSetter here, but that would imply an overhead
3492 // that we are not willing to pay.
3493 Inst->setOperand(i: It, Val: PoisonValue::get(T: Val->getType()));
3494 }
3495 }
3496
3497 /// Restore the original list of uses.
3498 void undo() override {
3499 LLVM_DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
3500 for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It)
3501 Inst->setOperand(i: It, Val: OriginalValues[It]);
3502 }
3503 };
3504
3505 /// Build a truncate instruction.
3506 class TruncBuilder : public TypePromotionAction {
3507 Value *Val;
3508
3509 public:
3510 /// Build a truncate instruction of \p Opnd producing a \p Ty
3511 /// result.
3512 /// trunc Opnd to Ty.
3513 TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
3514 IRBuilder<> Builder(Opnd);
3515 Builder.SetCurrentDebugLocation(DebugLoc());
3516 Val = Builder.CreateTrunc(V: Opnd, DestTy: Ty, Name: "promoted");
3517 LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
3518 }
3519
3520 /// Get the built value.
3521 Value *getBuiltValue() { return Val; }
3522
3523 /// Remove the built instruction.
3524 void undo() override {
3525 LLVM_DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
3526 if (Instruction *IVal = dyn_cast<Instruction>(Val))
3527 IVal->eraseFromParent();
3528 }
3529 };
3530
3531 /// Build a sign extension instruction.
3532 class SExtBuilder : public TypePromotionAction {
3533 Value *Val;
3534
3535 public:
3536 /// Build a sign extension instruction of \p Opnd producing a \p Ty
3537 /// result.
3538 /// sext Opnd to Ty.
3539 SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
3540 : TypePromotionAction(InsertPt) {
3541 IRBuilder<> Builder(InsertPt);
3542 Val = Builder.CreateSExt(V: Opnd, DestTy: Ty, Name: "promoted");
3543 LLVM_DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
3544 }
3545
3546 /// Get the built value.
3547 Value *getBuiltValue() { return Val; }
3548
3549 /// Remove the built instruction.
3550 void undo() override {
3551 LLVM_DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
3552 if (Instruction *IVal = dyn_cast<Instruction>(Val))
3553 IVal->eraseFromParent();
3554 }
3555 };
3556
3557 /// Build a zero extension instruction.
3558 class ZExtBuilder : public TypePromotionAction {
3559 Value *Val;
3560
3561 public:
3562 /// Build a zero extension instruction of \p Opnd producing a \p Ty
3563 /// result.
3564 /// zext Opnd to Ty.
3565 ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
3566 : TypePromotionAction(InsertPt) {
3567 IRBuilder<> Builder(InsertPt);
3568 Builder.SetCurrentDebugLocation(DebugLoc());
3569 Val = Builder.CreateZExt(V: Opnd, DestTy: Ty, Name: "promoted");
3570 LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
3571 }
3572
3573 /// Get the built value.
3574 Value *getBuiltValue() { return Val; }
3575
3576 /// Remove the built instruction.
3577 void undo() override {
3578 LLVM_DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
3579 if (Instruction *IVal = dyn_cast<Instruction>(Val))
3580 IVal->eraseFromParent();
3581 }
3582 };
3583
3584 /// Mutate an instruction to another type.
3585 class TypeMutator : public TypePromotionAction {
3586 /// Record the original type.
3587 Type *OrigTy;
3588
3589 public:
3590 /// Mutate the type of \p Inst into \p NewTy.
3591 TypeMutator(Instruction *Inst, Type *NewTy)
3592 : TypePromotionAction(Inst), OrigTy(Inst->getType()) {
3593 LLVM_DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy
3594 << "\n");
3595 Inst->mutateType(Ty: NewTy);
3596 }
3597
3598 /// Mutate the instruction back to its original type.
3599 void undo() override {
3600 LLVM_DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy
3601 << "\n");
3602 Inst->mutateType(Ty: OrigTy);
3603 }
3604 };
3605
3606 /// Replace the uses of an instruction by another instruction.
3607 class UsesReplacer : public TypePromotionAction {
3608 /// Helper structure to keep track of the replaced uses.
3609 struct InstructionAndIdx {
3610 /// The instruction using the instruction.
3611 Instruction *Inst;
3612
3613 /// The index where this instruction is used for Inst.
3614 unsigned Idx;
3615
3616 InstructionAndIdx(Instruction *Inst, unsigned Idx)
3617 : Inst(Inst), Idx(Idx) {}
3618 };
3619
3620 /// Keep track of the original uses (pair Instruction, Index).
3621 SmallVector<InstructionAndIdx, 4> OriginalUses;
3622 /// Keep track of the debug users.
3623 SmallVector<DbgVariableRecord *, 1> DbgVariableRecords;
3624
3625 /// Keep track of the new value so that we can undo it by replacing
3626 /// instances of the new value with the original value.
3627 Value *New;
3628
3629 using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator;
3630
3631 public:
3632 /// Replace all the use of \p Inst by \p New.
3633 UsesReplacer(Instruction *Inst, Value *New)
3634 : TypePromotionAction(Inst), New(New) {
3635 LLVM_DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
3636 << "\n");
3637 // Record the original uses.
3638 for (Use &U : Inst->uses()) {
3639 Instruction *UserI = cast<Instruction>(Val: U.getUser());
3640 OriginalUses.push_back(Elt: InstructionAndIdx(UserI, U.getOperandNo()));
3641 }
3642 // Record the debug uses separately. They are not in the instruction's
3643 // use list, but they are replaced by RAUW.
3644 findDbgValues(V: Inst, DbgVariableRecords);
3645
3646 // Now, we can replace the uses.
3647 Inst->replaceAllUsesWith(V: New);
3648 }
3649
3650 /// Reassign the original uses of Inst to Inst.
3651 void undo() override {
3652 LLVM_DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
3653 for (InstructionAndIdx &Use : OriginalUses)
3654 Use.Inst->setOperand(i: Use.Idx, Val: Inst);
3655 // RAUW has replaced all original uses with references to the new value,
3656 // including the debug uses. Since we are undoing the replacements,
3657 // the original debug uses must also be reinstated to maintain the
3658 // correctness and utility of debug value records.
3659 for (DbgVariableRecord *DVR : DbgVariableRecords)
3660 DVR->replaceVariableLocationOp(OldValue: New, NewValue: Inst);
3661 }
3662 };
3663
3664 /// Remove an instruction from the IR.
3665 class InstructionRemover : public TypePromotionAction {
3666 /// Original position of the instruction.
3667 InsertionHandler Inserter;
3668
3669 /// Helper structure to hide all the link to the instruction. In other
3670 /// words, this helps to do as if the instruction was removed.
3671 OperandsHider Hider;
3672
3673 /// Keep track of the uses replaced, if any.
3674 UsesReplacer *Replacer = nullptr;
3675
3676 /// Keep track of instructions removed.
3677 SetOfInstrs &RemovedInsts;
3678
3679 public:
3680 /// Remove all reference of \p Inst and optionally replace all its
3681 /// uses with New.
3682 /// \p RemovedInsts Keep track of the instructions removed by this Action.
3683 /// \pre If !Inst->use_empty(), then New != nullptr
3684 InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
3685 Value *New = nullptr)
3686 : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
3687 RemovedInsts(RemovedInsts) {
3688 if (New)
3689 Replacer = new UsesReplacer(Inst, New);
3690 LLVM_DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
3691 RemovedInsts.insert(Ptr: Inst);
3692 /// The instructions removed here will be freed after completing
3693 /// optimizeBlock() for all blocks as we need to keep track of the
3694 /// removed instructions during promotion.
3695 Inst->removeFromParent();
3696 }
3697
3698 ~InstructionRemover() override { delete Replacer; }
3699
3700 InstructionRemover &operator=(const InstructionRemover &other) = delete;
3701 InstructionRemover(const InstructionRemover &other) = delete;
3702
3703 /// Resurrect the instruction and reassign it to the proper uses if
3704 /// new value was provided when build this action.
3705 void undo() override {
3706 LLVM_DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
3707 Inserter.insert(Inst);
3708 if (Replacer)
3709 Replacer->undo();
3710 Hider.undo();
3711 RemovedInsts.erase(Ptr: Inst);
3712 }
3713 };
3714
3715public:
3716 /// Restoration point.
3717 /// The restoration point is a pointer to an action instead of an iterator
3718 /// because the iterator may be invalidated but not the pointer.
3719 using ConstRestorationPt = const TypePromotionAction *;
3720
3721 TypePromotionTransaction(SetOfInstrs &RemovedInsts)
3722 : RemovedInsts(RemovedInsts) {}
3723
3724 /// Advocate every changes made in that transaction. Return true if any change
3725 /// happen.
3726 bool commit();
3727
3728 /// Undo all the changes made after the given point.
3729 void rollback(ConstRestorationPt Point);
3730
3731 /// Get the current restoration point.
3732 ConstRestorationPt getRestorationPoint() const;
3733
3734 /// \name API for IR modification with state keeping to support rollback.
3735 /// @{
3736 /// Same as Instruction::setOperand.
3737 void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal);
3738
3739 /// Same as Instruction::eraseFromParent.
3740 void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr);
3741
3742 /// Same as Value::replaceAllUsesWith.
3743 void replaceAllUsesWith(Instruction *Inst, Value *New);
3744
3745 /// Same as Value::mutateType.
3746 void mutateType(Instruction *Inst, Type *NewTy);
3747
3748 /// Same as IRBuilder::createTrunc.
3749 Value *createTrunc(Instruction *Opnd, Type *Ty);
3750
3751 /// Same as IRBuilder::createSExt.
3752 Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty);
3753
3754 /// Same as IRBuilder::createZExt.
3755 Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty);
3756
3757private:
3758 /// The ordered list of actions made so far.
3759 SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;
3760
3761 using CommitPt =
3762 SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator;
3763
3764 SetOfInstrs &RemovedInsts;
3765};
3766
3767} // end anonymous namespace
3768
3769void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
3770 Value *NewVal) {
3771 Actions.push_back(Elt: std::make_unique<TypePromotionTransaction::OperandSetter>(
3772 args&: Inst, args&: Idx, args&: NewVal));
3773}
3774
3775void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
3776 Value *NewVal) {
3777 Actions.push_back(
3778 Elt: std::make_unique<TypePromotionTransaction::InstructionRemover>(
3779 args&: Inst, args&: RemovedInsts, args&: NewVal));
3780}
3781
3782void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
3783 Value *New) {
3784 Actions.push_back(
3785 Elt: std::make_unique<TypePromotionTransaction::UsesReplacer>(args&: Inst, args&: New));
3786}
3787
3788void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) {
3789 Actions.push_back(
3790 Elt: std::make_unique<TypePromotionTransaction::TypeMutator>(args&: Inst, args&: NewTy));
3791}
3792
3793Value *TypePromotionTransaction::createTrunc(Instruction *Opnd, Type *Ty) {
3794 std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));
3795 Value *Val = Ptr->getBuiltValue();
3796 Actions.push_back(Elt: std::move(Ptr));
3797 return Val;
3798}
3799
3800Value *TypePromotionTransaction::createSExt(Instruction *Inst, Value *Opnd,
3801 Type *Ty) {
3802 std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));
3803 Value *Val = Ptr->getBuiltValue();
3804 Actions.push_back(Elt: std::move(Ptr));
3805 return Val;
3806}
3807
3808Value *TypePromotionTransaction::createZExt(Instruction *Inst, Value *Opnd,
3809 Type *Ty) {
3810 std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));
3811 Value *Val = Ptr->getBuiltValue();
3812 Actions.push_back(Elt: std::move(Ptr));
3813 return Val;
3814}
3815
3816TypePromotionTransaction::ConstRestorationPt
3817TypePromotionTransaction::getRestorationPoint() const {
3818 return !Actions.empty() ? Actions.back().get() : nullptr;
3819}
3820
3821bool TypePromotionTransaction::commit() {
3822 for (std::unique_ptr<TypePromotionAction> &Action : Actions)
3823 Action->commit();
3824 bool Modified = !Actions.empty();
3825 Actions.clear();
3826 return Modified;
3827}
3828
3829void TypePromotionTransaction::rollback(
3830 TypePromotionTransaction::ConstRestorationPt Point) {
3831 while (!Actions.empty() && Point != Actions.back().get()) {
3832 std::unique_ptr<TypePromotionAction> Curr = Actions.pop_back_val();
3833 Curr->undo();
3834 }
3835}
3836
3837namespace {
3838
3839/// A helper class for matching addressing modes.
3840///
3841/// This encapsulates the logic for matching the target-legal addressing modes.
3842class AddressingModeMatcher {
3843 SmallVectorImpl<Instruction *> &AddrModeInsts;
3844 const TargetLowering &TLI;
3845 const TargetRegisterInfo &TRI;
3846 const DataLayout &DL;
3847 const LoopInfo &LI;
3848 const std::function<const DominatorTree &()> getDTFn;
3849
3850 /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
3851 /// the memory instruction that we're computing this address for.
3852 Type *AccessTy;
3853 unsigned AddrSpace;
3854 Instruction *MemoryInst;
3855
3856 /// This is the addressing mode that we're building up. This is
3857 /// part of the return value of this addressing mode matching stuff.
3858 ExtAddrMode &AddrMode;
3859
3860 /// The instructions inserted by other CodeGenPrepare optimizations.
3861 const SetOfInstrs &InsertedInsts;
3862
3863 /// A map from the instructions to their type before promotion.
3864 InstrToOrigTy &PromotedInsts;
3865
3866 /// The ongoing transaction where every action should be registered.
3867 TypePromotionTransaction &TPT;
3868
3869 // A GEP which has too large offset to be folded into the addressing mode.
3870 std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP;
3871
3872 /// This is set to true when we should not do profitability checks.
3873 /// When true, IsProfitableToFoldIntoAddressingMode always returns true.
3874 bool IgnoreProfitability;
3875
3876 /// True if we are optimizing for size.
3877 bool OptSize = false;
3878
3879 ProfileSummaryInfo *PSI;
3880 BlockFrequencyInfo *BFI;
3881
3882 AddressingModeMatcher(
3883 SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
3884 const TargetRegisterInfo &TRI, const LoopInfo &LI,
3885 const std::function<const DominatorTree &()> getDTFn, Type *AT,
3886 unsigned AS, Instruction *MI, ExtAddrMode &AM,
3887 const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
3888 TypePromotionTransaction &TPT,
3889 std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
3890 bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
3891 : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
3892 DL(MI->getDataLayout()), LI(LI), getDTFn(getDTFn),
3893 AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM),
3894 InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT),
3895 LargeOffsetGEP(LargeOffsetGEP), OptSize(OptSize), PSI(PSI), BFI(BFI) {
3896 IgnoreProfitability = false;
3897 }
3898
3899public:
3900 /// Find the maximal addressing mode that a load/store of V can fold,
3901 /// give an access type of AccessTy. This returns a list of involved
3902 /// instructions in AddrModeInsts.
3903 /// \p InsertedInsts The instructions inserted by other CodeGenPrepare
3904 /// optimizations.
3905 /// \p PromotedInsts maps the instructions to their type before promotion.
3906 /// \p The ongoing transaction where every action should be registered.
3907 static ExtAddrMode
3908 Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,
3909 SmallVectorImpl<Instruction *> &AddrModeInsts,
3910 const TargetLowering &TLI, const LoopInfo &LI,
3911 const std::function<const DominatorTree &()> getDTFn,
3912 const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts,
3913 InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
3914 std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
3915 bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
3916 ExtAddrMode Result;
3917
3918 bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, LI, getDTFn,
3919 AccessTy, AS, MemoryInst, Result,
3920 InsertedInsts, PromotedInsts, TPT,
3921 LargeOffsetGEP, OptSize, PSI, BFI)
3922 .matchAddr(Addr: V, Depth: 0);
3923 (void)Success;
3924 assert(Success && "Couldn't select *anything*?");
3925 return Result;
3926 }
3927
3928private:
3929 bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
3930 bool matchAddr(Value *Addr, unsigned Depth);
3931 bool matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth,
3932 bool *MovedAway = nullptr);
3933 bool isProfitableToFoldIntoAddressingMode(Instruction *I,
3934 ExtAddrMode &AMBefore,
3935 ExtAddrMode &AMAfter);
3936 bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
3937 bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,
3938 Value *PromotedOperand) const;
3939};
3940
3941class PhiNodeSet;
3942
3943/// An iterator for PhiNodeSet.
3944class PhiNodeSetIterator {
3945 PhiNodeSet *const Set;
3946 size_t CurrentIndex = 0;
3947
3948public:
3949 /// The constructor. Start should point to either a valid element, or be equal
3950 /// to the size of the underlying SmallVector of the PhiNodeSet.
3951 PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start);
3952 PHINode *operator*() const;
3953 PhiNodeSetIterator &operator++();
3954 bool operator==(const PhiNodeSetIterator &RHS) const;
3955 bool operator!=(const PhiNodeSetIterator &RHS) const;
3956};
3957
3958/// Keeps a set of PHINodes.
3959///
3960/// This is a minimal set implementation for a specific use case:
3961/// It is very fast when there are very few elements, but also provides good
3962/// performance when there are many. It is similar to SmallPtrSet, but also
3963/// provides iteration by insertion order, which is deterministic and stable
3964/// across runs. It is also similar to SmallSetVector, but provides removing
3965/// elements in O(1) time. This is achieved by not actually removing the element
3966/// from the underlying vector, so comes at the cost of using more memory, but
3967/// that is fine, since PhiNodeSets are used as short lived objects.
3968class PhiNodeSet {
3969 friend class PhiNodeSetIterator;
3970
3971 using MapType = SmallDenseMap<PHINode *, size_t, 32>;
3972 using iterator = PhiNodeSetIterator;
3973
3974 /// Keeps the elements in the order of their insertion in the underlying
3975 /// vector. To achieve constant time removal, it never deletes any element.
3976 SmallVector<PHINode *, 32> NodeList;
3977
3978 /// Keeps the elements in the underlying set implementation. This (and not the
3979 /// NodeList defined above) is the source of truth on whether an element
3980 /// is actually in the collection.
3981 MapType NodeMap;
3982
3983 /// Points to the first valid (not deleted) element when the set is not empty
3984 /// and the value is not zero. Equals to the size of the underlying vector
3985 /// when the set is empty. When the value is 0, as in the beginning, the
3986 /// first element may or may not be valid.
3987 size_t FirstValidElement = 0;
3988
3989public:
3990 /// Inserts a new element to the collection.
3991 /// \returns true if the element is actually added, i.e. was not in the
3992 /// collection before the operation.
3993 bool insert(PHINode *Ptr) {
3994 if (NodeMap.insert(KV: std::make_pair(x&: Ptr, y: NodeList.size())).second) {
3995 NodeList.push_back(Elt: Ptr);
3996 return true;
3997 }
3998 return false;
3999 }
4000
4001 /// Removes the element from the collection.
4002 /// \returns whether the element is actually removed, i.e. was in the
4003 /// collection before the operation.
4004 bool erase(PHINode *Ptr) {
4005 if (NodeMap.erase(Val: Ptr)) {
4006 SkipRemovedElements(CurrentIndex&: FirstValidElement);
4007 return true;
4008 }
4009 return false;
4010 }
4011
4012 /// Removes all elements and clears the collection.
4013 void clear() {
4014 NodeMap.clear();
4015 NodeList.clear();
4016 FirstValidElement = 0;
4017 }
4018
4019 /// \returns an iterator that will iterate the elements in the order of
4020 /// insertion.
4021 iterator begin() {
4022 if (FirstValidElement == 0)
4023 SkipRemovedElements(CurrentIndex&: FirstValidElement);
4024 return PhiNodeSetIterator(this, FirstValidElement);
4025 }
4026
4027 /// \returns an iterator that points to the end of the collection.
4028 iterator end() { return PhiNodeSetIterator(this, NodeList.size()); }
4029
4030 /// Returns the number of elements in the collection.
4031 size_t size() const { return NodeMap.size(); }
4032
4033 /// \returns 1 if the given element is in the collection, and 0 if otherwise.
4034 size_t count(PHINode *Ptr) const { return NodeMap.count(Val: Ptr); }
4035
4036private:
4037 /// Updates the CurrentIndex so that it will point to a valid element.
4038 ///
4039 /// If the element of NodeList at CurrentIndex is valid, it does not
4040 /// change it. If there are no more valid elements, it updates CurrentIndex
4041 /// to point to the end of the NodeList.
4042 void SkipRemovedElements(size_t &CurrentIndex) {
4043 while (CurrentIndex < NodeList.size()) {
4044 auto it = NodeMap.find(Val: NodeList[CurrentIndex]);
4045 // If the element has been deleted and added again later, NodeMap will
4046 // point to a different index, so CurrentIndex will still be invalid.
4047 if (it != NodeMap.end() && it->second == CurrentIndex)
4048 break;
4049 ++CurrentIndex;
4050 }
4051 }
4052};
4053
4054PhiNodeSetIterator::PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start)
4055 : Set(Set), CurrentIndex(Start) {}
4056
4057PHINode *PhiNodeSetIterator::operator*() const {
4058 assert(CurrentIndex < Set->NodeList.size() &&
4059 "PhiNodeSet access out of range");
4060 return Set->NodeList[CurrentIndex];
4061}
4062
4063PhiNodeSetIterator &PhiNodeSetIterator::operator++() {
4064 assert(CurrentIndex < Set->NodeList.size() &&
4065 "PhiNodeSet access out of range");
4066 ++CurrentIndex;
4067 Set->SkipRemovedElements(CurrentIndex);
4068 return *this;
4069}
4070
4071bool PhiNodeSetIterator::operator==(const PhiNodeSetIterator &RHS) const {
4072 return CurrentIndex == RHS.CurrentIndex;
4073}
4074
4075bool PhiNodeSetIterator::operator!=(const PhiNodeSetIterator &RHS) const {
4076 return !((*this) == RHS);
4077}
4078
4079/// Keep track of simplification of Phi nodes.
4080/// Accept the set of all phi nodes and erase phi node from this set
4081/// if it is simplified.
4082class SimplificationTracker {
4083 DenseMap<Value *, Value *> Storage;
4084 // Tracks newly created Phi nodes. The elements are iterated by insertion
4085 // order.
4086 PhiNodeSet AllPhiNodes;
4087 // Tracks newly created Select nodes.
4088 SmallPtrSet<SelectInst *, 32> AllSelectNodes;
4089
4090public:
4091 Value *Get(Value *V) {
4092 do {
4093 auto SV = Storage.find(Val: V);
4094 if (SV == Storage.end())
4095 return V;
4096 V = SV->second;
4097 } while (true);
4098 }
4099
4100 void Put(Value *From, Value *To) { Storage.insert(KV: {From, To}); }
4101
4102 void ReplacePhi(PHINode *From, PHINode *To) {
4103 Value *OldReplacement = Get(V: From);
4104 while (OldReplacement != From) {
4105 From = To;
4106 To = dyn_cast<PHINode>(Val: OldReplacement);
4107 OldReplacement = Get(V: From);
4108 }
4109 assert(To && Get(To) == To && "Replacement PHI node is already replaced.");
4110 Put(From, To);
4111 From->replaceAllUsesWith(V: To);
4112 AllPhiNodes.erase(Ptr: From);
4113 From->eraseFromParent();
4114 }
4115
4116 PhiNodeSet &newPhiNodes() { return AllPhiNodes; }
4117
4118 void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(Ptr: PN); }
4119
4120 void insertNewSelect(SelectInst *SI) { AllSelectNodes.insert(Ptr: SI); }
4121
4122 unsigned countNewPhiNodes() const { return AllPhiNodes.size(); }
4123
4124 unsigned countNewSelectNodes() const { return AllSelectNodes.size(); }
4125
4126 void destroyNewNodes(Type *CommonType) {
4127 // For safe erasing, replace the uses with dummy value first.
4128 auto *Dummy = PoisonValue::get(T: CommonType);
4129 for (auto *I : AllPhiNodes) {
4130 I->replaceAllUsesWith(V: Dummy);
4131 I->eraseFromParent();
4132 }
4133 AllPhiNodes.clear();
4134 for (auto *I : AllSelectNodes) {
4135 I->replaceAllUsesWith(V: Dummy);
4136 I->eraseFromParent();
4137 }
4138 AllSelectNodes.clear();
4139 }
4140};
4141
4142/// A helper class for combining addressing modes.
4143class AddressingModeCombiner {
4144 typedef DenseMap<Value *, Value *> FoldAddrToValueMapping;
4145 typedef std::pair<PHINode *, PHINode *> PHIPair;
4146
4147private:
4148 /// The addressing modes we've collected.
4149 SmallVector<ExtAddrMode, 16> AddrModes;
4150
4151 /// The field in which the AddrModes differ, when we have more than one.
4152 ExtAddrMode::FieldName DifferentField = ExtAddrMode::NoField;
4153
4154 /// Are the AddrModes that we have all just equal to their original values?
4155 bool AllAddrModesTrivial = true;
4156
4157 /// Common Type for all different fields in addressing modes.
4158 Type *CommonType = nullptr;
4159
4160 const DataLayout &DL;
4161
4162 /// Original Address.
4163 Value *Original;
4164
4165 /// Common value among addresses
4166 Value *CommonValue = nullptr;
4167
4168public:
4169 AddressingModeCombiner(const DataLayout &DL, Value *OriginalValue)
4170 : DL(DL), Original(OriginalValue) {}
4171
4172 ~AddressingModeCombiner() { eraseCommonValueIfDead(); }
4173
4174 /// Get the combined AddrMode
4175 const ExtAddrMode &getAddrMode() const { return AddrModes[0]; }
4176
4177 /// Add a new AddrMode if it's compatible with the AddrModes we already
4178 /// have.
4179 /// \return True iff we succeeded in doing so.
4180 bool addNewAddrMode(ExtAddrMode &NewAddrMode) {
4181 // Take note of if we have any non-trivial AddrModes, as we need to detect
4182 // when all AddrModes are trivial as then we would introduce a phi or select
4183 // which just duplicates what's already there.
4184 AllAddrModesTrivial = AllAddrModesTrivial && NewAddrMode.isTrivial();
4185
4186 // If this is the first addrmode then everything is fine.
4187 if (AddrModes.empty()) {
4188 AddrModes.emplace_back(Args&: NewAddrMode);
4189 return true;
4190 }
4191
4192 // Figure out how different this is from the other address modes, which we
4193 // can do just by comparing against the first one given that we only care
4194 // about the cumulative difference.
4195 ExtAddrMode::FieldName ThisDifferentField =
4196 AddrModes[0].compare(other: NewAddrMode);
4197 if (DifferentField == ExtAddrMode::NoField)
4198 DifferentField = ThisDifferentField;
4199 else if (DifferentField != ThisDifferentField)
4200 DifferentField = ExtAddrMode::MultipleFields;
4201
4202 // If NewAddrMode differs in more than one dimension we cannot handle it.
4203 bool CanHandle = DifferentField != ExtAddrMode::MultipleFields;
4204
4205 // If Scale Field is different then we reject.
4206 CanHandle = CanHandle && DifferentField != ExtAddrMode::ScaleField;
4207
4208 // We also must reject the case when base offset is different and
4209 // scale reg is not null, we cannot handle this case due to merge of
4210 // different offsets will be used as ScaleReg.
4211 CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseOffsField ||
4212 !NewAddrMode.ScaledReg);
4213
4214 // We also must reject the case when GV is different and BaseReg installed
4215 // due to we want to use base reg as a merge of GV values.
4216 CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseGVField ||
4217 !NewAddrMode.HasBaseReg);
4218
4219 // Even if NewAddMode is the same we still need to collect it due to
4220 // original value is different. And later we will need all original values
4221 // as anchors during finding the common Phi node.
4222 if (CanHandle)
4223 AddrModes.emplace_back(Args&: NewAddrMode);
4224 else
4225 AddrModes.clear();
4226
4227 return CanHandle;
4228 }
4229
4230 /// Combine the addressing modes we've collected into a single
4231 /// addressing mode.
4232 /// \return True iff we successfully combined them or we only had one so
4233 /// didn't need to combine them anyway.
4234 bool combineAddrModes() {
4235 // If we have no AddrModes then they can't be combined.
4236 if (AddrModes.size() == 0)
4237 return false;
4238
4239 // A single AddrMode can trivially be combined.
4240 if (AddrModes.size() == 1 || DifferentField == ExtAddrMode::NoField)
4241 return true;
4242
4243 // If the AddrModes we collected are all just equal to the value they are
4244 // derived from then combining them wouldn't do anything useful.
4245 if (AllAddrModesTrivial)
4246 return false;
4247
4248 if (!addrModeCombiningAllowed())
4249 return false;
4250
4251 // Build a map between <original value, basic block where we saw it> to
4252 // value of base register.
4253 // Bail out if there is no common type.
4254 FoldAddrToValueMapping Map;
4255 if (!initializeMap(Map))
4256 return false;
4257
4258 CommonValue = findCommon(Map);
4259 if (CommonValue)
4260 AddrModes[0].SetCombinedField(Field: DifferentField, V: CommonValue, AddrModes);
4261 return CommonValue != nullptr;
4262 }
4263
4264private:
4265 /// `CommonValue` may be a placeholder inserted by us.
4266 /// If the placeholder is not used, we should remove this dead instruction.
4267 void eraseCommonValueIfDead() {
4268 if (CommonValue && CommonValue->use_empty())
4269 if (Instruction *CommonInst = dyn_cast<Instruction>(Val: CommonValue))
4270 CommonInst->eraseFromParent();
4271 }
4272
4273 /// Initialize Map with anchor values. For address seen
4274 /// we set the value of different field saw in this address.
4275 /// At the same time we find a common type for different field we will
4276 /// use to create new Phi/Select nodes. Keep it in CommonType field.
4277 /// Return false if there is no common type found.
4278 bool initializeMap(FoldAddrToValueMapping &Map) {
4279 // Keep track of keys where the value is null. We will need to replace it
4280 // with constant null when we know the common type.
4281 SmallVector<Value *, 2> NullValue;
4282 Type *IntPtrTy = DL.getIntPtrType(AddrModes[0].OriginalValue->getType());
4283 for (auto &AM : AddrModes) {
4284 Value *DV = AM.GetFieldAsValue(Field: DifferentField, IntPtrTy);
4285 if (DV) {
4286 auto *Type = DV->getType();
4287 if (CommonType && CommonType != Type)
4288 return false;
4289 CommonType = Type;
4290 Map[AM.OriginalValue] = DV;
4291 } else {
4292 NullValue.push_back(Elt: AM.OriginalValue);
4293 }
4294 }
4295 assert(CommonType && "At least one non-null value must be!");
4296 for (auto *V : NullValue)
4297 Map[V] = Constant::getNullValue(Ty: CommonType);
4298 return true;
4299 }
4300
4301 /// We have mapping between value A and other value B where B was a field in
4302 /// addressing mode represented by A. Also we have an original value C
4303 /// representing an address we start with. Traversing from C through phi and
4304 /// selects we ended up with A's in a map. This utility function tries to find
4305 /// a value V which is a field in addressing mode C and traversing through phi
4306 /// nodes and selects we will end up in corresponded values B in a map.
4307 /// The utility will create a new Phi/Selects if needed.
4308 // The simple example looks as follows:
4309 // BB1:
4310 // p1 = b1 + 40
4311 // br cond BB2, BB3
4312 // BB2:
4313 // p2 = b2 + 40
4314 // br BB3
4315 // BB3:
4316 // p = phi [p1, BB1], [p2, BB2]
4317 // v = load p
4318 // Map is
4319 // p1 -> b1
4320 // p2 -> b2
4321 // Request is
4322 // p -> ?
4323 // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3.
4324 Value *findCommon(FoldAddrToValueMapping &Map) {
4325 // Tracks the simplification of newly created phi nodes. The reason we use
4326 // this mapping is because we will add new created Phi nodes in AddrToBase.
4327 // Simplification of Phi nodes is recursive, so some Phi node may
4328 // be simplified after we added it to AddrToBase. In reality this
4329 // simplification is possible only if original phi/selects were not
4330 // simplified yet.
4331 // Using this mapping we can find the current value in AddrToBase.
4332 SimplificationTracker ST;
4333
4334 // First step, DFS to create PHI nodes for all intermediate blocks.
4335 // Also fill traverse order for the second step.
4336 SmallVector<Value *, 32> TraverseOrder;
4337 InsertPlaceholders(Map, TraverseOrder, ST);
4338
4339 // Second Step, fill new nodes by merged values and simplify if possible.
4340 FillPlaceholders(Map, TraverseOrder, ST);
4341
4342 if (!AddrSinkNewSelects && ST.countNewSelectNodes() > 0) {
4343 ST.destroyNewNodes(CommonType);
4344 return nullptr;
4345 }
4346
4347 // Now we'd like to match New Phi nodes to existed ones.
4348 unsigned PhiNotMatchedCount = 0;
4349 if (!MatchPhiSet(ST, AllowNewPhiNodes: AddrSinkNewPhis, PhiNotMatchedCount)) {
4350 ST.destroyNewNodes(CommonType);
4351 return nullptr;
4352 }
4353
4354 auto *Result = ST.Get(V: Map.find(Val: Original)->second);
4355 if (Result) {
4356 NumMemoryInstsPhiCreated += ST.countNewPhiNodes() + PhiNotMatchedCount;
4357 NumMemoryInstsSelectCreated += ST.countNewSelectNodes();
4358 }
4359 return Result;
4360 }
4361
4362 /// Try to match PHI node to Candidate.
4363 /// Matcher tracks the matched Phi nodes.
4364 bool MatchPhiNode(PHINode *PHI, PHINode *Candidate,
4365 SmallSetVector<PHIPair, 8> &Matcher,
4366 PhiNodeSet &PhiNodesToMatch) {
4367 SmallVector<PHIPair, 8> WorkList;
4368 Matcher.insert(X: {PHI, Candidate});
4369 SmallPtrSet<PHINode *, 8> MatchedPHIs;
4370 MatchedPHIs.insert(Ptr: PHI);
4371 WorkList.push_back(Elt: {PHI, Candidate});
4372 SmallSet<PHIPair, 8> Visited;
4373 while (!WorkList.empty()) {
4374 auto Item = WorkList.pop_back_val();
4375 if (!Visited.insert(V: Item).second)
4376 continue;
4377 // We iterate over all incoming values to Phi to compare them.
4378 // If values are different and both of them Phi and the first one is a
4379 // Phi we added (subject to match) and both of them is in the same basic
4380 // block then we can match our pair if values match. So we state that
4381 // these values match and add it to work list to verify that.
4382 for (auto *B : Item.first->blocks()) {
4383 Value *FirstValue = Item.first->getIncomingValueForBlock(BB: B);
4384 Value *SecondValue = Item.second->getIncomingValueForBlock(BB: B);
4385 if (FirstValue == SecondValue)
4386 continue;
4387
4388 PHINode *FirstPhi = dyn_cast<PHINode>(Val: FirstValue);
4389 PHINode *SecondPhi = dyn_cast<PHINode>(Val: SecondValue);
4390
4391 // One of them is not Phi or
4392 // The first one is not Phi node from the set we'd like to match or
4393 // Phi nodes from different basic blocks then
4394 // we will not be able to match.
4395 if (!FirstPhi || !SecondPhi || !PhiNodesToMatch.count(Ptr: FirstPhi) ||
4396 FirstPhi->getParent() != SecondPhi->getParent())
4397 return false;
4398
4399 // If we already matched them then continue.
4400 if (Matcher.count(key: {FirstPhi, SecondPhi}))
4401 continue;
4402 // So the values are different and does not match. So we need them to
4403 // match. (But we register no more than one match per PHI node, so that
4404 // we won't later try to replace them twice.)
4405 if (MatchedPHIs.insert(Ptr: FirstPhi).second)
4406 Matcher.insert(X: {FirstPhi, SecondPhi});
4407 // But me must check it.
4408 WorkList.push_back(Elt: {FirstPhi, SecondPhi});
4409 }
4410 }
4411 return true;
4412 }
4413
4414 /// For the given set of PHI nodes (in the SimplificationTracker) try
4415 /// to find their equivalents.
4416 /// Returns false if this matching fails and creation of new Phi is disabled.
4417 bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes,
4418 unsigned &PhiNotMatchedCount) {
4419 // Matched and PhiNodesToMatch iterate their elements in a deterministic
4420 // order, so the replacements (ReplacePhi) are also done in a deterministic
4421 // order.
4422 SmallSetVector<PHIPair, 8> Matched;
4423 SmallPtrSet<PHINode *, 8> WillNotMatch;
4424 PhiNodeSet &PhiNodesToMatch = ST.newPhiNodes();
4425 while (PhiNodesToMatch.size()) {
4426 PHINode *PHI = *PhiNodesToMatch.begin();
4427
4428 // Add us, if no Phi nodes in the basic block we do not match.
4429 WillNotMatch.clear();
4430 WillNotMatch.insert(Ptr: PHI);
4431
4432 // Traverse all Phis until we found equivalent or fail to do that.
4433 bool IsMatched = false;
4434 for (auto &P : PHI->getParent()->phis()) {
4435 // Skip new Phi nodes.
4436 if (PhiNodesToMatch.count(Ptr: &P))
4437 continue;
4438 if ((IsMatched = MatchPhiNode(PHI, Candidate: &P, Matcher&: Matched, PhiNodesToMatch)))
4439 break;
4440 // If it does not match, collect all Phi nodes from matcher.
4441 // if we end up with no match, them all these Phi nodes will not match
4442 // later.
4443 WillNotMatch.insert_range(R: llvm::make_first_range(c&: Matched));
4444 Matched.clear();
4445 }
4446 if (IsMatched) {
4447 // Replace all matched values and erase them.
4448 for (auto MV : Matched)
4449 ST.ReplacePhi(From: MV.first, To: MV.second);
4450 Matched.clear();
4451 continue;
4452 }
4453 // If we are not allowed to create new nodes then bail out.
4454 if (!AllowNewPhiNodes)
4455 return false;
4456 // Just remove all seen values in matcher. They will not match anything.
4457 PhiNotMatchedCount += WillNotMatch.size();
4458 for (auto *P : WillNotMatch)
4459 PhiNodesToMatch.erase(Ptr: P);
4460 }
4461 return true;
4462 }
4463 /// Fill the placeholders with values from predecessors and simplify them.
4464 void FillPlaceholders(FoldAddrToValueMapping &Map,
4465 SmallVectorImpl<Value *> &TraverseOrder,
4466 SimplificationTracker &ST) {
4467 while (!TraverseOrder.empty()) {
4468 Value *Current = TraverseOrder.pop_back_val();
4469 assert(Map.contains(Current) && "No node to fill!!!");
4470 Value *V = Map[Current];
4471
4472 if (SelectInst *Select = dyn_cast<SelectInst>(Val: V)) {
4473 // CurrentValue also must be Select.
4474 auto *CurrentSelect = cast<SelectInst>(Val: Current);
4475 auto *TrueValue = CurrentSelect->getTrueValue();
4476 assert(Map.contains(TrueValue) && "No True Value!");
4477 Select->setTrueValue(ST.Get(V: Map[TrueValue]));
4478 auto *FalseValue = CurrentSelect->getFalseValue();
4479 assert(Map.contains(FalseValue) && "No False Value!");
4480 Select->setFalseValue(ST.Get(V: Map[FalseValue]));
4481 } else {
4482 // Must be a Phi node then.
4483 auto *PHI = cast<PHINode>(Val: V);
4484 // Fill the Phi node with values from predecessors.
4485 for (auto *B : predecessors(BB: PHI->getParent())) {
4486 Value *PV = cast<PHINode>(Val: Current)->getIncomingValueForBlock(BB: B);
4487 assert(Map.contains(PV) && "No predecessor Value!");
4488 PHI->addIncoming(V: ST.Get(V: Map[PV]), BB: B);
4489 }
4490 }
4491 }
4492 }
4493
4494 /// Starting from original value recursively iterates over def-use chain up to
4495 /// known ending values represented in a map. For each traversed phi/select
4496 /// inserts a placeholder Phi or Select.
4497 /// Reports all new created Phi/Select nodes by adding them to set.
4498 /// Also reports and order in what values have been traversed.
4499 void InsertPlaceholders(FoldAddrToValueMapping &Map,
4500 SmallVectorImpl<Value *> &TraverseOrder,
4501 SimplificationTracker &ST) {
4502 SmallVector<Value *, 32> Worklist;
4503 assert((isa<PHINode>(Original) || isa<SelectInst>(Original)) &&
4504 "Address must be a Phi or Select node");
4505 auto *Dummy = PoisonValue::get(T: CommonType);
4506 Worklist.push_back(Elt: Original);
4507 while (!Worklist.empty()) {
4508 Value *Current = Worklist.pop_back_val();
4509 // if it is already visited or it is an ending value then skip it.
4510 if (Map.contains(Val: Current))
4511 continue;
4512 TraverseOrder.push_back(Elt: Current);
4513
4514 // CurrentValue must be a Phi node or select. All others must be covered
4515 // by anchors.
4516 if (SelectInst *CurrentSelect = dyn_cast<SelectInst>(Val: Current)) {
4517 // Is it OK to get metadata from OrigSelect?!
4518 // Create a Select placeholder with dummy value.
4519 SelectInst *Select =
4520 SelectInst::Create(C: CurrentSelect->getCondition(), S1: Dummy, S2: Dummy,
4521 NameStr: CurrentSelect->getName(),
4522 InsertBefore: CurrentSelect->getIterator(), MDFrom: CurrentSelect);
4523 Map[Current] = Select;
4524 ST.insertNewSelect(SI: Select);
4525 // We are interested in True and False values.
4526 Worklist.push_back(Elt: CurrentSelect->getTrueValue());
4527 Worklist.push_back(Elt: CurrentSelect->getFalseValue());
4528 } else {
4529 // It must be a Phi node then.
4530 PHINode *CurrentPhi = cast<PHINode>(Val: Current);
4531 unsigned PredCount = CurrentPhi->getNumIncomingValues();
4532 PHINode *PHI =
4533 PHINode::Create(Ty: CommonType, NumReservedValues: PredCount, NameStr: "sunk_phi", InsertBefore: CurrentPhi->getIterator());
4534 Map[Current] = PHI;
4535 ST.insertNewPhi(PN: PHI);
4536 append_range(C&: Worklist, R: CurrentPhi->incoming_values());
4537 }
4538 }
4539 }
4540
4541 bool addrModeCombiningAllowed() {
4542 if (DisableComplexAddrModes)
4543 return false;
4544 switch (DifferentField) {
4545 default:
4546 return false;
4547 case ExtAddrMode::BaseRegField:
4548 return AddrSinkCombineBaseReg;
4549 case ExtAddrMode::BaseGVField:
4550 return AddrSinkCombineBaseGV;
4551 case ExtAddrMode::BaseOffsField:
4552 return AddrSinkCombineBaseOffs;
4553 case ExtAddrMode::ScaledRegField:
4554 return AddrSinkCombineScaledReg;
4555 }
4556 }
4557};
4558} // end anonymous namespace
4559
4560/// Try adding ScaleReg*Scale to the current addressing mode.
4561/// Return true and update AddrMode if this addr mode is legal for the target,
4562/// false if not.
4563bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
4564 unsigned Depth) {
4565 // If Scale is 1, then this is the same as adding ScaleReg to the addressing
4566 // mode. Just process that directly.
4567 if (Scale == 1)
4568 return matchAddr(Addr: ScaleReg, Depth);
4569
4570 // If the scale is 0, it takes nothing to add this.
4571 if (Scale == 0)
4572 return true;
4573
4574 // If we already have a scale of this value, we can add to it, otherwise, we
4575 // need an available scale field.
4576 if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
4577 return false;
4578
4579 ExtAddrMode TestAddrMode = AddrMode;
4580
4581 // Add scale to turn X*4+X*3 -> X*7. This could also do things like
4582 // [A+B + A*7] -> [B+A*8].
4583 TestAddrMode.Scale += Scale;
4584 TestAddrMode.ScaledReg = ScaleReg;
4585
4586 // If the new address isn't legal, bail out.
4587 if (!TLI.isLegalAddressingMode(DL, AM: TestAddrMode, Ty: AccessTy, AddrSpace))
4588 return false;
4589
4590 // It was legal, so commit it.
4591 AddrMode = TestAddrMode;
4592
4593 // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now
4594 // to see if ScaleReg is actually X+C. If so, we can turn this into adding
4595 // X*Scale + C*Scale to addr mode. If we found available IV increment, do not
4596 // go any further: we can reuse it and cannot eliminate it.
4597 ConstantInt *CI = nullptr;
4598 Value *AddLHS = nullptr;
4599 if (isa<Instruction>(Val: ScaleReg) && // not a constant expr.
4600 match(V: ScaleReg, P: m_Add(L: m_Value(V&: AddLHS), R: m_ConstantInt(CI))) &&
4601 !isIVIncrement(V: ScaleReg, LI: &LI) && CI->getValue().isSignedIntN(N: 64)) {
4602 TestAddrMode.InBounds = false;
4603 TestAddrMode.ScaledReg = AddLHS;
4604 TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale;
4605
4606 // If this addressing mode is legal, commit it and remember that we folded
4607 // this instruction.
4608 if (TLI.isLegalAddressingMode(DL, AM: TestAddrMode, Ty: AccessTy, AddrSpace)) {
4609 AddrModeInsts.push_back(Elt: cast<Instruction>(Val: ScaleReg));
4610 AddrMode = TestAddrMode;
4611 return true;
4612 }
4613 // Restore status quo.
4614 TestAddrMode = AddrMode;
4615 }
4616
4617 // If this is an add recurrence with a constant step, return the increment
4618 // instruction and the canonicalized step.
4619 auto GetConstantStep =
4620 [this](const Value *V) -> std::optional<std::pair<Instruction *, APInt>> {
4621 auto *PN = dyn_cast<PHINode>(Val: V);
4622 if (!PN)
4623 return std::nullopt;
4624 auto IVInc = getIVIncrement(PN, LI: &LI);
4625 if (!IVInc)
4626 return std::nullopt;
4627 // TODO: The result of the intrinsics above is two-complement. However when
4628 // IV inc is expressed as add or sub, iv.next is potentially a poison value.
4629 // If it has nuw or nsw flags, we need to make sure that these flags are
4630 // inferrable at the point of memory instruction. Otherwise we are replacing
4631 // well-defined two-complement computation with poison. Currently, to avoid
4632 // potentially complex analysis needed to prove this, we reject such cases.
4633 if (auto *OIVInc = dyn_cast<OverflowingBinaryOperator>(Val: IVInc->first))
4634 if (OIVInc->hasNoSignedWrap() || OIVInc->hasNoUnsignedWrap())
4635 return std::nullopt;
4636 if (auto *ConstantStep = dyn_cast<ConstantInt>(Val: IVInc->second))
4637 return std::make_pair(x&: IVInc->first, y: ConstantStep->getValue());
4638 return std::nullopt;
4639 };
4640
4641 // Try to account for the following special case:
4642 // 1. ScaleReg is an inductive variable;
4643 // 2. We use it with non-zero offset;
4644 // 3. IV's increment is available at the point of memory instruction.
4645 //
4646 // In this case, we may reuse the IV increment instead of the IV Phi to
4647 // achieve the following advantages:
4648 // 1. If IV step matches the offset, we will have no need in the offset;
4649 // 2. Even if they don't match, we will reduce the overlap of living IV
4650 // and IV increment, that will potentially lead to better register
4651 // assignment.
4652 if (AddrMode.BaseOffs) {
4653 if (auto IVStep = GetConstantStep(ScaleReg)) {
4654 Instruction *IVInc = IVStep->first;
4655 // The following assert is important to ensure a lack of infinite loops.
4656 // This transforms is (intentionally) the inverse of the one just above.
4657 // If they don't agree on the definition of an increment, we'd alternate
4658 // back and forth indefinitely.
4659 assert(isIVIncrement(IVInc, &LI) && "implied by GetConstantStep");
4660 APInt Step = IVStep->second;
4661 APInt Offset = Step * AddrMode.Scale;
4662 if (Offset.isSignedIntN(N: 64)) {
4663 TestAddrMode.InBounds = false;
4664 TestAddrMode.ScaledReg = IVInc;
4665 TestAddrMode.BaseOffs -= Offset.getLimitedValue();
4666 // If this addressing mode is legal, commit it..
4667 // (Note that we defer the (expensive) domtree base legality check
4668 // to the very last possible point.)
4669 if (TLI.isLegalAddressingMode(DL, AM: TestAddrMode, Ty: AccessTy, AddrSpace) &&
4670 getDTFn().dominates(Def: IVInc, User: MemoryInst)) {
4671 AddrModeInsts.push_back(Elt: cast<Instruction>(Val: IVInc));
4672 AddrMode = TestAddrMode;
4673 return true;
4674 }
4675 // Restore status quo.
4676 TestAddrMode = AddrMode;
4677 }
4678 }
4679 }
4680
4681 // Otherwise, just return what we have.
4682 return true;
4683}
4684
4685/// This is a little filter, which returns true if an addressing computation
4686/// involving I might be folded into a load/store accessing it.
4687/// This doesn't need to be perfect, but needs to accept at least
4688/// the set of instructions that MatchOperationAddr can.
4689static bool MightBeFoldableInst(Instruction *I) {
4690 switch (I->getOpcode()) {
4691 case Instruction::BitCast:
4692 case Instruction::AddrSpaceCast:
4693 // Don't touch identity bitcasts.
4694 if (I->getType() == I->getOperand(i: 0)->getType())
4695 return false;
4696 return I->getType()->isIntOrPtrTy();
4697 case Instruction::PtrToInt:
4698 // PtrToInt is always a noop, as we know that the int type is pointer sized.
4699 return true;
4700 case Instruction::IntToPtr:
4701 // We know the input is intptr_t, so this is foldable.
4702 return true;
4703 case Instruction::Add:
4704 return true;
4705 case Instruction::Mul:
4706 case Instruction::Shl:
4707 // Can only handle X*C and X << C.
4708 return isa<ConstantInt>(Val: I->getOperand(i: 1));
4709 case Instruction::GetElementPtr:
4710 return true;
4711 default:
4712 return false;
4713 }
4714}
4715
4716/// Check whether or not \p Val is a legal instruction for \p TLI.
4717/// \note \p Val is assumed to be the product of some type promotion.
4718/// Therefore if \p Val has an undefined state in \p TLI, this is assumed
4719/// to be legal, as the non-promoted value would have had the same state.
4720static bool isPromotedInstructionLegal(const TargetLowering &TLI,
4721 const DataLayout &DL, Value *Val) {
4722 Instruction *PromotedInst = dyn_cast<Instruction>(Val);
4723 if (!PromotedInst)
4724 return false;
4725 int ISDOpcode = TLI.InstructionOpcodeToISD(Opcode: PromotedInst->getOpcode());
4726 // If the ISDOpcode is undefined, it was undefined before the promotion.
4727 if (!ISDOpcode)
4728 return true;
4729 // Otherwise, check if the promoted instruction is legal or not.
4730 return TLI.isOperationLegalOrCustom(
4731 Op: ISDOpcode, VT: TLI.getValueType(DL, Ty: PromotedInst->getType()));
4732}
4733
4734namespace {
4735
4736/// Hepler class to perform type promotion.
4737class TypePromotionHelper {
4738 /// Utility function to add a promoted instruction \p ExtOpnd to
4739 /// \p PromotedInsts and record the type of extension we have seen.
4740 static void addPromotedInst(InstrToOrigTy &PromotedInsts,
4741 Instruction *ExtOpnd, bool IsSExt) {
4742 ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
4743 auto [It, Inserted] = PromotedInsts.try_emplace(Key: ExtOpnd);
4744 if (!Inserted) {
4745 // If the new extension is same as original, the information in
4746 // PromotedInsts[ExtOpnd] is still correct.
4747 if (It->second.getInt() == ExtTy)
4748 return;
4749
4750 // Now the new extension is different from old extension, we make
4751 // the type information invalid by setting extension type to
4752 // BothExtension.
4753 ExtTy = BothExtension;
4754 }
4755 It->second = TypeIsSExt(ExtOpnd->getType(), ExtTy);
4756 }
4757
4758 /// Utility function to query the original type of instruction \p Opnd
4759 /// with a matched extension type. If the extension doesn't match, we
4760 /// cannot use the information we had on the original type.
4761 /// BothExtension doesn't match any extension type.
4762 static const Type *getOrigType(const InstrToOrigTy &PromotedInsts,
4763 Instruction *Opnd, bool IsSExt) {
4764 ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
4765 InstrToOrigTy::const_iterator It = PromotedInsts.find(Val: Opnd);
4766 if (It != PromotedInsts.end() && It->second.getInt() == ExtTy)
4767 return It->second.getPointer();
4768 return nullptr;
4769 }
4770
4771 /// Utility function to check whether or not a sign or zero extension
4772 /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
4773 /// either using the operands of \p Inst or promoting \p Inst.
4774 /// The type of the extension is defined by \p IsSExt.
4775 /// In other words, check if:
4776 /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
4777 /// #1 Promotion applies:
4778 /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
4779 /// #2 Operand reuses:
4780 /// ext opnd1 to ConsideredExtType.
4781 /// \p PromotedInsts maps the instructions to their type before promotion.
4782 static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
4783 const InstrToOrigTy &PromotedInsts, bool IsSExt);
4784
4785 /// Utility function to determine if \p OpIdx should be promoted when
4786 /// promoting \p Inst.
4787 static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
4788 return !(isa<SelectInst>(Val: Inst) && OpIdx == 0);
4789 }
4790
4791 /// Utility function to promote the operand of \p Ext when this
4792 /// operand is a promotable trunc or sext or zext.
4793 /// \p PromotedInsts maps the instructions to their type before promotion.
4794 /// \p CreatedInstsCost[out] contains the cost of all instructions
4795 /// created to promote the operand of Ext.
4796 /// Newly added extensions are inserted in \p Exts.
4797 /// Newly added truncates are inserted in \p Truncs.
4798 /// Should never be called directly.
4799 /// \return The promoted value which is used instead of Ext.
4800 static Value *promoteOperandForTruncAndAnyExt(
4801 Instruction *Ext, TypePromotionTransaction &TPT,
4802 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
4803 SmallVectorImpl<Instruction *> *Exts,
4804 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);
4805
4806 /// Utility function to promote the operand of \p Ext when this
4807 /// operand is promotable and is not a supported trunc or sext.
4808 /// \p PromotedInsts maps the instructions to their type before promotion.
4809 /// \p CreatedInstsCost[out] contains the cost of all the instructions
4810 /// created to promote the operand of Ext.
4811 /// Newly added extensions are inserted in \p Exts.
4812 /// Newly added truncates are inserted in \p Truncs.
4813 /// Should never be called directly.
4814 /// \return The promoted value which is used instead of Ext.
4815 static Value *promoteOperandForOther(Instruction *Ext,
4816 TypePromotionTransaction &TPT,
4817 InstrToOrigTy &PromotedInsts,
4818 unsigned &CreatedInstsCost,
4819 SmallVectorImpl<Instruction *> *Exts,
4820 SmallVectorImpl<Instruction *> *Truncs,
4821 const TargetLowering &TLI, bool IsSExt);
4822
4823 /// \see promoteOperandForOther.
4824 static Value *signExtendOperandForOther(
4825 Instruction *Ext, TypePromotionTransaction &TPT,
4826 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
4827 SmallVectorImpl<Instruction *> *Exts,
4828 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
4829 return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
4830 Exts, Truncs, TLI, IsSExt: true);
4831 }
4832
4833 /// \see promoteOperandForOther.
4834 static Value *zeroExtendOperandForOther(
4835 Instruction *Ext, TypePromotionTransaction &TPT,
4836 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
4837 SmallVectorImpl<Instruction *> *Exts,
4838 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
4839 return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
4840 Exts, Truncs, TLI, IsSExt: false);
4841 }
4842
4843public:
4844 /// Type for the utility function that promotes the operand of Ext.
4845 using Action = Value *(*)(Instruction *Ext, TypePromotionTransaction &TPT,
4846 InstrToOrigTy &PromotedInsts,
4847 unsigned &CreatedInstsCost,
4848 SmallVectorImpl<Instruction *> *Exts,
4849 SmallVectorImpl<Instruction *> *Truncs,
4850 const TargetLowering &TLI);
4851
4852 /// Given a sign/zero extend instruction \p Ext, return the appropriate
4853 /// action to promote the operand of \p Ext instead of using Ext.
4854 /// \return NULL if no promotable action is possible with the current
4855 /// sign extension.
4856 /// \p InsertedInsts keeps track of all the instructions inserted by the
4857 /// other CodeGenPrepare optimizations. This information is important
4858 /// because we do not want to promote these instructions as CodeGenPrepare
4859 /// will reinsert them later. Thus creating an infinite loop: create/remove.
4860 /// \p PromotedInsts maps the instructions to their type before promotion.
4861 static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,
4862 const TargetLowering &TLI,
4863 const InstrToOrigTy &PromotedInsts);
4864};
4865
4866} // end anonymous namespace
4867
4868bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
4869 Type *ConsideredExtType,
4870 const InstrToOrigTy &PromotedInsts,
4871 bool IsSExt) {
4872 // The promotion helper does not know how to deal with vector types yet.
4873 // To be able to fix that, we would need to fix the places where we
4874 // statically extend, e.g., constants and such.
4875 if (Inst->getType()->isVectorTy())
4876 return false;
4877
4878 // We can always get through zext.
4879 if (isa<ZExtInst>(Val: Inst))
4880 return true;
4881
4882 // sext(sext) is ok too.
4883 if (IsSExt && isa<SExtInst>(Val: Inst))
4884 return true;
4885
4886 // We can get through binary operator, if it is legal. In other words, the
4887 // binary operator must have a nuw or nsw flag.
4888 if (const auto *BinOp = dyn_cast<BinaryOperator>(Val: Inst))
4889 if (isa<OverflowingBinaryOperator>(Val: BinOp) &&
4890 ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
4891 (IsSExt && BinOp->hasNoSignedWrap())))
4892 return true;
4893
4894 // ext(and(opnd, cst)) --> and(ext(opnd), ext(cst))
4895 if ((Inst->getOpcode() == Instruction::And ||
4896 Inst->getOpcode() == Instruction::Or))
4897 return true;
4898
4899 // ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst))
4900 if (Inst->getOpcode() == Instruction::Xor) {
4901 // Make sure it is not a NOT.
4902 if (const auto *Cst = dyn_cast<ConstantInt>(Val: Inst->getOperand(i: 1)))
4903 if (!Cst->getValue().isAllOnes())
4904 return true;
4905 }
4906
4907 // zext(shrl(opnd, cst)) --> shrl(zext(opnd), zext(cst))
4908 // It may change a poisoned value into a regular value, like
4909 // zext i32 (shrl i8 %val, 12) --> shrl i32 (zext i8 %val), 12
4910 // poisoned value regular value
4911 // It should be OK since undef covers valid value.
4912 if (Inst->getOpcode() == Instruction::LShr && !IsSExt)
4913 return true;
4914
4915 // and(ext(shl(opnd, cst)), cst) --> and(shl(ext(opnd), ext(cst)), cst)
4916 // It may change a poisoned value into a regular value, like
4917 // zext i32 (shl i8 %val, 12) --> shl i32 (zext i8 %val), 12
4918 // poisoned value regular value
4919 // It should be OK since undef covers valid value.
4920 if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()) {
4921 const auto *ExtInst = cast<const Instruction>(Val: *Inst->user_begin());
4922 if (ExtInst->hasOneUse()) {
4923 const auto *AndInst = dyn_cast<const Instruction>(Val: *ExtInst->user_begin());
4924 if (AndInst && AndInst->getOpcode() == Instruction::And) {
4925 const auto *Cst = dyn_cast<ConstantInt>(Val: AndInst->getOperand(i: 1));
4926 if (Cst &&
4927 Cst->getValue().isIntN(N: Inst->getType()->getIntegerBitWidth()))
4928 return true;
4929 }
4930 }
4931 }
4932
4933 // Check if we can do the following simplification.
4934 // ext(trunc(opnd)) --> ext(opnd)
4935 if (!isa<TruncInst>(Val: Inst))
4936 return false;
4937
4938 Value *OpndVal = Inst->getOperand(i: 0);
4939 // Check if we can use this operand in the extension.
4940 // If the type is larger than the result type of the extension, we cannot.
4941 if (!OpndVal->getType()->isIntegerTy() ||
4942 OpndVal->getType()->getIntegerBitWidth() >
4943 ConsideredExtType->getIntegerBitWidth())
4944 return false;
4945
4946 // If the operand of the truncate is not an instruction, we will not have
4947 // any information on the dropped bits.
4948 // (Actually we could for constant but it is not worth the extra logic).
4949 Instruction *Opnd = dyn_cast<Instruction>(Val: OpndVal);
4950 if (!Opnd)
4951 return false;
4952
4953 // Check if the source of the type is narrow enough.
4954 // I.e., check that trunc just drops extended bits of the same kind of
4955 // the extension.
4956 // #1 get the type of the operand and check the kind of the extended bits.
4957 const Type *OpndType = getOrigType(PromotedInsts, Opnd, IsSExt);
4958 if (OpndType)
4959 ;
4960 else if ((IsSExt && isa<SExtInst>(Val: Opnd)) || (!IsSExt && isa<ZExtInst>(Val: Opnd)))
4961 OpndType = Opnd->getOperand(i: 0)->getType();
4962 else
4963 return false;
4964
4965 // #2 check that the truncate just drops extended bits.
4966 return Inst->getType()->getIntegerBitWidth() >=
4967 OpndType->getIntegerBitWidth();
4968}
4969
4970TypePromotionHelper::Action TypePromotionHelper::getAction(
4971 Instruction *Ext, const SetOfInstrs &InsertedInsts,
4972 const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
4973 assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
4974 "Unexpected instruction type");
4975 Instruction *ExtOpnd = dyn_cast<Instruction>(Val: Ext->getOperand(i: 0));
4976 Type *ExtTy = Ext->getType();
4977 bool IsSExt = isa<SExtInst>(Val: Ext);
4978 // If the operand of the extension is not an instruction, we cannot
4979 // get through.
4980 // If it, check we can get through.
4981 if (!ExtOpnd || !canGetThrough(Inst: ExtOpnd, ConsideredExtType: ExtTy, PromotedInsts, IsSExt))
4982 return nullptr;
4983
4984 // Do not promote if the operand has been added by codegenprepare.
4985 // Otherwise, it means we are undoing an optimization that is likely to be
4986 // redone, thus causing potential infinite loop.
4987 if (isa<TruncInst>(Val: ExtOpnd) && InsertedInsts.count(Ptr: ExtOpnd))
4988 return nullptr;
4989
4990 // SExt or Trunc instructions.
4991 // Return the related handler.
4992 if (isa<SExtInst>(Val: ExtOpnd) || isa<TruncInst>(Val: ExtOpnd) ||
4993 isa<ZExtInst>(Val: ExtOpnd))
4994 return promoteOperandForTruncAndAnyExt;
4995
4996 // Regular instruction.
4997 // Abort early if we will have to insert non-free instructions.
4998 if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(FromTy: ExtTy, ToTy: ExtOpnd->getType()))
4999 return nullptr;
5000 return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther;
5001}
5002
5003Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
5004 Instruction *SExt, TypePromotionTransaction &TPT,
5005 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
5006 SmallVectorImpl<Instruction *> *Exts,
5007 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
5008 // By construction, the operand of SExt is an instruction. Otherwise we cannot
5009 // get through it and this method should not be called.
5010 Instruction *SExtOpnd = cast<Instruction>(Val: SExt->getOperand(i: 0));
5011 Value *ExtVal = SExt;
5012 bool HasMergedNonFreeExt = false;
5013 if (isa<ZExtInst>(Val: SExtOpnd)) {
5014 // Replace s|zext(zext(opnd))
5015 // => zext(opnd).
5016 HasMergedNonFreeExt = !TLI.isExtFree(I: SExtOpnd);
5017 Value *ZExt =
5018 TPT.createZExt(Inst: SExt, Opnd: SExtOpnd->getOperand(i: 0), Ty: SExt->getType());
5019 TPT.replaceAllUsesWith(Inst: SExt, New: ZExt);
5020 TPT.eraseInstruction(Inst: SExt);
5021 ExtVal = ZExt;
5022 } else {
5023 // Replace z|sext(trunc(opnd)) or sext(sext(opnd))
5024 // => z|sext(opnd).
5025 TPT.setOperand(Inst: SExt, Idx: 0, NewVal: SExtOpnd->getOperand(i: 0));
5026 }
5027 CreatedInstsCost = 0;
5028
5029 // Remove dead code.
5030 if (SExtOpnd->use_empty())
5031 TPT.eraseInstruction(Inst: SExtOpnd);
5032
5033 // Check if the extension is still needed.
5034 Instruction *ExtInst = dyn_cast<Instruction>(Val: ExtVal);
5035 if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(i: 0)->getType()) {
5036 if (ExtInst) {
5037 if (Exts)
5038 Exts->push_back(Elt: ExtInst);
5039 CreatedInstsCost = !TLI.isExtFree(I: ExtInst) && !HasMergedNonFreeExt;
5040 }
5041 return ExtVal;
5042 }
5043
5044 // At this point we have: ext ty opnd to ty.
5045 // Reassign the uses of ExtInst to the opnd and remove ExtInst.
5046 Value *NextVal = ExtInst->getOperand(i: 0);
5047 TPT.eraseInstruction(Inst: ExtInst, NewVal: NextVal);
5048 return NextVal;
5049}
5050
5051Value *TypePromotionHelper::promoteOperandForOther(
5052 Instruction *Ext, TypePromotionTransaction &TPT,
5053 InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
5054 SmallVectorImpl<Instruction *> *Exts,
5055 SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,
5056 bool IsSExt) {
5057 // By construction, the operand of Ext is an instruction. Otherwise we cannot
5058 // get through it and this method should not be called.
5059 Instruction *ExtOpnd = cast<Instruction>(Val: Ext->getOperand(i: 0));
5060 CreatedInstsCost = 0;
5061 if (!ExtOpnd->hasOneUse()) {
5062 // ExtOpnd will be promoted.
5063 // All its uses, but Ext, will need to use a truncated value of the
5064 // promoted version.
5065 // Create the truncate now.
5066 Value *Trunc = TPT.createTrunc(Opnd: Ext, Ty: ExtOpnd->getType());
5067 if (Instruction *ITrunc = dyn_cast<Instruction>(Val: Trunc)) {
5068 // Insert it just after the definition.
5069 ITrunc->moveAfter(MovePos: ExtOpnd);
5070 if (Truncs)
5071 Truncs->push_back(Elt: ITrunc);
5072 }
5073
5074 TPT.replaceAllUsesWith(Inst: ExtOpnd, New: Trunc);
5075 // Restore the operand of Ext (which has been replaced by the previous call
5076 // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
5077 TPT.setOperand(Inst: Ext, Idx: 0, NewVal: ExtOpnd);
5078 }
5079
5080 // Get through the Instruction:
5081 // 1. Update its type.
5082 // 2. Replace the uses of Ext by Inst.
5083 // 3. Extend each operand that needs to be extended.
5084
5085 // Remember the original type of the instruction before promotion.
5086 // This is useful to know that the high bits are sign extended bits.
5087 addPromotedInst(PromotedInsts, ExtOpnd, IsSExt);
5088 // Step #1.
5089 TPT.mutateType(Inst: ExtOpnd, NewTy: Ext->getType());
5090 // Step #2.
5091 TPT.replaceAllUsesWith(Inst: Ext, New: ExtOpnd);
5092 // Step #3.
5093 LLVM_DEBUG(dbgs() << "Propagate Ext to operands\n");
5094 for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
5095 ++OpIdx) {
5096 LLVM_DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
5097 if (ExtOpnd->getOperand(i: OpIdx)->getType() == Ext->getType() ||
5098 !shouldExtOperand(Inst: ExtOpnd, OpIdx)) {
5099 LLVM_DEBUG(dbgs() << "No need to propagate\n");
5100 continue;
5101 }
5102 // Check if we can statically extend the operand.
5103 Value *Opnd = ExtOpnd->getOperand(i: OpIdx);
5104 if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Val: Opnd)) {
5105 LLVM_DEBUG(dbgs() << "Statically extend\n");
5106 unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
5107 APInt CstVal = IsSExt ? Cst->getValue().sext(width: BitWidth)
5108 : Cst->getValue().zext(width: BitWidth);
5109 TPT.setOperand(Inst: ExtOpnd, Idx: OpIdx, NewVal: ConstantInt::get(Ty: Ext->getType(), V: CstVal));
5110 continue;
5111 }
5112 // UndefValue are typed, so we have to statically sign extend them.
5113 if (isa<UndefValue>(Val: Opnd)) {
5114 LLVM_DEBUG(dbgs() << "Statically extend\n");
5115 TPT.setOperand(Inst: ExtOpnd, Idx: OpIdx, NewVal: UndefValue::get(T: Ext->getType()));
5116 continue;
5117 }
5118
5119 // Otherwise we have to explicitly sign extend the operand.
5120 Value *ValForExtOpnd = IsSExt
5121 ? TPT.createSExt(Inst: ExtOpnd, Opnd, Ty: Ext->getType())
5122 : TPT.createZExt(Inst: ExtOpnd, Opnd, Ty: Ext->getType());
5123 TPT.setOperand(Inst: ExtOpnd, Idx: OpIdx, NewVal: ValForExtOpnd);
5124 Instruction *InstForExtOpnd = dyn_cast<Instruction>(Val: ValForExtOpnd);
5125 if (!InstForExtOpnd)
5126 continue;
5127
5128 if (Exts)
5129 Exts->push_back(Elt: InstForExtOpnd);
5130
5131 CreatedInstsCost += !TLI.isExtFree(I: InstForExtOpnd);
5132 }
5133 LLVM_DEBUG(dbgs() << "Extension is useless now\n");
5134 TPT.eraseInstruction(Inst: Ext);
5135 return ExtOpnd;
5136}
5137
5138/// Check whether or not promoting an instruction to a wider type is profitable.
5139/// \p NewCost gives the cost of extension instructions created by the
5140/// promotion.
5141/// \p OldCost gives the cost of extension instructions before the promotion
5142/// plus the number of instructions that have been
5143/// matched in the addressing mode the promotion.
5144/// \p PromotedOperand is the value that has been promoted.
5145/// \return True if the promotion is profitable, false otherwise.
5146bool AddressingModeMatcher::isPromotionProfitable(
5147 unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
5148 LLVM_DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost
5149 << '\n');
5150 // The cost of the new extensions is greater than the cost of the
5151 // old extension plus what we folded.
5152 // This is not profitable.
5153 if (NewCost > OldCost)
5154 return false;
5155 if (NewCost < OldCost)
5156 return true;
5157 // The promotion is neutral but it may help folding the sign extension in
5158 // loads for instance.
5159 // Check that we did not create an illegal instruction.
5160 return isPromotedInstructionLegal(TLI, DL, Val: PromotedOperand);
5161}
5162
5163/// Given an instruction or constant expr, see if we can fold the operation
5164/// into the addressing mode. If so, update the addressing mode and return
5165/// true, otherwise return false without modifying AddrMode.
5166/// If \p MovedAway is not NULL, it contains the information of whether or
5167/// not AddrInst has to be folded into the addressing mode on success.
5168/// If \p MovedAway == true, \p AddrInst will not be part of the addressing
5169/// because it has been moved away.
5170/// Thus AddrInst must not be added in the matched instructions.
5171/// This state can happen when AddrInst is a sext, since it may be moved away.
5172/// Therefore, AddrInst may not be valid when MovedAway is true and it must
5173/// not be referenced anymore.
5174bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
5175 unsigned Depth,
5176 bool *MovedAway) {
5177 // Avoid exponential behavior on extremely deep expression trees.
5178 if (Depth >= 5)
5179 return false;
5180
5181 // By default, all matched instructions stay in place.
5182 if (MovedAway)
5183 *MovedAway = false;
5184
5185 switch (Opcode) {
5186 case Instruction::PtrToInt:
5187 // PtrToInt is always a noop, as we know that the int type is pointer sized.
5188 return matchAddr(Addr: AddrInst->getOperand(i: 0), Depth);
5189 case Instruction::IntToPtr: {
5190 auto AS = AddrInst->getType()->getPointerAddressSpace();
5191 auto PtrTy = MVT::getIntegerVT(BitWidth: DL.getPointerSizeInBits(AS));
5192 // This inttoptr is a no-op if the integer type is pointer sized.
5193 if (TLI.getValueType(DL, Ty: AddrInst->getOperand(i: 0)->getType()) == PtrTy)
5194 return matchAddr(Addr: AddrInst->getOperand(i: 0), Depth);
5195 return false;
5196 }
5197 case Instruction::BitCast:
5198 // BitCast is always a noop, and we can handle it as long as it is
5199 // int->int or pointer->pointer (we don't want int<->fp or something).
5200 if (AddrInst->getOperand(i: 0)->getType()->isIntOrPtrTy() &&
5201 // Don't touch identity bitcasts. These were probably put here by LSR,
5202 // and we don't want to mess around with them. Assume it knows what it
5203 // is doing.
5204 AddrInst->getOperand(i: 0)->getType() != AddrInst->getType())
5205 return matchAddr(Addr: AddrInst->getOperand(i: 0), Depth);
5206 return false;
5207 case Instruction::AddrSpaceCast: {
5208 unsigned SrcAS =
5209 AddrInst->getOperand(i: 0)->getType()->getPointerAddressSpace();
5210 unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
5211 if (TLI.getTargetMachine().isNoopAddrSpaceCast(SrcAS, DestAS))
5212 return matchAddr(Addr: AddrInst->getOperand(i: 0), Depth);
5213 return false;
5214 }
5215 case Instruction::Add: {
5216 // Check to see if we can merge in one operand, then the other. If so, we
5217 // win.
5218 ExtAddrMode BackupAddrMode = AddrMode;
5219 unsigned OldSize = AddrModeInsts.size();
5220 // Start a transaction at this point.
5221 // The LHS may match but not the RHS.
5222 // Therefore, we need a higher level restoration point to undo partially
5223 // matched operation.
5224 TypePromotionTransaction::ConstRestorationPt LastKnownGood =
5225 TPT.getRestorationPoint();
5226
5227 // Try to match an integer constant second to increase its chance of ending
5228 // up in `BaseOffs`, resp. decrease its chance of ending up in `BaseReg`.
5229 int First = 0, Second = 1;
5230 if (isa<ConstantInt>(Val: AddrInst->getOperand(i: First))
5231 && !isa<ConstantInt>(Val: AddrInst->getOperand(i: Second)))
5232 std::swap(a&: First, b&: Second);
5233 AddrMode.InBounds = false;
5234 if (matchAddr(Addr: AddrInst->getOperand(i: First), Depth: Depth + 1) &&
5235 matchAddr(Addr: AddrInst->getOperand(i: Second), Depth: Depth + 1))
5236 return true;
5237
5238 // Restore the old addr mode info.
5239 AddrMode = BackupAddrMode;
5240 AddrModeInsts.resize(N: OldSize);
5241 TPT.rollback(Point: LastKnownGood);
5242
5243 // Otherwise this was over-aggressive. Try merging operands in the opposite
5244 // order.
5245 if (matchAddr(Addr: AddrInst->getOperand(i: Second), Depth: Depth + 1) &&
5246 matchAddr(Addr: AddrInst->getOperand(i: First), Depth: Depth + 1))
5247 return true;
5248
5249 // Otherwise we definitely can't merge the ADD in.
5250 AddrMode = BackupAddrMode;
5251 AddrModeInsts.resize(N: OldSize);
5252 TPT.rollback(Point: LastKnownGood);
5253 break;
5254 }
5255 // case Instruction::Or:
5256 // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
5257 // break;
5258 case Instruction::Mul:
5259 case Instruction::Shl: {
5260 // Can only handle X*C and X << C.
5261 AddrMode.InBounds = false;
5262 ConstantInt *RHS = dyn_cast<ConstantInt>(Val: AddrInst->getOperand(i: 1));
5263 if (!RHS || RHS->getBitWidth() > 64)
5264 return false;
5265 int64_t Scale = Opcode == Instruction::Shl
5266 ? 1LL << RHS->getLimitedValue(Limit: RHS->getBitWidth() - 1)
5267 : RHS->getSExtValue();
5268
5269 return matchScaledValue(ScaleReg: AddrInst->getOperand(i: 0), Scale, Depth);
5270 }
5271 case Instruction::GetElementPtr: {
5272 // Scan the GEP. We check it if it contains constant offsets and at most
5273 // one variable offset.
5274 int VariableOperand = -1;
5275 unsigned VariableScale = 0;
5276
5277 int64_t ConstantOffset = 0;
5278 gep_type_iterator GTI = gep_type_begin(GEP: AddrInst);
5279 for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
5280 if (StructType *STy = GTI.getStructTypeOrNull()) {
5281 const StructLayout *SL = DL.getStructLayout(Ty: STy);
5282 unsigned Idx =
5283 cast<ConstantInt>(Val: AddrInst->getOperand(i))->getZExtValue();
5284 ConstantOffset += SL->getElementOffset(Idx);
5285 } else {
5286 TypeSize TS = GTI.getSequentialElementStride(DL);
5287 if (TS.isNonZero()) {
5288 // The optimisations below currently only work for fixed offsets.
5289 if (TS.isScalable())
5290 return false;
5291 int64_t TypeSize = TS.getFixedValue();
5292 if (ConstantInt *CI =
5293 dyn_cast<ConstantInt>(Val: AddrInst->getOperand(i))) {
5294 const APInt &CVal = CI->getValue();
5295 if (CVal.getSignificantBits() <= 64) {
5296 ConstantOffset += CVal.getSExtValue() * TypeSize;
5297 continue;
5298 }
5299 }
5300 // We only allow one variable index at the moment.
5301 if (VariableOperand != -1)
5302 return false;
5303
5304 // Remember the variable index.
5305 VariableOperand = i;
5306 VariableScale = TypeSize;
5307 }
5308 }
5309 }
5310
5311 // A common case is for the GEP to only do a constant offset. In this case,
5312 // just add it to the disp field and check validity.
5313 if (VariableOperand == -1) {
5314 AddrMode.BaseOffs += ConstantOffset;
5315 if (matchAddr(Addr: AddrInst->getOperand(i: 0), Depth: Depth + 1)) {
5316 if (!cast<GEPOperator>(Val: AddrInst)->isInBounds())
5317 AddrMode.InBounds = false;
5318 return true;
5319 }
5320 AddrMode.BaseOffs -= ConstantOffset;
5321
5322 if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(Val: AddrInst) &&
5323 TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 &&
5324 ConstantOffset > 0) {
5325 // Record GEPs with non-zero offsets as candidates for splitting in
5326 // the event that the offset cannot fit into the r+i addressing mode.
5327 // Simple and common case that only one GEP is used in calculating the
5328 // address for the memory access.
5329 Value *Base = AddrInst->getOperand(i: 0);
5330 auto *BaseI = dyn_cast<Instruction>(Val: Base);
5331 auto *GEP = cast<GetElementPtrInst>(Val: AddrInst);
5332 if (isa<Argument>(Val: Base) || isa<GlobalValue>(Val: Base) ||
5333 (BaseI && !isa<CastInst>(Val: BaseI) &&
5334 !isa<GetElementPtrInst>(Val: BaseI))) {
5335 // Make sure the parent block allows inserting non-PHI instructions
5336 // before the terminator.
5337 BasicBlock *Parent = BaseI ? BaseI->getParent()
5338 : &GEP->getFunction()->getEntryBlock();
5339 if (!Parent->getTerminator()->isEHPad())
5340 LargeOffsetGEP = std::make_pair(x&: GEP, y&: ConstantOffset);
5341 }
5342 }
5343
5344 return false;
5345 }
5346
5347 // Save the valid addressing mode in case we can't match.
5348 ExtAddrMode BackupAddrMode = AddrMode;
5349 unsigned OldSize = AddrModeInsts.size();
5350
5351 // See if the scale and offset amount is valid for this target.
5352 AddrMode.BaseOffs += ConstantOffset;
5353 if (!cast<GEPOperator>(Val: AddrInst)->isInBounds())
5354 AddrMode.InBounds = false;
5355
5356 // Match the base operand of the GEP.
5357 if (!matchAddr(Addr: AddrInst->getOperand(i: 0), Depth: Depth + 1)) {
5358 // If it couldn't be matched, just stuff the value in a register.
5359 if (AddrMode.HasBaseReg) {
5360 AddrMode = BackupAddrMode;
5361 AddrModeInsts.resize(N: OldSize);
5362 return false;
5363 }
5364 AddrMode.HasBaseReg = true;
5365 AddrMode.BaseReg = AddrInst->getOperand(i: 0);
5366 }
5367
5368 // Match the remaining variable portion of the GEP.
5369 if (!matchScaledValue(ScaleReg: AddrInst->getOperand(i: VariableOperand), Scale: VariableScale,
5370 Depth)) {
5371 // If it couldn't be matched, try stuffing the base into a register
5372 // instead of matching it, and retrying the match of the scale.
5373 AddrMode = BackupAddrMode;
5374 AddrModeInsts.resize(N: OldSize);
5375 if (AddrMode.HasBaseReg)
5376 return false;
5377 AddrMode.HasBaseReg = true;
5378 AddrMode.BaseReg = AddrInst->getOperand(i: 0);
5379 AddrMode.BaseOffs += ConstantOffset;
5380 if (!matchScaledValue(ScaleReg: AddrInst->getOperand(i: VariableOperand),
5381 Scale: VariableScale, Depth)) {
5382 // If even that didn't work, bail.
5383 AddrMode = BackupAddrMode;
5384 AddrModeInsts.resize(N: OldSize);
5385 return false;
5386 }
5387 }
5388
5389 return true;
5390 }
5391 case Instruction::SExt:
5392 case Instruction::ZExt: {
5393 Instruction *Ext = dyn_cast<Instruction>(Val: AddrInst);
5394 if (!Ext)
5395 return false;
5396
5397 // Try to move this ext out of the way of the addressing mode.
5398 // Ask for a method for doing so.
5399 TypePromotionHelper::Action TPH =
5400 TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
5401 if (!TPH)
5402 return false;
5403
5404 TypePromotionTransaction::ConstRestorationPt LastKnownGood =
5405 TPT.getRestorationPoint();
5406 unsigned CreatedInstsCost = 0;
5407 unsigned ExtCost = !TLI.isExtFree(I: Ext);
5408 Value *PromotedOperand =
5409 TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);
5410 // SExt has been moved away.
5411 // Thus either it will be rematched later in the recursive calls or it is
5412 // gone. Anyway, we must not fold it into the addressing mode at this point.
5413 // E.g.,
5414 // op = add opnd, 1
5415 // idx = ext op
5416 // addr = gep base, idx
5417 // is now:
5418 // promotedOpnd = ext opnd <- no match here
5419 // op = promoted_add promotedOpnd, 1 <- match (later in recursive calls)
5420 // addr = gep base, op <- match
5421 if (MovedAway)
5422 *MovedAway = true;
5423
5424 assert(PromotedOperand &&
5425 "TypePromotionHelper should have filtered out those cases");
5426
5427 ExtAddrMode BackupAddrMode = AddrMode;
5428 unsigned OldSize = AddrModeInsts.size();
5429
5430 if (!matchAddr(Addr: PromotedOperand, Depth) ||
5431 // The total of the new cost is equal to the cost of the created
5432 // instructions.
5433 // The total of the old cost is equal to the cost of the extension plus
5434 // what we have saved in the addressing mode.
5435 !isPromotionProfitable(NewCost: CreatedInstsCost,
5436 OldCost: ExtCost + (AddrModeInsts.size() - OldSize),
5437 PromotedOperand)) {
5438 AddrMode = BackupAddrMode;
5439 AddrModeInsts.resize(N: OldSize);
5440 LLVM_DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
5441 TPT.rollback(Point: LastKnownGood);
5442 return false;
5443 }
5444
5445 // SExt has been deleted. Make sure it is not referenced by the AddrMode.
5446 AddrMode.replaceWith(From: Ext, To: PromotedOperand);
5447 return true;
5448 }
5449 case Instruction::Call:
5450 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: AddrInst)) {
5451 if (II->getIntrinsicID() == Intrinsic::threadlocal_address) {
5452 GlobalValue &GV = cast<GlobalValue>(Val&: *II->getArgOperand(i: 0));
5453 if (TLI.addressingModeSupportsTLS(GV))
5454 return matchAddr(Addr: AddrInst->getOperand(i: 0), Depth);
5455 }
5456 }
5457 break;
5458 }
5459 return false;
5460}
5461
5462/// If we can, try to add the value of 'Addr' into the current addressing mode.
5463/// If Addr can't be added to AddrMode this returns false and leaves AddrMode
5464/// unmodified. This assumes that Addr is either a pointer type or intptr_t
5465/// for the target.
5466///
5467bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
5468 // Start a transaction at this point that we will rollback if the matching
5469 // fails.
5470 TypePromotionTransaction::ConstRestorationPt LastKnownGood =
5471 TPT.getRestorationPoint();
5472 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: Addr)) {
5473 if (CI->getValue().isSignedIntN(N: 64)) {
5474 // Check if the addition would result in a signed overflow.
5475 int64_t Result;
5476 bool Overflow =
5477 AddOverflow(X: AddrMode.BaseOffs, Y: CI->getSExtValue(), Result);
5478 if (!Overflow) {
5479 // Fold in immediates if legal for the target.
5480 AddrMode.BaseOffs = Result;
5481 if (TLI.isLegalAddressingMode(DL, AM: AddrMode, Ty: AccessTy, AddrSpace))
5482 return true;
5483 AddrMode.BaseOffs -= CI->getSExtValue();
5484 }
5485 }
5486 } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Val: Addr)) {
5487 // If this is a global variable, try to fold it into the addressing mode.
5488 if (!AddrMode.BaseGV) {
5489 AddrMode.BaseGV = GV;
5490 if (TLI.isLegalAddressingMode(DL, AM: AddrMode, Ty: AccessTy, AddrSpace))
5491 return true;
5492 AddrMode.BaseGV = nullptr;
5493 }
5494 } else if (Instruction *I = dyn_cast<Instruction>(Val: Addr)) {
5495 ExtAddrMode BackupAddrMode = AddrMode;
5496 unsigned OldSize = AddrModeInsts.size();
5497
5498 // Check to see if it is possible to fold this operation.
5499 bool MovedAway = false;
5500 if (matchOperationAddr(AddrInst: I, Opcode: I->getOpcode(), Depth, MovedAway: &MovedAway)) {
5501 // This instruction may have been moved away. If so, there is nothing
5502 // to check here.
5503 if (MovedAway)
5504 return true;
5505 // Okay, it's possible to fold this. Check to see if it is actually
5506 // *profitable* to do so. We use a simple cost model to avoid increasing
5507 // register pressure too much.
5508 if (I->hasOneUse() ||
5509 isProfitableToFoldIntoAddressingMode(I, AMBefore&: BackupAddrMode, AMAfter&: AddrMode)) {
5510 AddrModeInsts.push_back(Elt: I);
5511 return true;
5512 }
5513
5514 // It isn't profitable to do this, roll back.
5515 AddrMode = BackupAddrMode;
5516 AddrModeInsts.resize(N: OldSize);
5517 TPT.rollback(Point: LastKnownGood);
5518 }
5519 } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Val: Addr)) {
5520 if (matchOperationAddr(AddrInst: CE, Opcode: CE->getOpcode(), Depth))
5521 return true;
5522 TPT.rollback(Point: LastKnownGood);
5523 } else if (isa<ConstantPointerNull>(Val: Addr)) {
5524 // Null pointer gets folded without affecting the addressing mode.
5525 return true;
5526 }
5527
5528 // Worse case, the target should support [reg] addressing modes. :)
5529 if (!AddrMode.HasBaseReg) {
5530 AddrMode.HasBaseReg = true;
5531 AddrMode.BaseReg = Addr;
5532 // Still check for legality in case the target supports [imm] but not [i+r].
5533 if (TLI.isLegalAddressingMode(DL, AM: AddrMode, Ty: AccessTy, AddrSpace))
5534 return true;
5535 AddrMode.HasBaseReg = false;
5536 AddrMode.BaseReg = nullptr;
5537 }
5538
5539 // If the base register is already taken, see if we can do [r+r].
5540 if (AddrMode.Scale == 0) {
5541 AddrMode.Scale = 1;
5542 AddrMode.ScaledReg = Addr;
5543 if (TLI.isLegalAddressingMode(DL, AM: AddrMode, Ty: AccessTy, AddrSpace))
5544 return true;
5545 AddrMode.Scale = 0;
5546 AddrMode.ScaledReg = nullptr;
5547 }
5548 // Couldn't match.
5549 TPT.rollback(Point: LastKnownGood);
5550 return false;
5551}
5552
5553/// Check to see if all uses of OpVal by the specified inline asm call are due
5554/// to memory operands. If so, return true, otherwise return false.
5555static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
5556 const TargetLowering &TLI,
5557 const TargetRegisterInfo &TRI) {
5558 const Function *F = CI->getFunction();
5559 TargetLowering::AsmOperandInfoVector TargetConstraints =
5560 TLI.ParseConstraints(DL: F->getDataLayout(), TRI: &TRI, Call: *CI);
5561
5562 for (TargetLowering::AsmOperandInfo &OpInfo : TargetConstraints) {
5563 // Compute the constraint code and ConstraintType to use.
5564 TLI.ComputeConstraintToUse(OpInfo, Op: SDValue());
5565
5566 // If this asm operand is our Value*, and if it isn't an indirect memory
5567 // operand, we can't fold it! TODO: Also handle C_Address?
5568 if (OpInfo.CallOperandVal == OpVal &&
5569 (OpInfo.ConstraintType != TargetLowering::C_Memory ||
5570 !OpInfo.isIndirect))
5571 return false;
5572 }
5573
5574 return true;
5575}
5576
5577/// Recursively walk all the uses of I until we find a memory use.
5578/// If we find an obviously non-foldable instruction, return true.
5579/// Add accessed addresses and types to MemoryUses.
5580static bool FindAllMemoryUses(
5581 Instruction *I, SmallVectorImpl<std::pair<Use *, Type *>> &MemoryUses,
5582 SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
5583 const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI,
5584 BlockFrequencyInfo *BFI, unsigned &SeenInsts) {
5585 // If we already considered this instruction, we're done.
5586 if (!ConsideredInsts.insert(Ptr: I).second)
5587 return false;
5588
5589 // If this is an obviously unfoldable instruction, bail out.
5590 if (!MightBeFoldableInst(I))
5591 return true;
5592
5593 // Loop over all the uses, recursively processing them.
5594 for (Use &U : I->uses()) {
5595 // Conservatively return true if we're seeing a large number or a deep chain
5596 // of users. This avoids excessive compilation times in pathological cases.
5597 if (SeenInsts++ >= MaxAddressUsersToScan)
5598 return true;
5599
5600 Instruction *UserI = cast<Instruction>(Val: U.getUser());
5601 if (LoadInst *LI = dyn_cast<LoadInst>(Val: UserI)) {
5602 MemoryUses.push_back(Elt: {&U, LI->getType()});
5603 continue;
5604 }
5605
5606 if (StoreInst *SI = dyn_cast<StoreInst>(Val: UserI)) {
5607 if (U.getOperandNo() != StoreInst::getPointerOperandIndex())
5608 return true; // Storing addr, not into addr.
5609 MemoryUses.push_back(Elt: {&U, SI->getValueOperand()->getType()});
5610 continue;
5611 }
5612
5613 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UserI)) {
5614 if (U.getOperandNo() != AtomicRMWInst::getPointerOperandIndex())
5615 return true; // Storing addr, not into addr.
5616 MemoryUses.push_back(Elt: {&U, RMW->getValOperand()->getType()});
5617 continue;
5618 }
5619
5620 if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Val: UserI)) {
5621 if (U.getOperandNo() != AtomicCmpXchgInst::getPointerOperandIndex())
5622 return true; // Storing addr, not into addr.
5623 MemoryUses.push_back(Elt: {&U, CmpX->getCompareOperand()->getType()});
5624 continue;
5625 }
5626
5627 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: UserI)) {
5628 SmallVector<Value *, 2> PtrOps;
5629 Type *AccessTy;
5630 if (!TLI.getAddrModeArguments(II, PtrOps, AccessTy))
5631 return true;
5632
5633 if (!find(Range&: PtrOps, Val: U.get()))
5634 return true;
5635
5636 MemoryUses.push_back(Elt: {&U, AccessTy});
5637 continue;
5638 }
5639
5640 if (CallInst *CI = dyn_cast<CallInst>(Val: UserI)) {
5641 if (CI->hasFnAttr(Kind: Attribute::Cold)) {
5642 // If this is a cold call, we can sink the addressing calculation into
5643 // the cold path. See optimizeCallInst
5644 if (!llvm::shouldOptimizeForSize(BB: CI->getParent(), PSI, BFI))
5645 continue;
5646 }
5647
5648 InlineAsm *IA = dyn_cast<InlineAsm>(Val: CI->getCalledOperand());
5649 if (!IA)
5650 return true;
5651
5652 // If this is a memory operand, we're cool, otherwise bail out.
5653 if (!IsOperandAMemoryOperand(CI, IA, OpVal: I, TLI, TRI))
5654 return true;
5655 continue;
5656 }
5657
5658 if (FindAllMemoryUses(I: UserI, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
5659 PSI, BFI, SeenInsts))
5660 return true;
5661 }
5662
5663 return false;
5664}
5665
5666static bool FindAllMemoryUses(
5667 Instruction *I, SmallVectorImpl<std::pair<Use *, Type *>> &MemoryUses,
5668 const TargetLowering &TLI, const TargetRegisterInfo &TRI, bool OptSize,
5669 ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
5670 unsigned SeenInsts = 0;
5671 SmallPtrSet<Instruction *, 16> ConsideredInsts;
5672 return FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
5673 PSI, BFI, SeenInsts);
5674}
5675
5676
5677/// Return true if Val is already known to be live at the use site that we're
5678/// folding it into. If so, there is no cost to include it in the addressing
5679/// mode. KnownLive1 and KnownLive2 are two values that we know are live at the
5680/// instruction already.
5681bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,
5682 Value *KnownLive1,
5683 Value *KnownLive2) {
5684 // If Val is either of the known-live values, we know it is live!
5685 if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2)
5686 return true;
5687
5688 // All values other than instructions and arguments (e.g. constants) are live.
5689 if (!isa<Instruction>(Val) && !isa<Argument>(Val))
5690 return true;
5691
5692 // If Val is a constant sized alloca in the entry block, it is live, this is
5693 // true because it is just a reference to the stack/frame pointer, which is
5694 // live for the whole function.
5695 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
5696 if (AI->isStaticAlloca())
5697 return true;
5698
5699 // Check to see if this value is already used in the memory instruction's
5700 // block. If so, it's already live into the block at the very least, so we
5701 // can reasonably fold it.
5702 return Val->isUsedInBasicBlock(BB: MemoryInst->getParent());
5703}
5704
5705/// It is possible for the addressing mode of the machine to fold the specified
5706/// instruction into a load or store that ultimately uses it.
5707/// However, the specified instruction has multiple uses.
5708/// Given this, it may actually increase register pressure to fold it
5709/// into the load. For example, consider this code:
5710///
5711/// X = ...
5712/// Y = X+1
5713/// use(Y) -> nonload/store
5714/// Z = Y+1
5715/// load Z
5716///
5717/// In this case, Y has multiple uses, and can be folded into the load of Z
5718/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to
5719/// be live at the use(Y) line. If we don't fold Y into load Z, we use one
5720/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the
5721/// number of computations either.
5722///
5723/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If
5724/// X was live across 'load Z' for other reasons, we actually *would* want to
5725/// fold the addressing mode in the Z case. This would make Y die earlier.
5726bool AddressingModeMatcher::isProfitableToFoldIntoAddressingMode(
5727 Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter) {
5728 if (IgnoreProfitability)
5729 return true;
5730
5731 // AMBefore is the addressing mode before this instruction was folded into it,
5732 // and AMAfter is the addressing mode after the instruction was folded. Get
5733 // the set of registers referenced by AMAfter and subtract out those
5734 // referenced by AMBefore: this is the set of values which folding in this
5735 // address extends the lifetime of.
5736 //
5737 // Note that there are only two potential values being referenced here,
5738 // BaseReg and ScaleReg (global addresses are always available, as are any
5739 // folded immediates).
5740 Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
5741
5742 // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
5743 // lifetime wasn't extended by adding this instruction.
5744 if (valueAlreadyLiveAtInst(Val: BaseReg, KnownLive1: AMBefore.BaseReg, KnownLive2: AMBefore.ScaledReg))
5745 BaseReg = nullptr;
5746 if (valueAlreadyLiveAtInst(Val: ScaledReg, KnownLive1: AMBefore.BaseReg, KnownLive2: AMBefore.ScaledReg))
5747 ScaledReg = nullptr;
5748
5749 // If folding this instruction (and it's subexprs) didn't extend any live
5750 // ranges, we're ok with it.
5751 if (!BaseReg && !ScaledReg)
5752 return true;
5753
5754 // If all uses of this instruction can have the address mode sunk into them,
5755 // we can remove the addressing mode and effectively trade one live register
5756 // for another (at worst.) In this context, folding an addressing mode into
5757 // the use is just a particularly nice way of sinking it.
5758 SmallVector<std::pair<Use *, Type *>, 16> MemoryUses;
5759 if (FindAllMemoryUses(I, MemoryUses, TLI, TRI, OptSize, PSI, BFI))
5760 return false; // Has a non-memory, non-foldable use!
5761
5762 // Now that we know that all uses of this instruction are part of a chain of
5763 // computation involving only operations that could theoretically be folded
5764 // into a memory use, loop over each of these memory operation uses and see
5765 // if they could *actually* fold the instruction. The assumption is that
5766 // addressing modes are cheap and that duplicating the computation involved
5767 // many times is worthwhile, even on a fastpath. For sinking candidates
5768 // (i.e. cold call sites), this serves as a way to prevent excessive code
5769 // growth since most architectures have some reasonable small and fast way to
5770 // compute an effective address. (i.e LEA on x86)
5771 SmallVector<Instruction *, 32> MatchedAddrModeInsts;
5772 for (const std::pair<Use *, Type *> &Pair : MemoryUses) {
5773 Value *Address = Pair.first->get();
5774 Instruction *UserI = cast<Instruction>(Val: Pair.first->getUser());
5775 Type *AddressAccessTy = Pair.second;
5776 unsigned AS = Address->getType()->getPointerAddressSpace();
5777
5778 // Do a match against the root of this address, ignoring profitability. This
5779 // will tell us if the addressing mode for the memory operation will
5780 // *actually* cover the shared instruction.
5781 ExtAddrMode Result;
5782 std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
5783 0);
5784 TypePromotionTransaction::ConstRestorationPt LastKnownGood =
5785 TPT.getRestorationPoint();
5786 AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, LI, getDTFn,
5787 AddressAccessTy, AS, UserI, Result,
5788 InsertedInsts, PromotedInsts, TPT,
5789 LargeOffsetGEP, OptSize, PSI, BFI);
5790 Matcher.IgnoreProfitability = true;
5791 bool Success = Matcher.matchAddr(Addr: Address, Depth: 0);
5792 (void)Success;
5793 assert(Success && "Couldn't select *anything*?");
5794
5795 // The match was to check the profitability, the changes made are not
5796 // part of the original matcher. Therefore, they should be dropped
5797 // otherwise the original matcher will not present the right state.
5798 TPT.rollback(Point: LastKnownGood);
5799
5800 // If the match didn't cover I, then it won't be shared by it.
5801 if (!is_contained(Range&: MatchedAddrModeInsts, Element: I))
5802 return false;
5803
5804 MatchedAddrModeInsts.clear();
5805 }
5806
5807 return true;
5808}
5809
5810/// Return true if the specified values are defined in a
5811/// different basic block than BB.
5812static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
5813 if (Instruction *I = dyn_cast<Instruction>(Val: V))
5814 return I->getParent() != BB;
5815 return false;
5816}
5817
5818// Find an insert position of Addr for MemoryInst. We can't guarantee MemoryInst
5819// is the first instruction that will use Addr. So we need to find the first
5820// user of Addr in current BB.
5821static BasicBlock::iterator findInsertPos(Value *Addr, Instruction *MemoryInst,
5822 Value *SunkAddr) {
5823 if (Addr->hasOneUse())
5824 return MemoryInst->getIterator();
5825
5826 // We already have a SunkAddr in current BB, but we may need to insert cast
5827 // instruction after it.
5828 if (SunkAddr) {
5829 if (Instruction *AddrInst = dyn_cast<Instruction>(Val: SunkAddr))
5830 return std::next(x: AddrInst->getIterator());
5831 }
5832
5833 // Find the first user of Addr in current BB.
5834 Instruction *Earliest = MemoryInst;
5835 for (User *U : Addr->users()) {
5836 Instruction *UserInst = dyn_cast<Instruction>(Val: U);
5837 if (UserInst && UserInst->getParent() == MemoryInst->getParent()) {
5838 if (isa<PHINode>(Val: UserInst) || UserInst->isDebugOrPseudoInst())
5839 continue;
5840 if (UserInst->comesBefore(Other: Earliest))
5841 Earliest = UserInst;
5842 }
5843 }
5844 return Earliest->getIterator();
5845}
5846
5847/// Sink addressing mode computation immediate before MemoryInst if doing so
5848/// can be done without increasing register pressure. The need for the
5849/// register pressure constraint means this can end up being an all or nothing
5850/// decision for all uses of the same addressing computation.
5851///
5852/// Load and Store Instructions often have addressing modes that can do
5853/// significant amounts of computation. As such, instruction selection will try
5854/// to get the load or store to do as much computation as possible for the
5855/// program. The problem is that isel can only see within a single block. As
5856/// such, we sink as much legal addressing mode work into the block as possible.
5857///
5858/// This method is used to optimize both load/store and inline asms with memory
5859/// operands. It's also used to sink addressing computations feeding into cold
5860/// call sites into their (cold) basic block.
5861///
5862/// The motivation for handling sinking into cold blocks is that doing so can
5863/// both enable other address mode sinking (by satisfying the register pressure
5864/// constraint above), and reduce register pressure globally (by removing the
5865/// addressing mode computation from the fast path entirely.).
5866bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
5867 Type *AccessTy, unsigned AddrSpace) {
5868 Value *Repl = Addr;
5869
5870 // Try to collapse single-value PHI nodes. This is necessary to undo
5871 // unprofitable PRE transformations.
5872 SmallVector<Value *, 8> worklist;
5873 SmallPtrSet<Value *, 16> Visited;
5874 worklist.push_back(Elt: Addr);
5875
5876 // Use a worklist to iteratively look through PHI and select nodes, and
5877 // ensure that the addressing mode obtained from the non-PHI/select roots of
5878 // the graph are compatible.
5879 bool PhiOrSelectSeen = false;
5880 SmallVector<Instruction *, 16> AddrModeInsts;
5881 AddressingModeCombiner AddrModes(*DL, Addr);
5882 TypePromotionTransaction TPT(RemovedInsts);
5883 TypePromotionTransaction::ConstRestorationPt LastKnownGood =
5884 TPT.getRestorationPoint();
5885 while (!worklist.empty()) {
5886 Value *V = worklist.pop_back_val();
5887
5888 // We allow traversing cyclic Phi nodes.
5889 // In case of success after this loop we ensure that traversing through
5890 // Phi nodes ends up with all cases to compute address of the form
5891 // BaseGV + Base + Scale * Index + Offset
5892 // where Scale and Offset are constans and BaseGV, Base and Index
5893 // are exactly the same Values in all cases.
5894 // It means that BaseGV, Scale and Offset dominate our memory instruction
5895 // and have the same value as they had in address computation represented
5896 // as Phi. So we can safely sink address computation to memory instruction.
5897 if (!Visited.insert(Ptr: V).second)
5898 continue;
5899
5900 // For a PHI node, push all of its incoming values.
5901 if (PHINode *P = dyn_cast<PHINode>(Val: V)) {
5902 append_range(C&: worklist, R: P->incoming_values());
5903 PhiOrSelectSeen = true;
5904 continue;
5905 }
5906 // Similar for select.
5907 if (SelectInst *SI = dyn_cast<SelectInst>(Val: V)) {
5908 worklist.push_back(Elt: SI->getFalseValue());
5909 worklist.push_back(Elt: SI->getTrueValue());
5910 PhiOrSelectSeen = true;
5911 continue;
5912 }
5913
5914 // For non-PHIs, determine the addressing mode being computed. Note that
5915 // the result may differ depending on what other uses our candidate
5916 // addressing instructions might have.
5917 AddrModeInsts.clear();
5918 std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
5919 0);
5920 // Defer the query (and possible computation of) the dom tree to point of
5921 // actual use. It's expected that most address matches don't actually need
5922 // the domtree.
5923 auto getDTFn = [this]() -> const DominatorTree & { return getDT(); };
5924 ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
5925 V, AccessTy, AS: AddrSpace, MemoryInst, AddrModeInsts, TLI: *TLI, LI: *LI, getDTFn,
5926 TRI: *TRI, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
5927 BFI);
5928
5929 GetElementPtrInst *GEP = LargeOffsetGEP.first;
5930 if (GEP && !NewGEPBases.count(V: GEP)) {
5931 // If splitting the underlying data structure can reduce the offset of a
5932 // GEP, collect the GEP. Skip the GEPs that are the new bases of
5933 // previously split data structures.
5934 LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(Elt: LargeOffsetGEP);
5935 LargeOffsetGEPID.insert(KV: std::make_pair(x&: GEP, y: LargeOffsetGEPID.size()));
5936 }
5937
5938 NewAddrMode.OriginalValue = V;
5939 if (!AddrModes.addNewAddrMode(NewAddrMode))
5940 break;
5941 }
5942
5943 // Try to combine the AddrModes we've collected. If we couldn't collect any,
5944 // or we have multiple but either couldn't combine them or combining them
5945 // wouldn't do anything useful, bail out now.
5946 if (!AddrModes.combineAddrModes()) {
5947 TPT.rollback(Point: LastKnownGood);
5948 return false;
5949 }
5950 bool Modified = TPT.commit();
5951
5952 // Get the combined AddrMode (or the only AddrMode, if we only had one).
5953 ExtAddrMode AddrMode = AddrModes.getAddrMode();
5954
5955 // If all the instructions matched are already in this BB, don't do anything.
5956 // If we saw a Phi node then it is not local definitely, and if we saw a
5957 // select then we want to push the address calculation past it even if it's
5958 // already in this BB.
5959 if (!PhiOrSelectSeen && none_of(Range&: AddrModeInsts, P: [&](Value *V) {
5960 return IsNonLocalValue(V, BB: MemoryInst->getParent());
5961 })) {
5962 LLVM_DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode
5963 << "\n");
5964 return Modified;
5965 }
5966
5967 // Now that we determined the addressing expression we want to use and know
5968 // that we have to sink it into this block. Check to see if we have already
5969 // done this for some other load/store instr in this block. If so, reuse
5970 // the computation. Before attempting reuse, check if the address is valid
5971 // as it may have been erased.
5972
5973 WeakTrackingVH SunkAddrVH = SunkAddrs[Addr];
5974
5975 Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
5976 Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
5977
5978 // The current BB may be optimized multiple times, we can't guarantee the
5979 // reuse of Addr happens later, call findInsertPos to find an appropriate
5980 // insert position.
5981 auto InsertPos = findInsertPos(Addr, MemoryInst, SunkAddr);
5982
5983 // TODO: Adjust insert point considering (Base|Scaled)Reg if possible.
5984 if (!SunkAddr) {
5985 auto &DT = getDT();
5986 if ((AddrMode.BaseReg && !DT.dominates(Def: AddrMode.BaseReg, User: &*InsertPos)) ||
5987 (AddrMode.ScaledReg && !DT.dominates(Def: AddrMode.ScaledReg, User: &*InsertPos)))
5988 return Modified;
5989 }
5990
5991 IRBuilder<> Builder(MemoryInst->getParent(), InsertPos);
5992
5993 if (SunkAddr) {
5994 LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode
5995 << " for " << *MemoryInst << "\n");
5996 if (SunkAddr->getType() != Addr->getType()) {
5997 if (SunkAddr->getType()->getPointerAddressSpace() !=
5998 Addr->getType()->getPointerAddressSpace() &&
5999 !DL->isNonIntegralPointerType(Ty: Addr->getType())) {
6000 // There are two reasons the address spaces might not match: a no-op
6001 // addrspacecast, or a ptrtoint/inttoptr pair. Either way, we emit a
6002 // ptrtoint/inttoptr pair to ensure we match the original semantics.
6003 // TODO: allow bitcast between different address space pointers with the
6004 // same size.
6005 SunkAddr = Builder.CreatePtrToInt(V: SunkAddr, DestTy: IntPtrTy, Name: "sunkaddr");
6006 SunkAddr =
6007 Builder.CreateIntToPtr(V: SunkAddr, DestTy: Addr->getType(), Name: "sunkaddr");
6008 } else
6009 SunkAddr = Builder.CreatePointerCast(V: SunkAddr, DestTy: Addr->getType());
6010 }
6011 } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
6012 SubtargetInfo->addrSinkUsingGEPs())) {
6013 // By default, we use the GEP-based method when AA is used later. This
6014 // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
6015 LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
6016 << " for " << *MemoryInst << "\n");
6017 Value *ResultPtr = nullptr, *ResultIndex = nullptr;
6018
6019 // First, find the pointer.
6020 if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {
6021 ResultPtr = AddrMode.BaseReg;
6022 AddrMode.BaseReg = nullptr;
6023 }
6024
6025 if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {
6026 // We can't add more than one pointer together, nor can we scale a
6027 // pointer (both of which seem meaningless).
6028 if (ResultPtr || AddrMode.Scale != 1)
6029 return Modified;
6030
6031 ResultPtr = AddrMode.ScaledReg;
6032 AddrMode.Scale = 0;
6033 }
6034
6035 // It is only safe to sign extend the BaseReg if we know that the math
6036 // required to create it did not overflow before we extend it. Since
6037 // the original IR value was tossed in favor of a constant back when
6038 // the AddrMode was created we need to bail out gracefully if widths
6039 // do not match instead of extending it.
6040 //
6041 // (See below for code to add the scale.)
6042 if (AddrMode.Scale) {
6043 Type *ScaledRegTy = AddrMode.ScaledReg->getType();
6044 if (cast<IntegerType>(Val: IntPtrTy)->getBitWidth() >
6045 cast<IntegerType>(Val: ScaledRegTy)->getBitWidth())
6046 return Modified;
6047 }
6048
6049 GlobalValue *BaseGV = AddrMode.BaseGV;
6050 if (BaseGV != nullptr) {
6051 if (ResultPtr)
6052 return Modified;
6053
6054 if (BaseGV->isThreadLocal()) {
6055 ResultPtr = Builder.CreateThreadLocalAddress(Ptr: BaseGV);
6056 } else {
6057 ResultPtr = BaseGV;
6058 }
6059 }
6060
6061 // If the real base value actually came from an inttoptr, then the matcher
6062 // will look through it and provide only the integer value. In that case,
6063 // use it here.
6064 if (!DL->isNonIntegralPointerType(Ty: Addr->getType())) {
6065 if (!ResultPtr && AddrMode.BaseReg) {
6066 ResultPtr = Builder.CreateIntToPtr(V: AddrMode.BaseReg, DestTy: Addr->getType(),
6067 Name: "sunkaddr");
6068 AddrMode.BaseReg = nullptr;
6069 } else if (!ResultPtr && AddrMode.Scale == 1) {
6070 ResultPtr = Builder.CreateIntToPtr(V: AddrMode.ScaledReg, DestTy: Addr->getType(),
6071 Name: "sunkaddr");
6072 AddrMode.Scale = 0;
6073 }
6074 }
6075
6076 if (!ResultPtr && !AddrMode.BaseReg && !AddrMode.Scale &&
6077 !AddrMode.BaseOffs) {
6078 SunkAddr = Constant::getNullValue(Ty: Addr->getType());
6079 } else if (!ResultPtr) {
6080 return Modified;
6081 } else {
6082 Type *I8PtrTy =
6083 Builder.getPtrTy(AddrSpace: Addr->getType()->getPointerAddressSpace());
6084
6085 // Start with the base register. Do this first so that subsequent address
6086 // matching finds it last, which will prevent it from trying to match it
6087 // as the scaled value in case it happens to be a mul. That would be
6088 // problematic if we've sunk a different mul for the scale, because then
6089 // we'd end up sinking both muls.
6090 if (AddrMode.BaseReg) {
6091 Value *V = AddrMode.BaseReg;
6092 if (V->getType() != IntPtrTy)
6093 V = Builder.CreateIntCast(V, DestTy: IntPtrTy, /*isSigned=*/true, Name: "sunkaddr");
6094
6095 ResultIndex = V;
6096 }
6097
6098 // Add the scale value.
6099 if (AddrMode.Scale) {
6100 Value *V = AddrMode.ScaledReg;
6101 if (V->getType() == IntPtrTy) {
6102 // done.
6103 } else {
6104 assert(cast<IntegerType>(IntPtrTy)->getBitWidth() <
6105 cast<IntegerType>(V->getType())->getBitWidth() &&
6106 "We can't transform if ScaledReg is too narrow");
6107 V = Builder.CreateTrunc(V, DestTy: IntPtrTy, Name: "sunkaddr");
6108 }
6109
6110 if (AddrMode.Scale != 1)
6111 V = Builder.CreateMul(
6112 LHS: V, RHS: ConstantInt::getSigned(Ty: IntPtrTy, V: AddrMode.Scale), Name: "sunkaddr");
6113 if (ResultIndex)
6114 ResultIndex = Builder.CreateAdd(LHS: ResultIndex, RHS: V, Name: "sunkaddr");
6115 else
6116 ResultIndex = V;
6117 }
6118
6119 // Add in the Base Offset if present.
6120 if (AddrMode.BaseOffs) {
6121 Value *V = ConstantInt::getSigned(Ty: IntPtrTy, V: AddrMode.BaseOffs);
6122 if (ResultIndex) {
6123 // We need to add this separately from the scale above to help with
6124 // SDAG consecutive load/store merging.
6125 if (ResultPtr->getType() != I8PtrTy)
6126 ResultPtr = Builder.CreatePointerCast(V: ResultPtr, DestTy: I8PtrTy);
6127 ResultPtr = Builder.CreatePtrAdd(Ptr: ResultPtr, Offset: ResultIndex, Name: "sunkaddr",
6128 NW: AddrMode.InBounds);
6129 }
6130
6131 ResultIndex = V;
6132 }
6133
6134 if (!ResultIndex) {
6135 auto PtrInst = dyn_cast<Instruction>(Val: ResultPtr);
6136 // We know that we have a pointer without any offsets. If this pointer
6137 // originates from a different basic block than the current one, we
6138 // must be able to recreate it in the current basic block.
6139 // We do not support the recreation of any instructions yet.
6140 if (PtrInst && PtrInst->getParent() != MemoryInst->getParent())
6141 return Modified;
6142 SunkAddr = ResultPtr;
6143 } else {
6144 if (ResultPtr->getType() != I8PtrTy)
6145 ResultPtr = Builder.CreatePointerCast(V: ResultPtr, DestTy: I8PtrTy);
6146 SunkAddr = Builder.CreatePtrAdd(Ptr: ResultPtr, Offset: ResultIndex, Name: "sunkaddr",
6147 NW: AddrMode.InBounds);
6148 }
6149
6150 if (SunkAddr->getType() != Addr->getType()) {
6151 if (SunkAddr->getType()->getPointerAddressSpace() !=
6152 Addr->getType()->getPointerAddressSpace() &&
6153 !DL->isNonIntegralPointerType(Ty: Addr->getType())) {
6154 // There are two reasons the address spaces might not match: a no-op
6155 // addrspacecast, or a ptrtoint/inttoptr pair. Either way, we emit a
6156 // ptrtoint/inttoptr pair to ensure we match the original semantics.
6157 // TODO: allow bitcast between different address space pointers with
6158 // the same size.
6159 SunkAddr = Builder.CreatePtrToInt(V: SunkAddr, DestTy: IntPtrTy, Name: "sunkaddr");
6160 SunkAddr =
6161 Builder.CreateIntToPtr(V: SunkAddr, DestTy: Addr->getType(), Name: "sunkaddr");
6162 } else
6163 SunkAddr = Builder.CreatePointerCast(V: SunkAddr, DestTy: Addr->getType());
6164 }
6165 }
6166 } else {
6167 // We'd require a ptrtoint/inttoptr down the line, which we can't do for
6168 // non-integral pointers, so in that case bail out now.
6169 Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr;
6170 Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr;
6171 PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(Val: BaseTy);
6172 PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(Val: ScaleTy);
6173 if (DL->isNonIntegralPointerType(Ty: Addr->getType()) ||
6174 (BasePtrTy && DL->isNonIntegralPointerType(PT: BasePtrTy)) ||
6175 (ScalePtrTy && DL->isNonIntegralPointerType(PT: ScalePtrTy)) ||
6176 (AddrMode.BaseGV &&
6177 DL->isNonIntegralPointerType(PT: AddrMode.BaseGV->getType())))
6178 return Modified;
6179
6180 LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
6181 << " for " << *MemoryInst << "\n");
6182 Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
6183 Value *Result = nullptr;
6184
6185 // Start with the base register. Do this first so that subsequent address
6186 // matching finds it last, which will prevent it from trying to match it
6187 // as the scaled value in case it happens to be a mul. That would be
6188 // problematic if we've sunk a different mul for the scale, because then
6189 // we'd end up sinking both muls.
6190 if (AddrMode.BaseReg) {
6191 Value *V = AddrMode.BaseReg;
6192 if (V->getType()->isPointerTy())
6193 V = Builder.CreatePtrToInt(V, DestTy: IntPtrTy, Name: "sunkaddr");
6194 if (V->getType() != IntPtrTy)
6195 V = Builder.CreateIntCast(V, DestTy: IntPtrTy, /*isSigned=*/true, Name: "sunkaddr");
6196 Result = V;
6197 }
6198
6199 // Add the scale value.
6200 if (AddrMode.Scale) {
6201 Value *V = AddrMode.ScaledReg;
6202 if (V->getType() == IntPtrTy) {
6203 // done.
6204 } else if (V->getType()->isPointerTy()) {
6205 V = Builder.CreatePtrToInt(V, DestTy: IntPtrTy, Name: "sunkaddr");
6206 } else if (cast<IntegerType>(Val: IntPtrTy)->getBitWidth() <
6207 cast<IntegerType>(Val: V->getType())->getBitWidth()) {
6208 V = Builder.CreateTrunc(V, DestTy: IntPtrTy, Name: "sunkaddr");
6209 } else {
6210 // It is only safe to sign extend the BaseReg if we know that the math
6211 // required to create it did not overflow before we extend it. Since
6212 // the original IR value was tossed in favor of a constant back when
6213 // the AddrMode was created we need to bail out gracefully if widths
6214 // do not match instead of extending it.
6215 Instruction *I = dyn_cast_or_null<Instruction>(Val: Result);
6216 if (I && (Result != AddrMode.BaseReg))
6217 I->eraseFromParent();
6218 return Modified;
6219 }
6220 if (AddrMode.Scale != 1)
6221 V = Builder.CreateMul(
6222 LHS: V, RHS: ConstantInt::getSigned(Ty: IntPtrTy, V: AddrMode.Scale), Name: "sunkaddr");
6223 if (Result)
6224 Result = Builder.CreateAdd(LHS: Result, RHS: V, Name: "sunkaddr");
6225 else
6226 Result = V;
6227 }
6228
6229 // Add in the BaseGV if present.
6230 GlobalValue *BaseGV = AddrMode.BaseGV;
6231 if (BaseGV != nullptr) {
6232 Value *BaseGVPtr;
6233 if (BaseGV->isThreadLocal()) {
6234 BaseGVPtr = Builder.CreateThreadLocalAddress(Ptr: BaseGV);
6235 } else {
6236 BaseGVPtr = BaseGV;
6237 }
6238 Value *V = Builder.CreatePtrToInt(V: BaseGVPtr, DestTy: IntPtrTy, Name: "sunkaddr");
6239 if (Result)
6240 Result = Builder.CreateAdd(LHS: Result, RHS: V, Name: "sunkaddr");
6241 else
6242 Result = V;
6243 }
6244
6245 // Add in the Base Offset if present.
6246 if (AddrMode.BaseOffs) {
6247 Value *V = ConstantInt::getSigned(Ty: IntPtrTy, V: AddrMode.BaseOffs);
6248 if (Result)
6249 Result = Builder.CreateAdd(LHS: Result, RHS: V, Name: "sunkaddr");
6250 else
6251 Result = V;
6252 }
6253
6254 if (!Result)
6255 SunkAddr = Constant::getNullValue(Ty: Addr->getType());
6256 else
6257 SunkAddr = Builder.CreateIntToPtr(V: Result, DestTy: Addr->getType(), Name: "sunkaddr");
6258 }
6259
6260 MemoryInst->replaceUsesOfWith(From: Repl, To: SunkAddr);
6261 // Store the newly computed address into the cache. In the case we reused a
6262 // value, this should be idempotent.
6263 SunkAddrs[Addr] = WeakTrackingVH(SunkAddr);
6264
6265 // If we have no uses, recursively delete the value and all dead instructions
6266 // using it.
6267 if (Repl->use_empty()) {
6268 resetIteratorIfInvalidatedWhileCalling(BB: CurInstIterator->getParent(), f: [&]() {
6269 RecursivelyDeleteTriviallyDeadInstructions(
6270 V: Repl, TLI: TLInfo, MSSAU: nullptr,
6271 AboutToDeleteCallback: [&](Value *V) { removeAllAssertingVHReferences(V); });
6272 });
6273 }
6274 ++NumMemoryInsts;
6275 return true;
6276}
6277
6278/// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find
6279/// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can
6280/// only handle a 2 operand GEP in the same basic block or a splat constant
6281/// vector. The 2 operands to the GEP must have a scalar pointer and a vector
6282/// index.
6283///
6284/// If the existing GEP has a vector base pointer that is splat, we can look
6285/// through the splat to find the scalar pointer. If we can't find a scalar
6286/// pointer there's nothing we can do.
6287///
6288/// If we have a GEP with more than 2 indices where the middle indices are all
6289/// zeroes, we can replace it with 2 GEPs where the second has 2 operands.
6290///
6291/// If the final index isn't a vector or is a splat, we can emit a scalar GEP
6292/// followed by a GEP with an all zeroes vector index. This will enable
6293/// SelectionDAGBuilder to use the scalar GEP as the uniform base and have a
6294/// zero index.
6295bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
6296 Value *Ptr) {
6297 Value *NewAddr;
6298
6299 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr)) {
6300 // Don't optimize GEPs that don't have indices.
6301 if (!GEP->hasIndices())
6302 return false;
6303
6304 // If the GEP and the gather/scatter aren't in the same BB, don't optimize.
6305 // FIXME: We should support this by sinking the GEP.
6306 if (MemoryInst->getParent() != GEP->getParent())
6307 return false;
6308
6309 SmallVector<Value *, 2> Ops(GEP->operands());
6310
6311 bool RewriteGEP = false;
6312
6313 if (Ops[0]->getType()->isVectorTy()) {
6314 Ops[0] = getSplatValue(V: Ops[0]);
6315 if (!Ops[0])
6316 return false;
6317 RewriteGEP = true;
6318 }
6319
6320 unsigned FinalIndex = Ops.size() - 1;
6321
6322 // Ensure all but the last index is 0.
6323 // FIXME: This isn't strictly required. All that's required is that they are
6324 // all scalars or splats.
6325 for (unsigned i = 1; i < FinalIndex; ++i) {
6326 auto *C = dyn_cast<Constant>(Val: Ops[i]);
6327 if (!C)
6328 return false;
6329 if (isa<VectorType>(Val: C->getType()))
6330 C = C->getSplatValue();
6331 auto *CI = dyn_cast_or_null<ConstantInt>(Val: C);
6332 if (!CI || !CI->isZero())
6333 return false;
6334 // Scalarize the index if needed.
6335 Ops[i] = CI;
6336 }
6337
6338 // Try to scalarize the final index.
6339 if (Ops[FinalIndex]->getType()->isVectorTy()) {
6340 if (Value *V = getSplatValue(V: Ops[FinalIndex])) {
6341 auto *C = dyn_cast<ConstantInt>(Val: V);
6342 // Don't scalarize all zeros vector.
6343 if (!C || !C->isZero()) {
6344 Ops[FinalIndex] = V;
6345 RewriteGEP = true;
6346 }
6347 }
6348 }
6349
6350 // If we made any changes or the we have extra operands, we need to generate
6351 // new instructions.
6352 if (!RewriteGEP && Ops.size() == 2)
6353 return false;
6354
6355 auto NumElts = cast<VectorType>(Val: Ptr->getType())->getElementCount();
6356
6357 IRBuilder<> Builder(MemoryInst);
6358
6359 Type *SourceTy = GEP->getSourceElementType();
6360 Type *ScalarIndexTy = DL->getIndexType(PtrTy: Ops[0]->getType()->getScalarType());
6361
6362 // If the final index isn't a vector, emit a scalar GEP containing all ops
6363 // and a vector GEP with all zeroes final index.
6364 if (!Ops[FinalIndex]->getType()->isVectorTy()) {
6365 NewAddr = Builder.CreateGEP(Ty: SourceTy, Ptr: Ops[0], IdxList: ArrayRef(Ops).drop_front());
6366 auto *IndexTy = VectorType::get(ElementType: ScalarIndexTy, EC: NumElts);
6367 auto *SecondTy = GetElementPtrInst::getIndexedType(
6368 Ty: SourceTy, IdxList: ArrayRef(Ops).drop_front());
6369 NewAddr =
6370 Builder.CreateGEP(Ty: SecondTy, Ptr: NewAddr, IdxList: Constant::getNullValue(Ty: IndexTy));
6371 } else {
6372 Value *Base = Ops[0];
6373 Value *Index = Ops[FinalIndex];
6374
6375 // Create a scalar GEP if there are more than 2 operands.
6376 if (Ops.size() != 2) {
6377 // Replace the last index with 0.
6378 Ops[FinalIndex] =
6379 Constant::getNullValue(Ty: Ops[FinalIndex]->getType()->getScalarType());
6380 Base = Builder.CreateGEP(Ty: SourceTy, Ptr: Base, IdxList: ArrayRef(Ops).drop_front());
6381 SourceTy = GetElementPtrInst::getIndexedType(
6382 Ty: SourceTy, IdxList: ArrayRef(Ops).drop_front());
6383 }
6384
6385 // Now create the GEP with scalar pointer and vector index.
6386 NewAddr = Builder.CreateGEP(Ty: SourceTy, Ptr: Base, IdxList: Index);
6387 }
6388 } else if (!isa<Constant>(Val: Ptr)) {
6389 // Not a GEP, maybe its a splat and we can create a GEP to enable
6390 // SelectionDAGBuilder to use it as a uniform base.
6391 Value *V = getSplatValue(V: Ptr);
6392 if (!V)
6393 return false;
6394
6395 auto NumElts = cast<VectorType>(Val: Ptr->getType())->getElementCount();
6396
6397 IRBuilder<> Builder(MemoryInst);
6398
6399 // Emit a vector GEP with a scalar pointer and all 0s vector index.
6400 Type *ScalarIndexTy = DL->getIndexType(PtrTy: V->getType()->getScalarType());
6401 auto *IndexTy = VectorType::get(ElementType: ScalarIndexTy, EC: NumElts);
6402 Type *ScalarTy;
6403 if (cast<IntrinsicInst>(Val: MemoryInst)->getIntrinsicID() ==
6404 Intrinsic::masked_gather) {
6405 ScalarTy = MemoryInst->getType()->getScalarType();
6406 } else {
6407 assert(cast<IntrinsicInst>(MemoryInst)->getIntrinsicID() ==
6408 Intrinsic::masked_scatter);
6409 ScalarTy = MemoryInst->getOperand(i: 0)->getType()->getScalarType();
6410 }
6411 NewAddr = Builder.CreateGEP(Ty: ScalarTy, Ptr: V, IdxList: Constant::getNullValue(Ty: IndexTy));
6412 } else {
6413 // Constant, SelectionDAGBuilder knows to check if its a splat.
6414 return false;
6415 }
6416
6417 MemoryInst->replaceUsesOfWith(From: Ptr, To: NewAddr);
6418
6419 // If we have no uses, recursively delete the value and all dead instructions
6420 // using it.
6421 if (Ptr->use_empty())
6422 RecursivelyDeleteTriviallyDeadInstructions(
6423 V: Ptr, TLI: TLInfo, MSSAU: nullptr,
6424 AboutToDeleteCallback: [&](Value *V) { removeAllAssertingVHReferences(V); });
6425
6426 return true;
6427}
6428
6429// This is a helper for CodeGenPrepare::optimizeMulWithOverflow.
6430// Check the pattern we are interested in where there are maximum 2 uses
6431// of the intrinsic which are the extract instructions.
6432static bool matchOverflowPattern(Instruction *&I, ExtractValueInst *&MulExtract,
6433 ExtractValueInst *&OverflowExtract) {
6434 // Bail out if it's more than 2 users:
6435 if (I->hasNUsesOrMore(N: 3))
6436 return false;
6437
6438 for (User *U : I->users()) {
6439 auto *Extract = dyn_cast<ExtractValueInst>(Val: U);
6440 if (!Extract || Extract->getNumIndices() != 1)
6441 return false;
6442
6443 unsigned Index = Extract->getIndices()[0];
6444 if (Index == 0)
6445 MulExtract = Extract;
6446 else if (Index == 1)
6447 OverflowExtract = Extract;
6448 else
6449 return false;
6450 }
6451 return true;
6452}
6453
6454 // Rewrite the mul_with_overflow intrinsic by checking if both of the
6455 // operands' value ranges are within the legal type. If so, we can optimize the
6456 // multiplication algorithm. This code is supposed to be written during the step
6457 // of type legalization, but given that we need to reconstruct the IR which is
6458 // not doable there, we do it here.
6459 // The IR after the optimization will look like:
6460 // entry:
6461 // if signed:
6462 // ( (lhs_lo>>BW-1) ^ lhs_hi) || ( (rhs_lo>>BW-1) ^ rhs_hi) ? overflow,
6463 // overflow_no
6464 // else:
6465 // (lhs_hi != 0) || (rhs_hi != 0) ? overflow, overflow_no
6466 // overflow_no:
6467 // overflow:
6468 // overflow.res:
// where:
//   entry        splits both operands into half-width low/high parts and
//                branches on whether the high parts carry information.
//   overflow_no  multiplies the extended low halves; the overflow flag merged
//                from this path is the constant false.
//   overflow     holds the original intrinsic and extracts its two results.
//   overflow.res merges both paths with PHI nodes that replace the original
//                extractvalue users.
// \p I is the intrinsic being rewritten, \p IsSigned selects the signed or
// unsigned high-half check, and \p ModifiedDT is set to ModifyDT::ModifyBBDT
// when the rewrite is performed.
6469 // \returns true if optimization was applied
6470 // TODO: This optimization can be further improved to optimize branching on
6471 // overflow where the 'overflow_no' BB can branch directly to the false
6472 // successor of overflow, but that would add additional complexity so we leave
6473 // it for future work.
6474 bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned,
6475 ModifyDT &ModifiedDT) {
6476 // Check if target supports this optimization.
6477 if (!TLI->shouldOptimizeMulOverflowWithZeroHighBits(
6478 Context&: I->getContext(),
6479 VT: TLI->getValueType(DL: *DL, Ty: I->getType()->getContainedType(i: 0))))
6480 return false;
6481
// Only proceed when the intrinsic is used solely through extractvalue
// instructions; the extracts found are rewritten to PHI nodes below.
6482 ExtractValueInst *MulExtract = nullptr, *OverflowExtract = nullptr;
6483 if (!matchOverflowPattern(I, MulExtract, OverflowExtract))
6484 return false;
6485
6486 // Keep track of the instruction to stop reoptimizing it again.
6487 InsertedInsts.insert(Ptr: I);
6488
6489 Value *LHS = I->getOperand(i: 0);
6490 Value *RHS = I->getOperand(i: 1);
// The low/high parts are computed at half the scalar bit width of the
// operand type.
6491 Type *Ty = LHS->getType();
6492 unsigned VTHalfBitWidth = Ty->getScalarSizeInBits() / 2;
6493 Type *LegalTy = Ty->getWithNewBitWidth(NewBitWidth: VTHalfBitWidth);
6494
6495 // New BBs:
// Split right before I: the new block (which takes over the original block
// name) holds everything that preceded I, while I's block is later renamed to
// "overflow.res".
6496 BasicBlock *OverflowEntryBB =
6497 splitBlockBefore(Old: I->getParent(), SplitPt: I, DTU, LI, MSSAU: nullptr, BBName: "");
6498 OverflowEntryBB->takeName(V: I->getParent());
6499 // Keep the 'br' instruction that is generated as a result of the split to be
6500 // erased/replaced later.
6501 Instruction *OldTerminator = OverflowEntryBB->getTerminator();
6502 BasicBlock *NoOverflowBB =
6503 BasicBlock::Create(Context&: I->getContext(), Name: "overflow.no", Parent: I->getFunction());
6504 NoOverflowBB->moveAfter(MovePos: OverflowEntryBB);
6505 BasicBlock *OverflowBB =
6506 BasicBlock::Create(Context&: I->getContext(), Name: "overflow", Parent: I->getFunction());
6507 OverflowBB->moveAfter(MovePos: NoOverflowBB);
6508
6509 // BB overflow.entry:
6510 IRBuilder<> Builder(OverflowEntryBB);
6511 // Extract low and high halves of LHS:
6512 Value *LoLHS = Builder.CreateTrunc(V: LHS, DestTy: LegalTy, Name: "lo.lhs");
6513 Value *HiLHS = Builder.CreateLShr(LHS, RHS: VTHalfBitWidth, Name: "lhs.lsr");
6514 HiLHS = Builder.CreateTrunc(V: HiLHS, DestTy: LegalTy, Name: "hi.lhs");
6515
6516 // Extract low and high halves of RHS:
6517 Value *LoRHS = Builder.CreateTrunc(V: RHS, DestTy: LegalTy, Name: "lo.rhs");
6518 Value *HiRHS = Builder.CreateLShr(LHS: RHS, RHS: VTHalfBitWidth, Name: "rhs.lsr");
6519 HiRHS = Builder.CreateTrunc(V: HiRHS, DestTy: LegalTy, Name: "hi.rhs");
6520
6521 Value *IsAnyBitTrue;
6522 if (IsSigned) {
// Signed: compare each high half against the low half's sign bit replicated
// across the half width (ashr by half-width - 1). The XOR is zero only when
// they match; OR-ing both XORs gives a single value to test against zero.
6523 Value *SignLoLHS =
6524 Builder.CreateAShr(LHS: LoLHS, RHS: VTHalfBitWidth - 1, Name: "sign.lo.lhs");
6525 Value *SignLoRHS =
6526 Builder.CreateAShr(LHS: LoRHS, RHS: VTHalfBitWidth - 1, Name: "sign.lo.rhs");
6527 Value *XorLHS = Builder.CreateXor(LHS: HiLHS, RHS: SignLoLHS);
6528 Value *XorRHS = Builder.CreateXor(LHS: HiRHS, RHS: SignLoRHS);
6529 Value *Or = Builder.CreateOr(LHS: XorLHS, RHS: XorRHS, Name: "or.lhs.rhs");
6530 IsAnyBitTrue = Builder.CreateCmp(Pred: ICmpInst::ICMP_NE, LHS: Or,
6531 RHS: ConstantInt::getNullValue(Ty: Or->getType()));
6532 } else {
// Unsigned: take the overflow path if either high half is non-zero.
6533 Value *CmpLHS = Builder.CreateCmp(Pred: ICmpInst::ICMP_NE, LHS: HiLHS,
6534 RHS: ConstantInt::getNullValue(Ty: LegalTy));
6535 Value *CmpRHS = Builder.CreateCmp(Pred: ICmpInst::ICMP_NE, LHS: HiRHS,
6536 RHS: ConstantInt::getNullValue(Ty: LegalTy));
6537 IsAnyBitTrue = Builder.CreateOr(LHS: CmpLHS, RHS: CmpRHS, Name: "or.lhs.rhs");
6538 }
6539 Builder.CreateCondBr(Cond: IsAnyBitTrue, True: OverflowBB, False: NoOverflowBB);
6540
6541 // BB overflow.no:
6542 Builder.SetInsertPoint(NoOverflowBB);
// Re-extend the low halves to the full type (sign- or zero-extended to match
// IsSigned) and multiply them with a plain mul.
6543 Value *ExtLoLHS, *ExtLoRHS;
6544 if (IsSigned) {
6545 ExtLoLHS = Builder.CreateSExt(V: LoLHS, DestTy: Ty, Name: "lo.lhs.ext");
6546 ExtLoRHS = Builder.CreateSExt(V: LoRHS, DestTy: Ty, Name: "lo.rhs.ext");
6547 } else {
6548 ExtLoLHS = Builder.CreateZExt(V: LoLHS, DestTy: Ty, Name: "lo.lhs.ext");
6549 ExtLoRHS = Builder.CreateZExt(V: LoRHS, DestTy: Ty, Name: "lo.rhs.ext");
6550 }
6551
6552 Value *Mul = Builder.CreateMul(LHS: ExtLoLHS, RHS: ExtLoRHS, Name: "mul.overflow.no");
6553
6554 // Create the 'overflow.res' BB to merge the results of
6555 // the two paths:
6556 BasicBlock *OverflowResBB = I->getParent();
6557 OverflowResBB->setName("overflow.res");
6558
6559 // BB overflow.no: jump to overflow.res BB
6560 Builder.CreateBr(Dest: OverflowResBB);
6561 // Now we don't need the old terminator in overflow.entry BB, erase it:
6562 OldTerminator->eraseFromParent();
6563
6564 // BB overflow.res:
6565 Builder.SetInsertPoint(TheBB: OverflowResBB, IP: OverflowResBB->getFirstInsertionPt());
6566 // Create PHI nodes to merge results from no.overflow BB and overflow BB to
6567 // replace the extract instructions.
6568 PHINode *OverflowResPHI = Builder.CreatePHI(Ty, NumReservedValues: 2),
6569 *OverflowFlagPHI =
6570 Builder.CreatePHI(Ty: IntegerType::getInt1Ty(C&: I->getContext()), NumReservedValues: 2);
6571
6572 // Add the incoming values from no.overflow BB and later from overflow BB.
6573 OverflowResPHI->addIncoming(V: Mul, BB: NoOverflowBB);
6574 OverflowFlagPHI->addIncoming(V: ConstantInt::getFalse(Context&: I->getContext()),
6575 BB: NoOverflowBB);
6576
6577 // Replace all users of MulExtract and OverflowExtract to use the PHI nodes.
6578 if (MulExtract) {
6579 MulExtract->replaceAllUsesWith(V: OverflowResPHI);
6580 MulExtract->eraseFromParent();
6581 }
6582 if (OverflowExtract) {
6583 OverflowExtract->replaceAllUsesWith(V: OverflowFlagPHI);
6584 OverflowExtract->eraseFromParent();
6585 }
6586
6587 // Remove the intrinsic from parent (overflow.res BB) as it will be part of
6588 // overflow BB
6589 I->removeFromParent();
6590 // BB overflow:
6591 I->insertInto(ParentBB: OverflowBB, It: OverflowBB->end());
6592 Builder.SetInsertPoint(TheBB: OverflowBB, IP: OverflowBB->end());
6593 Value *MulOverflow = Builder.CreateExtractValue(Agg: I, Idxs: {0}, Name: "mul.overflow");
6594 Value *OverflowFlag = Builder.CreateExtractValue(Agg: I, Idxs: {1}, Name: "overflow.flag");
6595 Builder.CreateBr(Dest: OverflowResBB);
6596
6597 // Add The Extracted values to the PHINodes in the overflow.res BB.
6598 OverflowResPHI->addIncoming(V: MulOverflow, BB: OverflowBB);
6599 OverflowFlagPHI->addIncoming(V: OverflowFlag, BB: OverflowBB);
6600
// Report the CFG edits made above to the dominator tree updater: the four new
// edges through overflow / overflow.no, and the removal of the direct
// entry -> overflow.res edge whose branch was erased.
6601 DTU->applyUpdates(Updates: {{DominatorTree::Insert, OverflowEntryBB, OverflowBB},
6602 {DominatorTree::Insert, OverflowEntryBB, NoOverflowBB},
6603 {DominatorTree::Insert, NoOverflowBB, OverflowResBB},
6604 {DominatorTree::Delete, OverflowEntryBB, OverflowResBB},
6605 {DominatorTree::Insert, OverflowBB, OverflowResBB}});
6606
6607 ModifiedDT = ModifyDT::ModifyBBDT;
6608 return true;
6609 }
6610
6611/// If there are any memory operands, use OptimizeMemoryInst to sink their
6612/// address computing into the block when possible / profitable.
6613bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
6614 bool MadeChange = false;
6615
6616 const TargetRegisterInfo *TRI =
6617 TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();
6618 TargetLowering::AsmOperandInfoVector TargetConstraints =
6619 TLI->ParseConstraints(DL: *DL, TRI, Call: *CS);
6620 unsigned ArgNo = 0;
6621 for (TargetLowering::AsmOperandInfo &OpInfo : TargetConstraints) {
6622 // Compute the constraint code and ConstraintType to use.
6623 TLI->ComputeConstraintToUse(OpInfo, Op: SDValue());
6624
6625 // TODO: Also handle C_Address?
6626 if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
6627 OpInfo.isIndirect) {
6628 Value *OpVal = CS->getArgOperand(i: ArgNo++);
6629 MadeChange |= optimizeMemoryInst(MemoryInst: CS, Addr: OpVal, AccessTy: OpVal->getType(), AddrSpace: ~0u);
6630 } else if (OpInfo.Type == InlineAsm::isInput)
6631 ArgNo++;
6632 }
6633
6634 return MadeChange;
6635}
6636
6637/// Check if all the uses of \p Val are equivalent (or free) zero or
6638/// sign extensions.
6639static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
6640 assert(!Val->use_empty() && "Input must have at least one use");
6641 const Instruction *FirstUser = cast<Instruction>(Val: *Val->user_begin());
6642 bool IsSExt = isa<SExtInst>(Val: FirstUser);
6643 Type *ExtTy = FirstUser->getType();
6644 for (const User *U : Val->users()) {
6645 const Instruction *UI = cast<Instruction>(Val: U);
6646 if ((IsSExt && !isa<SExtInst>(Val: UI)) || (!IsSExt && !isa<ZExtInst>(Val: UI)))
6647 return false;
6648 Type *CurTy = UI->getType();
6649 // Same input and output types: Same instruction after CSE.
6650 if (CurTy == ExtTy)
6651 continue;
6652
6653 // If IsSExt is true, we are in this situation:
6654 // a = Val
6655 // b = sext ty1 a to ty2
6656 // c = sext ty1 a to ty3
6657 // Assuming ty2 is shorter than ty3, this could be turned into:
6658 // a = Val
6659 // b = sext ty1 a to ty2
6660 // c = sext ty2 b to ty3
6661 // However, the last sext is not free.
6662 if (IsSExt)
6663 return false;
6664
6665 // This is a ZExt, maybe this is free to extend from one type to another.
6666 // In that case, we would not account for a different use.
6667 Type *NarrowTy;
6668 Type *LargeTy;
6669 if (ExtTy->getScalarType()->getIntegerBitWidth() >
6670 CurTy->getScalarType()->getIntegerBitWidth()) {
6671 NarrowTy = CurTy;
6672 LargeTy = ExtTy;
6673 } else {
6674 NarrowTy = ExtTy;
6675 LargeTy = CurTy;
6676 }
6677
6678 if (!TLI.isZExtFree(FromTy: NarrowTy, ToTy: LargeTy))
6679 return false;
6680 }
6681 // All uses are the same or can be derived from one another for free.
6682 return true;
6683}
6684
6685 /// Try to speculatively promote extensions in \p Exts and continue
6686 /// promoting through newly promoted operands recursively as far as doing so is
6687 /// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts.
6688 /// When some promotion happened, \p TPT contains the proper state to revert
6689 /// them.
6690 ///
/// \p CreatedInstsCost is the instruction cost already accumulated by the
/// enclosing promotion steps; each recursive call receives the updated total.
///
6691 /// \return true if some promotion happened, false otherwise.
6692 bool CodeGenPrepare::tryToPromoteExts(
6693 TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,
6694 SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
6695 unsigned CreatedInstsCost) {
6696 bool Promoted = false;
6697
6698 // Iterate over all the extensions to try to promote them.
6699 for (auto *I : Exts) {
6700 // Early check if we directly have ext(load).
6701 if (isa<LoadInst>(Val: I->getOperand(i: 0))) {
6702 ProfitablyMovedExts.push_back(Elt: I);
6703 continue;
6704 }
6705
6706 // Check whether or not we want to do any promotion. The reason we have
6707 // this check inside the for loop is to catch the case where an extension
6708 // is directly fed by a load because in such case the extension can be moved
6709 // up without any promotion on its operands.
6710 if (!TLI->enableExtLdPromotion() || DisableExtLdPromotion)
6711 return false;
6712
6713 // Get the action to perform the promotion.
6714 TypePromotionHelper::Action TPH =
6715 TypePromotionHelper::getAction(Ext: I, InsertedInsts, TLI: *TLI, PromotedInsts);
6716 // Check if we can promote.
6717 if (!TPH) {
6718 // Save the current extension as we cannot move up through its operand.
6719 ProfitablyMovedExts.push_back(Elt: I);
6720 continue;
6721 }
6722
6723 // Save the current state.
6724 TypePromotionTransaction::ConstRestorationPt LastKnownGood =
6725 TPT.getRestorationPoint();
6726 SmallVector<Instruction *, 4> NewExts;
6727 unsigned NewCreatedInstsCost = 0;
// ExtCost is 1 when the target does not consider the extension free and 0
// when it does.
6728 unsigned ExtCost = !TLI->isExtFree(I);
6729 // Promote.
6730 Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
6731 &NewExts, nullptr, *TLI);
6732 assert(PromotedVal &&
6733 "TypePromotionHelper should have filtered out those cases");
6734
6735 // We would be able to merge only one extension in a load.
6736 // Therefore, if we have more than 1 new extension we heuristically
6737 // cut this search path, because it means we degrade the code quality.
6738 // With exactly 2, the transformation is neutral, because we will merge
6739 // one extension but leave one. However, we optimistically keep going,
6740 // because the new extension may be removed too. Also avoid replacing a
6741 // single free extension with multiple extensions, as this increases the
6742 // number of IR instructions while not providing any savings.
6743 long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
6744 // FIXME: It would be possible to propagate a negative value instead of
6745 // conservatively ceiling it to 0.
6746 TotalCreatedInstsCost =
6747 std::max(a: (long long)0, b: (TotalCreatedInstsCost - ExtCost));
6748 if (!StressExtLdPromotion &&
6749 (TotalCreatedInstsCost > 1 ||
6750 !isPromotedInstructionLegal(TLI: *TLI, DL: *DL, Val: PromotedVal) ||
6751 (ExtCost == 0 && NewExts.size() > 1))) {
6752 // This promotion is not profitable, rollback to the previous state, and
6753 // save the current extension in ProfitablyMovedExts as the latest
6754 // speculative promotion turned out to be unprofitable.
6755 TPT.rollback(Point: LastKnownGood);
6756 ProfitablyMovedExts.push_back(Elt: I);
6757 continue;
6758 }
6759 // Continue promoting NewExts as far as doing so is profitable.
6760 SmallVector<Instruction *, 2> NewlyMovedExts;
// The recursive return value is discarded; profitability is decided below
// from the extensions collected in NewlyMovedExts.
6761 (void)tryToPromoteExts(TPT, Exts: NewExts, ProfitablyMovedExts&: NewlyMovedExts, CreatedInstsCost: TotalCreatedInstsCost);
6762 bool NewPromoted = false;
6763 for (auto *ExtInst : NewlyMovedExts) {
6764 Instruction *MovedExt = cast<Instruction>(Val: ExtInst);
6765 Value *ExtOperand = MovedExt->getOperand(i: 0);
6766 // If we have reached to a load, we need this extra profitability check
6767 // as it could potentially be merged into an ext(load).
6768 if (isa<LoadInst>(Val: ExtOperand) &&
6769 !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
6770 (ExtOperand->hasOneUse() || hasSameExtUse(Val: ExtOperand, TLI: *TLI))))
6771 continue;
6772
6773 ProfitablyMovedExts.push_back(Elt: MovedExt);
6774 NewPromoted = true;
6775 }
6776
6777 // If none of speculative promotions for NewExts is profitable, rollback
6778 // and save the current extension (I) as the last profitable extension.
6779 if (!NewPromoted) {
6780 TPT.rollback(Point: LastKnownGood);
6781 ProfitablyMovedExts.push_back(Elt: I);
6782 continue;
6783 }
6784 // The promotion is profitable.
6785 Promoted = true;
6786 }
6787 return Promoted;
6788 }
6789
6790/// Merging redundant sexts when one is dominating the other.
6791bool CodeGenPrepare::mergeSExts(Function &F) {
6792 bool Changed = false;
6793 for (auto &Entry : ValToSExtendedUses) {
6794 SExts &Insts = Entry.second;
6795 SExts CurPts;
6796 for (Instruction *Inst : Insts) {
6797 if (RemovedInsts.count(Ptr: Inst) || !isa<SExtInst>(Val: Inst) ||
6798 Inst->getOperand(i: 0) != Entry.first)
6799 continue;
6800 bool inserted = false;
6801 for (auto &Pt : CurPts) {
6802 if (getDT().dominates(Def: Inst, User: Pt)) {
6803 replaceAllUsesWith(Old: Pt, New: Inst, FreshBBs, IsHuge: IsHugeFunc);
6804 RemovedInsts.insert(Ptr: Pt);
6805 Pt->removeFromParent();
6806 Pt = Inst;
6807 inserted = true;
6808 Changed = true;
6809 break;
6810 }
6811 if (!getDT().dominates(Def: Pt, User: Inst))
6812 // Give up if we need to merge in a common dominator as the
6813 // experiments show it is not profitable.
6814 continue;
6815 replaceAllUsesWith(Old: Inst, New: Pt, FreshBBs, IsHuge: IsHugeFunc);
6816 RemovedInsts.insert(Ptr: Inst);
6817 Inst->removeFromParent();
6818 inserted = true;
6819 Changed = true;
6820 break;
6821 }
6822 if (!inserted)
6823 CurPts.push_back(Elt: Inst);
6824 }
6825 }
6826 return Changed;
6827}
6828
6829 // Splitting large data structures so that the GEPs accessing them can have
6830 // smaller offsets so that they can be sunk to the same blocks as their users.
6831 // For example, a large struct starting from %base is split into two parts
6832 // where the second part starts from %new_base.
6833 //
6834 // Before:
6835 // BB0:
6836 // %base =
6837 //
6838 // BB1:
6839 // %gep0 = gep %base, off0
6840 // %gep1 = gep %base, off1
6841 // %gep2 = gep %base, off2
6842 //
6843 // BB2:
6844 // %load1 = load %gep0
6845 // %load2 = load %gep1
6846 // %load3 = load %gep2
6847 //
6848 // After:
6849 // BB0:
6850 // %base =
6851 // %new_base = gep %base, off0
6852 //
6853 // BB1:
6854 // %new_gep0 = %new_base
6855 // %new_gep1 = gep %new_base, off1 - off0
6856 // %new_gep2 = gep %new_base, off2 - off0
6857 //
6858 // BB2:
6859 // %load1 = load i32, i32* %new_gep0
6860 // %load2 = load i32, i32* %new_gep1
6861 // %load3 = load i32, i32* %new_gep2
6862 //
6863 // %new_gep1 and %new_gep2 can be sunk to BB2 now after the splitting because
6864 // their offsets are small enough to fit into the addressing mode.
//
// Works through every entry of LargeOffsetGEPMap; each rewritten GEP is
// removed from that entry's list and from LargeOffsetGEPID.
// \returns true if any GEP was rewritten.
6865 bool CodeGenPrepare::splitLargeGEPOffsets() {
6866 bool Changed = false;
6867 for (auto &Entry : LargeOffsetGEPMap) {
6868 Value *OldBase = Entry.first;
6869 SmallVectorImpl<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
6870 &LargeOffsetGEPs = Entry.second;
// Ordering used for the sort below: primarily by offset, with ties broken by
// the ID stored in LargeOffsetGEPID. An entry never compares less than itself.
6871 auto compareGEPOffset =
6872 [&](const std::pair<GetElementPtrInst *, int64_t> &LHS,
6873 const std::pair<GetElementPtrInst *, int64_t> &RHS) {
6874 if (LHS.first == RHS.first)
6875 return false;
6876 if (LHS.second != RHS.second)
6877 return LHS.second < RHS.second;
6878 return LargeOffsetGEPID[LHS.first] < LargeOffsetGEPID[RHS.first];
6879 };
6880 // Sorting all the GEPs of the same data structures based on the offsets.
6881 llvm::sort(C&: LargeOffsetGEPs, Comp: compareGEPOffset);
6882 LargeOffsetGEPs.erase(CS: llvm::unique(R&: LargeOffsetGEPs), CE: LargeOffsetGEPs.end());
6883 // Skip if all the GEPs have the same offsets.
6884 if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second)
6885 continue;
6886 GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first;
6887 int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
6888 Value *NewBaseGEP = nullptr;
6889
// Sets NewBaseGEP to a ptradd of OldBase and BaseOffset (named "splitgep")
// and records it in NewGEPBases. Where the new base is inserted depends on
// what kind of value OldBase is; see the branches below.
6890 auto createNewBase = [&](int64_t BaseOffset, Value *OldBase,
6891 GetElementPtrInst *GEP) {
6892 LLVMContext &Ctx = GEP->getContext();
6893 Type *PtrIdxTy = DL->getIndexType(PtrTy: GEP->getType());
6894 Type *I8PtrTy =
6895 PointerType::get(C&: Ctx, AddressSpace: GEP->getType()->getPointerAddressSpace());
6896
6897 BasicBlock::iterator NewBaseInsertPt;
6898 BasicBlock *NewBaseInsertBB;
6899 if (auto *BaseI = dyn_cast<Instruction>(Val: OldBase)) {
6900 // If the base of the struct is an instruction, the new base will be
6901 // inserted close to it.
// PHI base: first insertion point of its block. Invoke base: a block created
// by splitting the edge to the invoke's normal destination. Any other
// instruction: immediately after it.
6902 NewBaseInsertBB = BaseI->getParent();
6903 if (isa<PHINode>(Val: BaseI))
6904 NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
6905 else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Val: BaseI)) {
6906 NewBaseInsertBB =
6907 SplitEdge(From: NewBaseInsertBB, To: Invoke->getNormalDest(), DT: &getDT(), LI);
6908 NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
6909 } else
6910 NewBaseInsertPt = std::next(x: BaseI->getIterator());
6911 } else {
6912 // If the current base is an argument or global value, the new base
6913 // will be inserted to the entry block.
// Note: the function is looked up through the captured BaseGEP, not through
// the GEP parameter.
6914 NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
6915 NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
6916 }
6917 IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
6918 // Create a new base.
6919 // TODO: Avoid implicit trunc?
6920 // See https://github.com/llvm/llvm-project/issues/112510.
6921 Value *BaseIndex =
6922 ConstantInt::getSigned(Ty: PtrIdxTy, V: BaseOffset, /*ImplicitTrunc=*/true);
6923 NewBaseGEP = OldBase;
6924 if (NewBaseGEP->getType() != I8PtrTy)
6925 NewBaseGEP = NewBaseBuilder.CreatePointerCast(V: NewBaseGEP, DestTy: I8PtrTy);
6926 NewBaseGEP =
6927 NewBaseBuilder.CreatePtrAdd(Ptr: NewBaseGEP, Offset: BaseIndex, Name: "splitgep");
6928 NewGEPBases.insert(V: NewBaseGEP);
6929 return;
6930 };
6931
6932 // Check whether all the offsets can be encoded with preferred common base.
6933 if (int64_t PreferBase = TLI->getPreferredLargeGEPBaseOffset(
6934 MinOffset: LargeOffsetGEPs.front().second, MaxOffset: LargeOffsetGEPs.back().second)) {
6935 BaseOffset = PreferBase;
6936 // Create a new base if the offset of the BaseGEP can be decoded with one
6937 // instruction.
6938 createNewBase(BaseOffset, OldBase, BaseGEP);
6939 }
6940
// Rewrite the GEPs in ascending offset order. Each processed entry is erased
// from the list, so the iterator advances through erase()'s return value.
6941 auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
6942 while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
6943 GetElementPtrInst *GEP = LargeOffsetGEP->first;
6944 int64_t Offset = LargeOffsetGEP->second;
6945 if (Offset != BaseOffset) {
6946 TargetLowering::AddrMode AddrMode;
6947 AddrMode.HasBaseReg = true;
6948 AddrMode.BaseOffs = Offset - BaseOffset;
6949 // The result type of the GEP might not be the type of the memory
6950 // access.
6951 if (!TLI->isLegalAddressingMode(DL: *DL, AM: AddrMode,
6952 Ty: GEP->getResultElementType(),
6953 AddrSpace: GEP->getAddressSpace())) {
6954 // We need to create a new base if the offset to the current base is
6955 // too large to fit into the addressing mode. So, a very large struct
6956 // may be split into several parts.
6957 BaseGEP = GEP;
6958 BaseOffset = Offset;
6959 NewBaseGEP = nullptr;
6960 }
6961 }
6962
6963 // Generate a new GEP to replace the current one.
6964 Type *PtrIdxTy = DL->getIndexType(PtrTy: GEP->getType());
6965
6966 if (!NewBaseGEP) {
6967 // Create a new base if we don't have one yet. Find the insertion
6968 // pointer for the new base first.
6969 createNewBase(BaseOffset, OldBase, GEP);
6970 }
6971
6972 IRBuilder<> Builder(GEP);
6973 Value *NewGEP = NewBaseGEP;
6974 if (Offset != BaseOffset) {
6975 // Calculate the new offset for the new GEP.
6976 Value *Index = ConstantInt::get(Ty: PtrIdxTy, V: Offset - BaseOffset);
6977 NewGEP = Builder.CreatePtrAdd(Ptr: NewBaseGEP, Offset: Index);
6978 }
// Redirect users to the rebased address, then drop the old GEP from
// LargeOffsetGEPID and from this list before erasing the instruction.
6979 replaceAllUsesWith(Old: GEP, New: NewGEP, FreshBBs, IsHuge: IsHugeFunc);
6980 LargeOffsetGEPID.erase(Val: GEP);
6981 LargeOffsetGEP = LargeOffsetGEPs.erase(CI: LargeOffsetGEP);
6982 GEP->eraseFromParent();
6983 Changed = true;
6984 }
6985 }
6986 return Changed;
6987 }
6988
/// Attempt to convert \p I and the PHI nodes connected to it to the type found
/// on the other side of the bitcasts that feed or consume them.
///
/// \p Visited accumulates every PHI examined here, including the replacement
/// PHIs this function creates. Instructions made redundant by the conversion
/// are added to \p DeletedInstrs; they are not erased in this function.
///
/// \return true if the PHIs were converted.
6989 bool CodeGenPrepare::optimizePhiType(
6990 PHINode *I, SmallPtrSetImpl<PHINode *> &Visited,
6991 SmallPtrSetImpl<Instruction *> &DeletedInstrs) {
6992 // We are looking for a collection of interconnected phi nodes that together
6993 // only use loads/bitcasts and are used by stores/bitcasts, and the bitcasts
6994 // are of the same type. Convert the whole set of nodes to the type of the
6995 // bitcast.
6996 Type *PhiTy = I->getType();
6997 Type *ConvertTy = nullptr;
6998 if (Visited.count(Ptr: I) ||
6999 (!I->getType()->isIntegerTy() && !I->getType()->isFloatingPointTy()))
7000 return false;
7001
7002 SmallVector<Instruction *, 4> Worklist;
7003 Worklist.push_back(Elt: cast<Instruction>(Val: I));
7004 SmallPtrSet<PHINode *, 4> PhiNodes;
7005 SmallPtrSet<ConstantData *, 4> Constants;
7006 PhiNodes.insert(Ptr: I);
7007 Visited.insert(Ptr: I);
// Defs collects the non-PHI instructions feeding the PHIs (loads,
// extractelements, bitcasts); Uses collects the stores and bitcasts that
// consume values from the set.
7008 SmallPtrSet<Instruction *, 4> Defs;
7009 SmallPtrSet<Instruction *, 4> Uses;
7010 // This works by adding extra bitcasts between load/stores and removing
7011 // existing bitcasts. If we have a phi(bitcast(load)) or a store(bitcast(phi))
7012 // we can get in the situation where we remove a bitcast in one iteration
7013 // just to add it again in the next. We need to ensure that at least one
7014 // bitcast we remove is anchored to something that will not change back.
7015 bool AnyAnchored = false;
7016
7017 while (!Worklist.empty()) {
7018 Instruction *II = Worklist.pop_back_val();
7019
7020 if (auto *Phi = dyn_cast<PHINode>(Val: II)) {
7021 // Handle Defs, which might also be PHI's
7022 for (Value *V : Phi->incoming_values()) {
7023 if (auto *OpPhi = dyn_cast<PHINode>(Val: V)) {
7024 if (!PhiNodes.count(Ptr: OpPhi)) {
7025 if (!Visited.insert(Ptr: OpPhi).second)
7026 return false;
7027 PhiNodes.insert(Ptr: OpPhi);
7028 Worklist.push_back(Elt: OpPhi);
7029 }
7030 } else if (auto *OpLoad = dyn_cast<LoadInst>(Val: V)) {
7031 if (!OpLoad->isSimple())
7032 return false;
7033 if (Defs.insert(Ptr: OpLoad).second)
7034 Worklist.push_back(Elt: OpLoad);
7035 } else if (auto *OpEx = dyn_cast<ExtractElementInst>(Val: V)) {
7036 if (Defs.insert(Ptr: OpEx).second)
7037 Worklist.push_back(Elt: OpEx);
7038 } else if (auto *OpBC = dyn_cast<BitCastInst>(Val: V)) {
// All bitcasts touching the set must agree on one source type; the first one
// seen fixes ConvertTy.
7039 if (!ConvertTy)
7040 ConvertTy = OpBC->getOperand(i_nocapture: 0)->getType();
7041 if (OpBC->getOperand(i_nocapture: 0)->getType() != ConvertTy)
7042 return false;
7043 if (Defs.insert(Ptr: OpBC).second) {
7044 Worklist.push_back(Elt: OpBC);
// A feeding bitcast counts as anchored when its operand is neither a load nor
// an extractelement.
7045 AnyAnchored |= !isa<LoadInst>(Val: OpBC->getOperand(i_nocapture: 0)) &&
7046 !isa<ExtractElementInst>(Val: OpBC->getOperand(i_nocapture: 0));
7047 }
7048 } else if (auto *OpC = dyn_cast<ConstantData>(Val: V))
7049 Constants.insert(Ptr: OpC);
7050 else
7051 return false;
7052 }
7053 }
7054
7055 // Handle uses which might also be phi's
7056 for (User *V : II->users()) {
7057 if (auto *OpPhi = dyn_cast<PHINode>(Val: V)) {
7058 if (!PhiNodes.count(Ptr: OpPhi)) {
7059 if (Visited.count(Ptr: OpPhi))
7060 return false;
7061 PhiNodes.insert(Ptr: OpPhi);
7062 Visited.insert(Ptr: OpPhi);
7063 Worklist.push_back(Elt: OpPhi);
7064 }
7065 } else if (auto *OpStore = dyn_cast<StoreInst>(Val: V)) {
// Only simple stores that use II as operand 0 are accepted.
7066 if (!OpStore->isSimple() || OpStore->getOperand(i_nocapture: 0) != II)
7067 return false;
7068 Uses.insert(Ptr: OpStore);
7069 } else if (auto *OpBC = dyn_cast<BitCastInst>(Val: V)) {
7070 if (!ConvertTy)
7071 ConvertTy = OpBC->getType();
7072 if (OpBC->getType() != ConvertTy)
7073 return false;
7074 Uses.insert(Ptr: OpBC);
// A consuming bitcast counts as anchored when it has at least one user that
// is not a store.
7075 AnyAnchored |=
7076 any_of(Range: OpBC->users(), P: [](User *U) { return !isa<StoreInst>(Val: U); });
7077 } else {
7078 return false;
7079 }
7080 }
7081 }
7082
// Give up when no bitcast was found, nothing is anchored, the type would not
// change, or the target does not want this conversion.
7083 if (!ConvertTy || !AnyAnchored || PhiTy == ConvertTy ||
7084 !TLI->shouldConvertPhiType(From: PhiTy, To: ConvertTy))
7085 return false;
7086
7087 LLVM_DEBUG(dbgs() << "Converting " << *I << "\n and connected nodes to "
7088 << *ConvertTy << "\n");
7089
7090 // Create all the new phi nodes of the new type, and bitcast any loads to the
7091 // correct type.
// ValMap maps each old value to its counterpart of type ConvertTy.
7092 ValueToValueMap ValMap;
7093 for (ConstantData *C : Constants)
7094 ValMap[C] = ConstantExpr::getBitCast(C, Ty: ConvertTy);
7095 for (Instruction *D : Defs) {
7096 if (isa<BitCastInst>(Val: D)) {
7097 ValMap[D] = D->getOperand(i: 0);
7098 DeletedInstrs.insert(Ptr: D);
7099 } else {
7100 BasicBlock::iterator insertPt = std::next(x: D->getIterator());
7101 ValMap[D] = new BitCastInst(D, ConvertTy, D->getName() + ".bc", insertPt);
7102 }
7103 }
7104 for (PHINode *Phi : PhiNodes)
7105 ValMap[Phi] = PHINode::Create(Ty: ConvertTy, NumReservedValues: Phi->getNumIncomingValues(),
7106 NameStr: Phi->getName() + ".tc", InsertBefore: Phi->getIterator());
7107 // Pipe together all the PhiNodes.
7108 for (PHINode *Phi : PhiNodes) {
7109 PHINode *NewPhi = cast<PHINode>(Val: ValMap[Phi]);
7110 for (int i = 0, e = Phi->getNumIncomingValues(); i < e; i++)
7111 NewPhi->addIncoming(V: ValMap[Phi->getIncomingValue(i)],
7112 BB: Phi->getIncomingBlock(i));
7113 Visited.insert(Ptr: NewPhi);
7114 }
7115 // And finally pipe up the stores and bitcasts
7116 for (Instruction *U : Uses) {
7117 if (isa<BitCastInst>(Val: U)) {
7118 DeletedInstrs.insert(Ptr: U);
7119 replaceAllUsesWith(Old: U, New: ValMap[U->getOperand(i: 0)], FreshBBs, IsHuge: IsHugeFunc);
7120 } else {
// Stores keep storing the original type, so the converted value is cast back
// to PhiTy right before the store.
7121 U->setOperand(i: 0, Val: new BitCastInst(ValMap[U->getOperand(i: 0)], PhiTy, "bc",
7122 U->getIterator()));
7123 }
7124 }
7125
7126 // Save the removed phis to be deleted later.
7127 DeletedInstrs.insert_range(R&: PhiNodes);
7128 return true;
7129 }
7130
7131bool CodeGenPrepare::optimizePhiTypes(Function &F) {
7132 if (!OptimizePhiTypes)
7133 return false;
7134
7135 bool Changed = false;
7136 SmallPtrSet<PHINode *, 4> Visited;
7137 SmallPtrSet<Instruction *, 4> DeletedInstrs;
7138
7139 // Attempt to optimize all the phis in the functions to the correct type.
7140 for (auto &BB : F)
7141 for (auto &Phi : BB.phis())
7142 Changed |= optimizePhiType(I: &Phi, Visited, DeletedInstrs);
7143
7144 // Remove any old phi's that have been converted.
7145 for (auto *I : DeletedInstrs) {
7146 replaceAllUsesWith(Old: I, New: PoisonValue::get(T: I->getType()), FreshBBs, IsHuge: IsHugeFunc);
7147 I->eraseFromParent();
7148 }
7149
7150 return Changed;
7151}
7152
7153/// Return true, if an ext(load) can be formed from an extension in
7154/// \p MovedExts.
7155bool CodeGenPrepare::canFormExtLd(
7156 const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
7157 Instruction *&Inst, bool HasPromoted) {
7158 for (auto *MovedExtInst : MovedExts) {
7159 if (isa<LoadInst>(Val: MovedExtInst->getOperand(i: 0))) {
7160 LI = cast<LoadInst>(Val: MovedExtInst->getOperand(i: 0));
7161 Inst = MovedExtInst;
7162 break;
7163 }
7164 }
7165 if (!LI)
7166 return false;
7167
7168 // If they're already in the same block, there's nothing to do.
7169 // Make the cheap checks first if we did not promote.
7170 // If we promoted, we need to check if it is indeed profitable.
7171 if (!HasPromoted && LI->getParent() == Inst->getParent())
7172 return false;
7173
7174 return TLI->isExtLoad(Load: LI, Ext: Inst, DL: *DL);
7175}
7176
7177/// Move a zext or sext fed by a load into the same basic block as the load,
7178/// unless conditions are unfavorable. This allows SelectionDAG to fold the
7179/// extend into the load.
7180///
7181/// E.g.,
7182/// \code
7183/// %ld = load i32* %addr
7184/// %add = add nuw i32 %ld, 4
7185/// %zext = zext i32 %add to i64
7186 /// \endcode
7187/// =>
7188/// \code
7189/// %ld = load i32* %addr
7190/// %zext = zext i32 %ld to i64
7191/// %add = add nuw i64 %zext, 4
7192 /// \endcode
7193/// Note that the promotion in %add to i64 is done in tryToPromoteExts(), which
7194/// allow us to match zext(load i32*) to i64.
7195///
7196/// Also, try to promote the computations used to obtain a sign extended
7197/// value used into memory accesses.
7198/// E.g.,
7199/// \code
7200/// a = add nsw i32 b, 3
7201/// d = sext i32 a to i64
7202/// e = getelementptr ..., i64 d
7203/// \endcode
7204/// =>
7205/// \code
7206/// f = sext i32 b to i64
7207/// a = add nsw i64 f, 3
7208/// e = getelementptr ..., i64 a
7209/// \endcode
7210///
7211/// \p Inst[in/out] the extension may be modified during the process if some
7212/// promotions apply.
7213bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
7214 bool AllowPromotionWithoutCommonHeader = false;
7215 /// See if it is an interesting sext operations for the address type
7216 /// promotion before trying to promote it, e.g., the ones with the right
7217 /// type and used in memory accesses.
7218 bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(
7219 I: *Inst, AllowPromotionWithoutCommonHeader);
7220 TypePromotionTransaction TPT(RemovedInsts);
7221 TypePromotionTransaction::ConstRestorationPt LastKnownGood =
7222 TPT.getRestorationPoint();
7223 SmallVector<Instruction *, 1> Exts;
7224 SmallVector<Instruction *, 2> SpeculativelyMovedExts;
7225 Exts.push_back(Elt: Inst);
7226
7227 bool HasPromoted = tryToPromoteExts(TPT, Exts, ProfitablyMovedExts&: SpeculativelyMovedExts);
7228
7229 // Look for a load being extended.
7230 LoadInst *LI = nullptr;
7231 Instruction *ExtFedByLoad;
7232
7233 // Try to promote a chain of computation if it allows to form an extended
7234 // load.
7235 if (canFormExtLd(MovedExts: SpeculativelyMovedExts, LI, Inst&: ExtFedByLoad, HasPromoted)) {
7236 assert(LI && ExtFedByLoad && "Expect a valid load and extension");
7237 TPT.commit();
7238 // Move the extend into the same block as the load.
7239 ExtFedByLoad->moveAfter(MovePos: LI);
7240 ++NumExtsMoved;
7241 Inst = ExtFedByLoad;
7242 return true;
7243 }
7244
7245 // Continue promoting SExts if known as considerable depending on targets.
7246 if (ATPConsiderable &&
7247 performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,
7248 HasPromoted, TPT, SpeculativelyMovedExts))
7249 return true;
7250
7251 TPT.rollback(Point: LastKnownGood);
7252 return false;
7253}
7254
7255// Perform address type promotion if doing so is profitable.
7256// If AllowPromotionWithoutCommonHeader == false, we should find other sext
7257// instructions that sign extended the same initial value. However, if
7258// AllowPromotionWithoutCommonHeader == true, we expect promoting the
7259// extension is just profitable.
7260bool CodeGenPrepare::performAddressTypePromotion(
7261 Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
7262 bool HasPromoted, TypePromotionTransaction &TPT,
7263 SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
7264 bool Promoted = false;
7265 SmallPtrSet<Instruction *, 1> UnhandledExts;
7266 bool AllSeenFirst = true;
7267 for (auto *I : SpeculativelyMovedExts) {
7268 Value *HeadOfChain = I->getOperand(i: 0);
7269 auto AlreadySeen = SeenChainsForSExt.find(Val: HeadOfChain);
7270 // If there is an unhandled SExt which has the same header, try to promote
7271 // it as well.
7272 if (AlreadySeen != SeenChainsForSExt.end()) {
7273 if (AlreadySeen->second != nullptr)
7274 UnhandledExts.insert(Ptr: AlreadySeen->second);
7275 AllSeenFirst = false;
7276 }
7277 }
7278
7279 if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
7280 SpeculativelyMovedExts.size() == 1)) {
7281 TPT.commit();
7282 if (HasPromoted)
7283 Promoted = true;
7284 for (auto *I : SpeculativelyMovedExts) {
7285 Value *HeadOfChain = I->getOperand(i: 0);
7286 SeenChainsForSExt[HeadOfChain] = nullptr;
7287 ValToSExtendedUses[HeadOfChain].push_back(Elt: I);
7288 }
7289 // Update Inst as promotion happen.
7290 Inst = SpeculativelyMovedExts.pop_back_val();
7291 } else {
7292 // This is the first chain visited from the header, keep the current chain
7293 // as unhandled. Defer to promote this until we encounter another SExt
7294 // chain derived from the same header.
7295 for (auto *I : SpeculativelyMovedExts) {
7296 Value *HeadOfChain = I->getOperand(i: 0);
7297 SeenChainsForSExt[HeadOfChain] = Inst;
7298 }
7299 return false;
7300 }
7301
7302 if (!AllSeenFirst && !UnhandledExts.empty())
7303 for (auto *VisitedSExt : UnhandledExts) {
7304 if (RemovedInsts.count(Ptr: VisitedSExt))
7305 continue;
7306 TypePromotionTransaction TPT(RemovedInsts);
7307 SmallVector<Instruction *, 1> Exts;
7308 SmallVector<Instruction *, 2> Chains;
7309 Exts.push_back(Elt: VisitedSExt);
7310 bool HasPromoted = tryToPromoteExts(TPT, Exts, ProfitablyMovedExts&: Chains);
7311 TPT.commit();
7312 if (HasPromoted)
7313 Promoted = true;
7314 for (auto *I : Chains) {
7315 Value *HeadOfChain = I->getOperand(i: 0);
7316 // Mark this as handled.
7317 SeenChainsForSExt[HeadOfChain] = nullptr;
7318 ValToSExtendedUses[HeadOfChain].push_back(Elt: I);
7319 }
7320 }
7321 return Promoted;
7322}
7323
7324bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
7325 BasicBlock *DefBB = I->getParent();
7326
7327 // If the result of a {s|z}ext and its source are both live out, rewrite all
7328 // other uses of the source with result of extension.
7329 Value *Src = I->getOperand(i: 0);
7330 if (Src->hasOneUse())
7331 return false;
7332
7333 // Only do this xform if truncating is free.
7334 if (!TLI->isTruncateFree(FromTy: I->getType(), ToTy: Src->getType()))
7335 return false;
7336
7337 // Only safe to perform the optimization if the source is also defined in
7338 // this block.
7339 if (!isa<Instruction>(Val: Src) || DefBB != cast<Instruction>(Val: Src)->getParent())
7340 return false;
7341
7342 bool DefIsLiveOut = false;
7343 for (User *U : I->users()) {
7344 Instruction *UI = cast<Instruction>(Val: U);
7345
7346 // Figure out which BB this ext is used in.
7347 BasicBlock *UserBB = UI->getParent();
7348 if (UserBB == DefBB)
7349 continue;
7350 DefIsLiveOut = true;
7351 break;
7352 }
7353 if (!DefIsLiveOut)
7354 return false;
7355
7356 // Make sure none of the uses are PHI nodes.
7357 for (User *U : Src->users()) {
7358 Instruction *UI = cast<Instruction>(Val: U);
7359 BasicBlock *UserBB = UI->getParent();
7360 if (UserBB == DefBB)
7361 continue;
7362 // Be conservative. We don't want this xform to end up introducing
7363 // reloads just before load / store instructions.
7364 if (isa<PHINode>(Val: UI) || isa<LoadInst>(Val: UI) || isa<StoreInst>(Val: UI))
7365 return false;
7366 }
7367
7368 // InsertedTruncs - Only insert one trunc in each block once.
7369 DenseMap<BasicBlock *, Instruction *> InsertedTruncs;
7370
7371 bool MadeChange = false;
7372 for (Use &U : Src->uses()) {
7373 Instruction *User = cast<Instruction>(Val: U.getUser());
7374
7375 // Figure out which BB this ext is used in.
7376 BasicBlock *UserBB = User->getParent();
7377 if (UserBB == DefBB)
7378 continue;
7379
7380 // Both src and def are live in this block. Rewrite the use.
7381 Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
7382
7383 if (!InsertedTrunc) {
7384 BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
7385 assert(InsertPt != UserBB->end());
7386 InsertedTrunc = new TruncInst(I, Src->getType(), "");
7387 InsertedTrunc->insertBefore(BB&: *UserBB, InsertPos: InsertPt);
7388 InsertedInsts.insert(Ptr: InsertedTrunc);
7389 }
7390
7391 // Replace a use of the {s|z}ext source with a use of the result.
7392 U = InsertedTrunc;
7393 ++NumExtUses;
7394 MadeChange = true;
7395 }
7396
7397 return MadeChange;
7398}
7399
7400// Find loads whose uses only use some of the loaded value's bits. Add an "and"
7401// just after the load if the target can fold this into one extload instruction,
7402// with the hope of eliminating some of the other later "and" instructions using
7403// the loaded value. "and"s that are made trivially redundant by the insertion
7404// of the new "and" are removed by this function, while others (e.g. those whose
7405// path from the load goes through a phi) are left for isel to potentially
7406// remove.
7407//
7408// For example:
7409//
7410// b0:
7411// x = load i32
7412// ...
7413// b1:
7414// y = and x, 0xff
7415// z = use y
7416//
7417// becomes:
7418//
7419// b0:
7420// x = load i32
7421// x' = and x, 0xff
7422// ...
7423// b1:
7424// z = use x'
7425//
7426// whereas:
7427//
7428// b0:
7429// x1 = load i32
7430// ...
7431// b1:
7432// x2 = load i32
7433// ...
7434// b2:
7435// x = phi x1, x2
7436// y = and x, 0xff
7437//
7438// becomes (after a call to optimizeLoadExt for each load):
7439//
7440// b0:
7441// x1 = load i32
7442// x1' = and x1, 0xff
7443// ...
7444// b1:
7445// x2 = load i32
7446// x2' = and x2, 0xff
7447// ...
7448// b2:
7449// x = phi x1', x2'
7450// y = and x, 0xff
7451bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
7452 if (!Load->isSimple() || !Load->getType()->isIntOrPtrTy())
7453 return false;
7454
7455 // Skip loads we've already transformed.
7456 if (Load->hasOneUse() &&
7457 InsertedInsts.count(Ptr: cast<Instruction>(Val: *Load->user_begin())))
7458 return false;
7459
7460 // Look at all uses of Load, looking through phis, to determine how many bits
7461 // of the loaded value are needed.
7462 SmallVector<Instruction *, 8> WorkList;
7463 SmallPtrSet<Instruction *, 16> Visited;
7464 SmallVector<Instruction *, 8> AndsToMaybeRemove;
7465 SmallVector<Instruction *, 8> DropFlags;
7466 for (auto *U : Load->users())
7467 WorkList.push_back(Elt: cast<Instruction>(Val: U));
7468
7469 EVT LoadResultVT = TLI->getValueType(DL: *DL, Ty: Load->getType());
7470 unsigned BitWidth = LoadResultVT.getSizeInBits();
7471 // If the BitWidth is 0, do not try to optimize the type
7472 if (BitWidth == 0)
7473 return false;
7474
7475 APInt DemandBits(BitWidth, 0);
7476 APInt WidestAndBits(BitWidth, 0);
7477
7478 while (!WorkList.empty()) {
7479 Instruction *I = WorkList.pop_back_val();
7480
7481 // Break use-def graph loops.
7482 if (!Visited.insert(Ptr: I).second)
7483 continue;
7484
7485 // For a PHI node, push all of its users.
7486 if (auto *Phi = dyn_cast<PHINode>(Val: I)) {
7487 for (auto *U : Phi->users())
7488 WorkList.push_back(Elt: cast<Instruction>(Val: U));
7489 continue;
7490 }
7491
7492 switch (I->getOpcode()) {
7493 case Instruction::And: {
7494 auto *AndC = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1));
7495 if (!AndC)
7496 return false;
7497 APInt AndBits = AndC->getValue();
7498 DemandBits |= AndBits;
7499 // Keep track of the widest and mask we see.
7500 if (AndBits.ugt(RHS: WidestAndBits))
7501 WidestAndBits = AndBits;
7502 if (AndBits == WidestAndBits && I->getOperand(i: 0) == Load)
7503 AndsToMaybeRemove.push_back(Elt: I);
7504 break;
7505 }
7506
7507 case Instruction::Shl: {
7508 auto *ShlC = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1));
7509 if (!ShlC)
7510 return false;
7511 uint64_t ShiftAmt = ShlC->getLimitedValue(Limit: BitWidth - 1);
7512 DemandBits.setLowBits(BitWidth - ShiftAmt);
7513 DropFlags.push_back(Elt: I);
7514 break;
7515 }
7516
7517 case Instruction::Trunc: {
7518 EVT TruncVT = TLI->getValueType(DL: *DL, Ty: I->getType());
7519 unsigned TruncBitWidth = TruncVT.getSizeInBits();
7520 DemandBits.setLowBits(TruncBitWidth);
7521 DropFlags.push_back(Elt: I);
7522 break;
7523 }
7524
7525 default:
7526 return false;
7527 }
7528 }
7529
7530 uint32_t ActiveBits = DemandBits.getActiveBits();
7531 // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
7532 // target even if isLoadLegal says an i1 EXTLOAD is valid. For example,
7533 // for the AArch64 target isLoadLegal(i32, i1, ..., ZEXTLOAD, false) returns
7534 // true, but (and (load x) 1) is not matched as a single instruction, rather
7535 // as a LDR followed by an AND.
7536 // TODO: Look into removing this restriction by fixing backends to either
7537 // return false for isLoadLegal for i1 or have them select this pattern to
7538 // a single instruction.
7539 //
7540 // Also avoid hoisting if we didn't see any ands with the exact DemandBits
7541 // mask, since these are the only ands that will be removed by isel.
7542 if (ActiveBits <= 1 || !DemandBits.isMask(numBits: ActiveBits) ||
7543 WidestAndBits != DemandBits)
7544 return false;
7545
7546 LLVMContext &Ctx = Load->getType()->getContext();
7547 Type *TruncTy = Type::getIntNTy(C&: Ctx, N: ActiveBits);
7548 EVT TruncVT = TLI->getValueType(DL: *DL, Ty: TruncTy);
7549
7550 // Reject cases that won't be matched as extloads.
7551 if (!LoadResultVT.bitsGT(VT: TruncVT) || !TruncVT.isRound() ||
7552 !TLI->isLoadLegal(ValVT: LoadResultVT, MemVT: TruncVT, Alignment: Load->getAlign(),
7553 AddrSpace: Load->getPointerAddressSpace(), ExtType: ISD::ZEXTLOAD, Atomic: false))
7554 return false;
7555
7556 IRBuilder<> Builder(Load->getNextNode());
7557 auto *NewAnd = cast<Instruction>(
7558 Val: Builder.CreateAnd(LHS: Load, RHS: ConstantInt::get(Context&: Ctx, V: DemandBits)));
7559 // Mark this instruction as "inserted by CGP", so that other
7560 // optimizations don't touch it.
7561 InsertedInsts.insert(Ptr: NewAnd);
7562
7563 // Replace all uses of load with new and (except for the use of load in the
7564 // new and itself).
7565 replaceAllUsesWith(Old: Load, New: NewAnd, FreshBBs, IsHuge: IsHugeFunc);
7566 NewAnd->setOperand(i: 0, Val: Load);
7567
7568 // Remove any and instructions that are now redundant.
7569 for (auto *And : AndsToMaybeRemove)
7570 // Check that the and mask is the same as the one we decided to put on the
7571 // new and.
7572 if (cast<ConstantInt>(Val: And->getOperand(i: 1))->getValue() == DemandBits) {
7573 replaceAllUsesWith(Old: And, New: NewAnd, FreshBBs, IsHuge: IsHugeFunc);
7574 if (&*CurInstIterator == And)
7575 CurInstIterator = std::next(x: And->getIterator());
7576 And->eraseFromParent();
7577 ++NumAndUses;
7578 }
7579
7580 // NSW flags may not longer hold.
7581 for (auto *Inst : DropFlags)
7582 Inst->setHasNoSignedWrap(false);
7583
7584 ++NumAndsAdded;
7585 return true;
7586}
7587
7588/// Check if V (an operand of a select instruction) is an expensive instruction
7589/// that is only used once.
7590static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
7591 auto *I = dyn_cast<Instruction>(Val: V);
7592 // If it's safe to speculatively execute, then it should not have side
7593 // effects; therefore, it's safe to sink and possibly *not* execute.
7594 return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
7595 TTI->isExpensiveToSpeculativelyExecute(I);
7596}
7597
7598/// Returns true if a SelectInst should be turned into an explicit branch.
7599static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
7600 const TargetLowering *TLI,
7601 SelectInst *SI) {
7602 // If even a predictable select is cheap, then a branch can't be cheaper.
7603 if (!TLI->isPredictableSelectExpensive())
7604 return false;
7605
7606 // FIXME: This should use the same heuristics as IfConversion to determine
7607 // whether a select is better represented as a branch.
7608
7609 // If metadata tells us that the select condition is obviously predictable,
7610 // then we want to replace the select with a branch.
7611 uint64_t TrueWeight, FalseWeight;
7612 if (extractBranchWeights(I: *SI, TrueVal&: TrueWeight, FalseVal&: FalseWeight)) {
7613 uint64_t Max = std::max(a: TrueWeight, b: FalseWeight);
7614 uint64_t Sum = TrueWeight + FalseWeight;
7615 if (Sum != 0) {
7616 auto Probability = BranchProbability::getBranchProbability(Numerator: Max, Denominator: Sum);
7617 if (Probability > TTI->getPredictableBranchThreshold())
7618 return true;
7619 }
7620 }
7621
7622 CmpInst *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition());
7623
7624 // If a branch is predictable, an out-of-order CPU can avoid blocking on its
7625 // comparison condition. If the compare has more than one use, there's
7626 // probably another cmov or setcc around, so it's not worth emitting a branch.
7627 if (!Cmp || !Cmp->hasOneUse())
7628 return false;
7629
7630 // If either operand of the select is expensive and only needed on one side
7631 // of the select, we should form a branch.
7632 if (sinkSelectOperand(TTI, V: SI->getTrueValue()) ||
7633 sinkSelectOperand(TTI, V: SI->getFalseValue()))
7634 return true;
7635
7636 return false;
7637}
7638
7639/// If \p isTrue is true, return the true value of \p SI, otherwise return
7640/// false value of \p SI. If the true/false value of \p SI is defined by any
7641/// select instructions in \p Selects, look through the defining select
7642/// instruction until the true/false value is not defined in \p Selects.
7643static Value *
7644getTrueOrFalseValue(SelectInst *SI, bool isTrue,
7645 const SmallPtrSet<const Instruction *, 2> &Selects) {
7646 Value *V = nullptr;
7647
7648 for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(Ptr: DefSI);
7649 DefSI = dyn_cast<SelectInst>(Val: V)) {
7650 assert(DefSI->getCondition() == SI->getCondition() &&
7651 "The condition of DefSI does not match with SI");
7652 V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
7653 }
7654
7655 assert(V && "Failed to get select true/false value");
7656 return V;
7657}
7658
7659bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) {
7660 assert(Shift->isShift() && "Expected a shift");
7661
7662 // If this is (1) a vector shift, (2) shifts by scalars are cheaper than
7663 // general vector shifts, and (3) the shift amount is a select-of-splatted
7664 // values, hoist the shifts before the select:
7665 // shift Op0, (select Cond, TVal, FVal) -->
7666 // select Cond, (shift Op0, TVal), (shift Op0, FVal)
7667 //
7668 // This is inverting a generic IR transform when we know that the cost of a
7669 // general vector shift is more than the cost of 2 shift-by-scalars.
7670 // We can't do this effectively in SDAG because we may not be able to
7671 // determine if the select operands are splats from within a basic block.
7672 Type *Ty = Shift->getType();
7673 if (!Ty->isVectorTy() || !TTI->isVectorShiftByScalarCheap(Ty))
7674 return false;
7675 Value *Cond, *TVal, *FVal;
7676 if (!match(V: Shift->getOperand(i_nocapture: 1),
7677 P: m_OneUse(SubPattern: m_Select(C: m_Value(V&: Cond), L: m_Value(V&: TVal), R: m_Value(V&: FVal)))))
7678 return false;
7679 if (!isSplatValue(V: TVal) || !isSplatValue(V: FVal))
7680 return false;
7681
7682 IRBuilder<> Builder(Shift);
7683 BinaryOperator::BinaryOps Opcode = Shift->getOpcode();
7684 Value *NewTVal = Builder.CreateBinOp(Opc: Opcode, LHS: Shift->getOperand(i_nocapture: 0), RHS: TVal);
7685 Value *NewFVal = Builder.CreateBinOp(Opc: Opcode, LHS: Shift->getOperand(i_nocapture: 0), RHS: FVal);
7686 Value *NewSel = Builder.CreateSelect(C: Cond, True: NewTVal, False: NewFVal);
7687 replaceAllUsesWith(Old: Shift, New: NewSel, FreshBBs, IsHuge: IsHugeFunc);
7688 Shift->eraseFromParent();
7689 return true;
7690}
7691
7692bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) {
7693 Intrinsic::ID Opcode = Fsh->getIntrinsicID();
7694 assert((Opcode == Intrinsic::fshl || Opcode == Intrinsic::fshr) &&
7695 "Expected a funnel shift");
7696
7697 // If this is (1) a vector funnel shift, (2) shifts by scalars are cheaper
7698 // than general vector shifts, and (3) the shift amount is select-of-splatted
7699 // values, hoist the funnel shifts before the select:
7700 // fsh Op0, Op1, (select Cond, TVal, FVal) -->
7701 // select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal)
7702 //
7703 // This is inverting a generic IR transform when we know that the cost of a
7704 // general vector shift is more than the cost of 2 shift-by-scalars.
7705 // We can't do this effectively in SDAG because we may not be able to
7706 // determine if the select operands are splats from within a basic block.
7707 Type *Ty = Fsh->getType();
7708 if (!Ty->isVectorTy() || !TTI->isVectorShiftByScalarCheap(Ty))
7709 return false;
7710 Value *Cond, *TVal, *FVal;
7711 if (!match(V: Fsh->getOperand(i_nocapture: 2),
7712 P: m_OneUse(SubPattern: m_Select(C: m_Value(V&: Cond), L: m_Value(V&: TVal), R: m_Value(V&: FVal)))))
7713 return false;
7714 if (!isSplatValue(V: TVal) || !isSplatValue(V: FVal))
7715 return false;
7716
7717 IRBuilder<> Builder(Fsh);
7718 Value *X = Fsh->getOperand(i_nocapture: 0), *Y = Fsh->getOperand(i_nocapture: 1);
7719 Value *NewTVal = Builder.CreateIntrinsic(ID: Opcode, OverloadTypes: Ty, Args: {X, Y, TVal});
7720 Value *NewFVal = Builder.CreateIntrinsic(ID: Opcode, OverloadTypes: Ty, Args: {X, Y, FVal});
7721 Value *NewSel = Builder.CreateSelect(C: Cond, True: NewTVal, False: NewFVal);
7722 replaceAllUsesWith(Old: Fsh, New: NewSel, FreshBBs, IsHuge: IsHugeFunc);
7723 Fsh->eraseFromParent();
7724 return true;
7725}
7726
7727/// If we have a SelectInst that will likely profit from branch prediction,
7728/// turn it into a branch.
7729bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
7730 if (DisableSelectToBranch)
7731 return false;
7732
7733 // If the SelectOptimize pass is enabled, selects have already been optimized.
7734 if (!getCGPassBuilderOption().DisableSelectOptimize)
7735 return false;
7736
7737 // Find all consecutive select instructions that share the same condition.
7738 SmallVector<SelectInst *, 2> ASI;
7739 ASI.push_back(Elt: SI);
7740 for (BasicBlock::iterator It = ++BasicBlock::iterator(SI);
7741 It != SI->getParent()->end(); ++It) {
7742 SelectInst *I = dyn_cast<SelectInst>(Val: &*It);
7743 if (I && SI->getCondition() == I->getCondition()) {
7744 ASI.push_back(Elt: I);
7745 } else {
7746 break;
7747 }
7748 }
7749
7750 SelectInst *LastSI = ASI.back();
7751 // Increment the current iterator to skip all the rest of select instructions
7752 // because they will be either "not lowered" or "all lowered" to branch.
7753 CurInstIterator = std::next(x: LastSI->getIterator());
7754 // Examine debug-info attached to the consecutive select instructions. They
7755 // won't be individually optimised by optimizeInst, so we need to perform
7756 // DbgVariableRecord maintenence here instead.
7757 for (SelectInst *SI : ArrayRef(ASI).drop_front())
7758 fixupDbgVariableRecordsOnInst(I&: *SI);
7759
7760 bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(BitWidth: 1);
7761
7762 // Can we convert the 'select' to CF ?
7763 if (VectorCond || SI->getMetadata(KindID: LLVMContext::MD_unpredictable))
7764 return false;
7765
7766 TargetLowering::SelectSupportKind SelectKind;
7767 if (SI->getType()->isVectorTy())
7768 SelectKind = TargetLowering::ScalarCondVectorVal;
7769 else
7770 SelectKind = TargetLowering::ScalarValSelect;
7771
7772 if (TLI->isSelectSupported(SelectKind) &&
7773 (!isFormingBranchFromSelectProfitable(TTI, TLI, SI) ||
7774 llvm::shouldOptimizeForSize(BB: SI->getParent(), PSI, BFI)))
7775 return false;
7776
7777 // Transform a sequence like this:
7778 // start:
7779 // %cmp = cmp uge i32 %a, %b
7780 // %sel = select i1 %cmp, i32 %c, i32 %d
7781 //
7782 // Into:
7783 // start:
7784 // %cmp = cmp uge i32 %a, %b
7785 // %cmp.frozen = freeze %cmp
7786 // br i1 %cmp.frozen, label %select.true, label %select.false
7787 // select.true:
7788 // br label %select.end
7789 // select.false:
7790 // br label %select.end
7791 // select.end:
7792 // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
7793 //
7794 // %cmp should be frozen, otherwise it may introduce undefined behavior.
7795 // In addition, we may sink instructions that produce %c or %d from
7796 // the entry block into the destination(s) of the new branch.
7797 // If the true or false blocks do not contain a sunken instruction, that
7798 // block and its branch may be optimized away. In that case, one side of the
7799 // first branch will point directly to select.end, and the corresponding PHI
7800 // predecessor block will be the start block.
7801 // The CFG is altered here and we update the DominatorTree and the LoopInfo,
7802 // but we don't set a ModifiedDT flag to avoid restarting the function walk in
7803 // runOnFunction for each select optimized.
7804
7805 // Collect values that go on the true side and the values that go on the false
7806 // side.
7807 SmallVector<Instruction *> TrueInstrs, FalseInstrs;
7808 for (SelectInst *SI : ASI) {
7809 if (Value *V = SI->getTrueValue(); sinkSelectOperand(TTI, V))
7810 TrueInstrs.push_back(Elt: cast<Instruction>(Val: V));
7811 if (Value *V = SI->getFalseValue(); sinkSelectOperand(TTI, V))
7812 FalseInstrs.push_back(Elt: cast<Instruction>(Val: V));
7813 }
7814
7815 // Split the select block, according to how many (if any) values go on each
7816 // side.
7817 BasicBlock *StartBlock = SI->getParent();
7818 BasicBlock::iterator SplitPt = std::next(x: BasicBlock::iterator(LastSI));
7819 // We should split before any debug-info.
7820 SplitPt.setHeadBit(true);
7821
7822 IRBuilder<> IB(SI);
7823 auto *CondFr = IB.CreateFreeze(V: SI->getCondition(), Name: SI->getName() + ".frozen");
7824
7825 BasicBlock *TrueBlock = nullptr;
7826 BasicBlock *FalseBlock = nullptr;
7827 BasicBlock *EndBlock = nullptr;
7828 UncondBrInst *TrueBranch = nullptr;
7829 UncondBrInst *FalseBranch = nullptr;
7830 if (TrueInstrs.size() == 0) {
7831 FalseBranch = cast<UncondBrInst>(
7832 Val: SplitBlockAndInsertIfElse(Cond: CondFr, SplitBefore: SplitPt, Unreachable: false, BranchWeights: nullptr, DTU, LI));
7833 FalseBlock = FalseBranch->getParent();
7834 EndBlock = cast<BasicBlock>(Val: FalseBranch->getOperand(i_nocapture: 0));
7835 } else if (FalseInstrs.size() == 0) {
7836 TrueBranch = cast<UncondBrInst>(
7837 Val: SplitBlockAndInsertIfThen(Cond: CondFr, SplitBefore: SplitPt, Unreachable: false, BranchWeights: nullptr, DTU, LI));
7838 TrueBlock = TrueBranch->getParent();
7839 EndBlock = TrueBranch->getSuccessor();
7840 } else {
7841 Instruction *ThenTerm = nullptr;
7842 Instruction *ElseTerm = nullptr;
7843 SplitBlockAndInsertIfThenElse(Cond: CondFr, SplitBefore: SplitPt, ThenTerm: &ThenTerm, ElseTerm: &ElseTerm,
7844 BranchWeights: nullptr, DTU, LI);
7845 TrueBranch = cast<UncondBrInst>(Val: ThenTerm);
7846 FalseBranch = cast<UncondBrInst>(Val: ElseTerm);
7847 TrueBlock = TrueBranch->getParent();
7848 FalseBlock = FalseBranch->getParent();
7849 EndBlock = TrueBranch->getSuccessor();
7850 }
7851
7852 EndBlock->setName("select.end");
7853 if (TrueBlock)
7854 TrueBlock->setName("select.true.sink");
7855 if (FalseBlock)
7856 FalseBlock->setName(FalseInstrs.size() == 0 ? "select.false"
7857 : "select.false.sink");
7858
7859 if (IsHugeFunc) {
7860 if (TrueBlock)
7861 FreshBBs.insert(Ptr: TrueBlock);
7862 if (FalseBlock)
7863 FreshBBs.insert(Ptr: FalseBlock);
7864 FreshBBs.insert(Ptr: EndBlock);
7865 }
7866
7867 BFI->setBlockFreq(BB: EndBlock, Freq: BFI->getBlockFreq(BB: StartBlock));
7868
7869 static const unsigned MD[] = {
7870 LLVMContext::MD_prof, LLVMContext::MD_unpredictable,
7871 LLVMContext::MD_make_implicit, LLVMContext::MD_dbg};
7872 StartBlock->getTerminator()->copyMetadata(SrcInst: *SI, WL: MD);
7873
7874 // Sink expensive instructions into the conditional blocks to avoid executing
7875 // them speculatively.
7876 for (Instruction *I : TrueInstrs)
7877 I->moveBefore(InsertPos: TrueBranch->getIterator());
7878 for (Instruction *I : FalseInstrs)
7879 I->moveBefore(InsertPos: FalseBranch->getIterator());
7880
7881 // If we did not create a new block for one of the 'true' or 'false' paths
7882 // of the condition, it means that side of the branch goes to the end block
7883 // directly and the path originates from the start block from the point of
7884 // view of the new PHI.
7885 if (TrueBlock == nullptr)
7886 TrueBlock = StartBlock;
7887 else if (FalseBlock == nullptr)
7888 FalseBlock = StartBlock;
7889
7890 SmallPtrSet<const Instruction *, 2> INS(llvm::from_range, ASI);
7891 // Use reverse iterator because later select may use the value of the
7892 // earlier select, and we need to propagate value through earlier select
7893 // to get the PHI operand.
7894 for (SelectInst *SI : llvm::reverse(C&: ASI)) {
7895 // The select itself is replaced with a PHI Node.
7896 PHINode *PN = PHINode::Create(Ty: SI->getType(), NumReservedValues: 2, NameStr: "");
7897 PN->insertBefore(InsertPos: EndBlock->begin());
7898 PN->takeName(V: SI);
7899 PN->addIncoming(V: getTrueOrFalseValue(SI, isTrue: true, Selects: INS), BB: TrueBlock);
7900 PN->addIncoming(V: getTrueOrFalseValue(SI, isTrue: false, Selects: INS), BB: FalseBlock);
7901 PN->setDebugLoc(SI->getDebugLoc());
7902
7903 replaceAllUsesWith(Old: SI, New: PN, FreshBBs, IsHuge: IsHugeFunc);
7904 SI->eraseFromParent();
7905 INS.erase(Ptr: SI);
7906 ++NumSelectsExpanded;
7907 }
7908
7909 // Instruct OptimizeBlock to skip to the next block.
7910 CurInstIterator = StartBlock->end();
7911 return true;
7912}
7913
7914/// Some targets only accept certain types for splat inputs. For example a VDUP
7915/// in MVE takes a GPR (integer) register, and the instruction that incorporate
7916/// a VDUP (such as a VADD qd, qm, rm) also require a gpr register.
7917bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
7918 // Accept shuf(insertelem(undef/poison, val, 0), undef/poison, <0,0,..>) only
7919 if (!match(V: SVI, P: m_Shuffle(v1: m_InsertElt(Val: m_Undef(), Elt: m_Value(), Idx: m_ZeroInt()),
7920 v2: m_Undef(), mask: m_ZeroMask())))
7921 return false;
7922 Type *NewType = TLI->shouldConvertSplatType(SVI);
7923 if (!NewType)
7924 return false;
7925
7926 auto *SVIVecType = cast<FixedVectorType>(Val: SVI->getType());
7927 assert(!NewType->isVectorTy() && "Expected a scalar type!");
7928 assert(NewType->getScalarSizeInBits() == SVIVecType->getScalarSizeInBits() &&
7929 "Expected a type of the same size!");
7930 auto *NewVecType =
7931 FixedVectorType::get(ElementType: NewType, NumElts: SVIVecType->getNumElements());
7932
7933 // Create a bitcast (shuffle (insert (bitcast(..))))
7934 IRBuilder<> Builder(SVI->getContext());
7935 Builder.SetInsertPoint(SVI);
7936 Value *BC1 = Builder.CreateBitCast(
7937 V: cast<Instruction>(Val: SVI->getOperand(i_nocapture: 0))->getOperand(i: 1), DestTy: NewType);
7938 Value *Shuffle = Builder.CreateVectorSplat(NumElts: NewVecType->getNumElements(), V: BC1);
7939 Value *BC2 = Builder.CreateBitCast(V: Shuffle, DestTy: SVIVecType);
7940
7941 replaceAllUsesWith(Old: SVI, New: BC2, FreshBBs, IsHuge: IsHugeFunc);
7942 RecursivelyDeleteTriviallyDeadInstructions(
7943 V: SVI, TLI: TLInfo, MSSAU: nullptr,
7944 AboutToDeleteCallback: [&](Value *V) { removeAllAssertingVHReferences(V); });
7945
7946 // Also hoist the bitcast up to its operand if it they are not in the same
7947 // block.
7948 if (auto *BCI = dyn_cast<Instruction>(Val: BC1))
7949 if (auto *Op = dyn_cast<Instruction>(Val: BCI->getOperand(i: 0)))
7950 if (BCI->getParent() != Op->getParent() && !isa<PHINode>(Val: Op) &&
7951 !Op->isTerminator() && !Op->isEHPad())
7952 BCI->moveAfter(MovePos: Op);
7953
7954 return true;
7955}
7956
7957bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
7958 // If the operands of I can be folded into a target instruction together with
7959 // I, duplicate and sink them.
7960 SmallVector<Use *, 4> OpsToSink;
7961 if (!TTI->isProfitableToSinkOperands(I, Ops&: OpsToSink))
7962 return false;
7963
7964 // OpsToSink can contain multiple uses in a use chain (e.g.
7965 // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating
7966 // uses must come first, so we process the ops in reverse order so as to not
7967 // create invalid IR.
7968 BasicBlock *TargetBB = I->getParent();
7969 bool Changed = false;
7970 SmallVector<Use *, 4> ToReplace;
7971 Instruction *InsertPoint = I;
7972 for (Use *U : reverse(C&: OpsToSink)) {
7973 auto *UI = cast<Instruction>(Val: U->get());
7974 if (isa<PHINode>(Val: UI) || UI->mayHaveSideEffects() || UI->mayReadFromMemory())
7975 continue;
7976 if (UI->getParent() == TargetBB) {
7977 if (UI->comesBefore(Other: InsertPoint))
7978 InsertPoint = UI;
7979 continue;
7980 }
7981 ToReplace.push_back(Elt: U);
7982 }
7983
7984 SetVector<Instruction *> MaybeDead;
7985 DenseMap<Instruction *, Instruction *> NewInstructions;
7986 for (Use *U : ToReplace) {
7987 auto *UI = cast<Instruction>(Val: U->get());
7988 Instruction *NI = UI->clone();
7989
7990 if (IsHugeFunc) {
7991 // Now we clone an instruction, its operands' defs may sink to this BB
7992 // now. So we put the operands defs' BBs into FreshBBs to do optimization.
7993 for (Value *Op : NI->operands())
7994 if (auto *OpDef = dyn_cast<Instruction>(Val: Op))
7995 FreshBBs.insert(Ptr: OpDef->getParent());
7996 }
7997
7998 NewInstructions[UI] = NI;
7999 MaybeDead.insert(X: UI);
8000 LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
8001 NI->insertBefore(InsertPos: InsertPoint->getIterator());
8002 InsertPoint = NI;
8003 InsertedInsts.insert(Ptr: NI);
8004
8005 // Update the use for the new instruction, making sure that we update the
8006 // sunk instruction uses, if it is part of a chain that has already been
8007 // sunk.
8008 Instruction *OldI = cast<Instruction>(Val: U->getUser());
8009 if (auto It = NewInstructions.find(Val: OldI); It != NewInstructions.end())
8010 It->second->setOperand(i: U->getOperandNo(), Val: NI);
8011 else
8012 U->set(NI);
8013 Changed = true;
8014 }
8015
8016 // Remove instructions that are dead after sinking.
8017 for (auto *I : MaybeDead) {
8018 if (!I->hasNUsesOrMore(N: 1)) {
8019 LLVM_DEBUG(dbgs() << "Removing dead instruction: " << *I << "\n");
8020 I->eraseFromParent();
8021 }
8022 }
8023
8024 return Changed;
8025}
8026
8027bool CodeGenPrepare::optimizeSwitchType(SwitchInst *SI) {
8028 Value *Cond = SI->getCondition();
8029 Type *OldType = Cond->getType();
8030 LLVMContext &Context = Cond->getContext();
8031 EVT OldVT = TLI->getValueType(DL: *DL, Ty: OldType);
8032 MVT RegType = TLI->getPreferredSwitchConditionType(Context, ConditionVT: OldVT);
8033 unsigned RegWidth = RegType.getSizeInBits();
8034
8035 if (RegWidth <= cast<IntegerType>(Val: OldType)->getBitWidth())
8036 return false;
8037
8038 // If the register width is greater than the type width, expand the condition
8039 // of the switch instruction and each case constant to the width of the
8040 // register. By widening the type of the switch condition, subsequent
8041 // comparisons (for case comparisons) will not need to be extended to the
8042 // preferred register width, so we will potentially eliminate N-1 extends,
8043 // where N is the number of cases in the switch.
8044 auto *NewType = Type::getIntNTy(C&: Context, N: RegWidth);
8045
8046 // Extend the switch condition and case constants using the target preferred
8047 // extend unless the switch condition is a function argument with an extend
8048 // attribute. In that case, we can avoid an unnecessary mask/extension by
8049 // matching the argument extension instead.
8050 Instruction::CastOps ExtType = Instruction::ZExt;
8051 // Some targets prefer SExt over ZExt.
8052 if (TLI->isSExtCheaperThanZExt(FromTy: OldVT, ToTy: RegType))
8053 ExtType = Instruction::SExt;
8054
8055 if (auto *Arg = dyn_cast<Argument>(Val: Cond)) {
8056 if (Arg->hasSExtAttr())
8057 ExtType = Instruction::SExt;
8058 if (Arg->hasZExtAttr())
8059 ExtType = Instruction::ZExt;
8060 }
8061
8062 auto *ExtInst = CastInst::Create(ExtType, S: Cond, Ty: NewType);
8063 ExtInst->insertBefore(InsertPos: SI->getIterator());
8064 ExtInst->setDebugLoc(SI->getDebugLoc());
8065 SI->setCondition(ExtInst);
8066 for (auto Case : SI->cases()) {
8067 const APInt &NarrowConst = Case.getCaseValue()->getValue();
8068 APInt WideConst = (ExtType == Instruction::ZExt)
8069 ? NarrowConst.zext(width: RegWidth)
8070 : NarrowConst.sext(width: RegWidth);
8071 Case.setValue(ConstantInt::get(Context, V: WideConst));
8072 }
8073
8074 return true;
8075}
8076
8077bool CodeGenPrepare::optimizeSwitchPhiConstants(SwitchInst *SI) {
8078 // The SCCP optimization tends to produce code like this:
8079 // switch(x) { case 42: phi(42, ...) }
8080 // Materializing the constant for the phi-argument needs instructions; So we
8081 // change the code to:
8082 // switch(x) { case 42: phi(x, ...) }
8083
8084 Value *Condition = SI->getCondition();
8085 // Avoid endless loop in degenerate case.
8086 if (isa<ConstantInt>(Val: *Condition))
8087 return false;
8088
8089 bool Changed = false;
8090 BasicBlock *SwitchBB = SI->getParent();
8091 Type *ConditionType = Condition->getType();
8092
8093 for (const SwitchInst::CaseHandle &Case : SI->cases()) {
8094 ConstantInt *CaseValue = Case.getCaseValue();
8095 BasicBlock *CaseBB = Case.getCaseSuccessor();
8096 // Set to true if we previously checked that `CaseBB` is only reached by
8097 // a single case from this switch.
8098 bool CheckedForSinglePred = false;
8099 for (PHINode &PHI : CaseBB->phis()) {
8100 Type *PHIType = PHI.getType();
8101 // If ZExt is free then we can also catch patterns like this:
8102 // switch((i32)x) { case 42: phi((i64)42, ...); }
8103 // and replace `(i64)42` with `zext i32 %x to i64`.
8104 bool TryZExt =
8105 PHIType->isIntegerTy() &&
8106 PHIType->getIntegerBitWidth() > ConditionType->getIntegerBitWidth() &&
8107 TLI->isZExtFree(FromTy: ConditionType, ToTy: PHIType);
8108 if (PHIType == ConditionType || TryZExt) {
8109 // Set to true to skip this case because of multiple preds.
8110 bool SkipCase = false;
8111 Value *Replacement = nullptr;
8112 for (unsigned I = 0, E = PHI.getNumIncomingValues(); I != E; I++) {
8113 Value *PHIValue = PHI.getIncomingValue(i: I);
8114 if (PHIValue != CaseValue) {
8115 if (!TryZExt)
8116 continue;
8117 ConstantInt *PHIValueInt = dyn_cast<ConstantInt>(Val: PHIValue);
8118 if (!PHIValueInt ||
8119 PHIValueInt->getValue() !=
8120 CaseValue->getValue().zext(width: PHIType->getIntegerBitWidth()))
8121 continue;
8122 }
8123 if (PHI.getIncomingBlock(i: I) != SwitchBB)
8124 continue;
8125 // We cannot optimize if there are multiple case labels jumping to
8126 // this block. This check may get expensive when there are many
8127 // case labels so we test for it last.
8128 if (!CheckedForSinglePred) {
8129 CheckedForSinglePred = true;
8130 if (SI->findCaseDest(BB: CaseBB) == nullptr) {
8131 SkipCase = true;
8132 break;
8133 }
8134 }
8135
8136 if (Replacement == nullptr) {
8137 if (PHIValue == CaseValue) {
8138 Replacement = Condition;
8139 } else {
8140 IRBuilder<> Builder(SI);
8141 Replacement = Builder.CreateZExt(V: Condition, DestTy: PHIType);
8142 }
8143 }
8144 PHI.setIncomingValue(i: I, V: Replacement);
8145 Changed = true;
8146 }
8147 if (SkipCase)
8148 break;
8149 }
8150 }
8151 }
8152 return Changed;
8153}
8154
8155bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
8156 bool Changed = optimizeSwitchType(SI);
8157 Changed |= optimizeSwitchPhiConstants(SI);
8158 return Changed;
8159}
8160
8161namespace {
8162
8163/// Helper class to promote a scalar operation to a vector one.
8164/// This class is used to move downward extractelement transition.
8165/// E.g.,
8166/// a = vector_op <2 x i32>
8167/// b = extractelement <2 x i32> a, i32 0
8168/// c = scalar_op b
8169/// store c
8170///
8171/// =>
8172/// a = vector_op <2 x i32>
8173/// c = vector_op a (equivalent to scalar_op on the related lane)
8174/// * d = extractelement <2 x i32> c, i32 0
8175/// * store d
8176/// Assuming both extractelement and store can be combine, we get rid of the
8177/// transition.
8178class VectorPromoteHelper {
8179 /// DataLayout associated with the current module.
8180 const DataLayout &DL;
8181
8182 /// Used to perform some checks on the legality of vector operations.
8183 const TargetLowering &TLI;
8184
8185 /// Used to estimated the cost of the promoted chain.
8186 const TargetTransformInfo &TTI;
8187
8188 /// The transition being moved downwards.
8189 Instruction *Transition;
8190
8191 /// The sequence of instructions to be promoted.
8192 SmallVector<Instruction *, 4> InstsToBePromoted;
8193
8194 /// Cost of combining a store and an extract.
8195 unsigned StoreExtractCombineCost;
8196
8197 /// Instruction that will be combined with the transition.
8198 Instruction *CombineInst = nullptr;
8199
8200 /// The instruction that represents the current end of the transition.
8201 /// Since we are faking the promotion until we reach the end of the chain
8202 /// of computation, we need a way to get the current end of the transition.
8203 Instruction *getEndOfTransition() const {
8204 if (InstsToBePromoted.empty())
8205 return Transition;
8206 return InstsToBePromoted.back();
8207 }
8208
8209 /// Return the index of the original value in the transition.
8210 /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
8211 /// c, is at index 0.
8212 unsigned getTransitionOriginalValueIdx() const {
8213 assert(isa<ExtractElementInst>(Transition) &&
8214 "Other kind of transitions are not supported yet");
8215 return 0;
8216 }
8217
8218 /// Return the index of the index in the transition.
8219 /// E.g., for "extractelement <2 x i32> c, i32 0" the index
8220 /// is at index 1.
8221 unsigned getTransitionIdx() const {
8222 assert(isa<ExtractElementInst>(Transition) &&
8223 "Other kind of transitions are not supported yet");
8224 return 1;
8225 }
8226
8227 /// Get the type of the transition.
8228 /// This is the type of the original value.
8229 /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
8230 /// transition is <2 x i32>.
8231 Type *getTransitionType() const {
8232 return Transition->getOperand(i: getTransitionOriginalValueIdx())->getType();
8233 }
8234
8235 /// Promote \p ToBePromoted by moving \p Def downward through.
8236 /// I.e., we have the following sequence:
8237 /// Def = Transition <ty1> a to <ty2>
8238 /// b = ToBePromoted <ty2> Def, ...
8239 /// =>
8240 /// b = ToBePromoted <ty1> a, ...
8241 /// Def = Transition <ty1> ToBePromoted to <ty2>
8242 void promoteImpl(Instruction *ToBePromoted);
8243
8244 /// Check whether or not it is profitable to promote all the
8245 /// instructions enqueued to be promoted.
8246 bool isProfitableToPromote() {
8247 Value *ValIdx = Transition->getOperand(i: getTransitionOriginalValueIdx());
8248 unsigned Index = isa<ConstantInt>(Val: ValIdx)
8249 ? cast<ConstantInt>(Val: ValIdx)->getZExtValue()
8250 : -1;
8251 Type *PromotedType = getTransitionType();
8252
8253 StoreInst *ST = cast<StoreInst>(Val: CombineInst);
8254 unsigned AS = ST->getPointerAddressSpace();
8255 // Check if this store is supported.
8256 if (!TLI.allowsMisalignedMemoryAccesses(
8257 TLI.getValueType(DL, Ty: ST->getValueOperand()->getType()), AddrSpace: AS,
8258 Alignment: ST->getAlign())) {
8259 // If this is not supported, there is no way we can combine
8260 // the extract with the store.
8261 return false;
8262 }
8263
8264 // The scalar chain of computation has to pay for the transition
8265 // scalar to vector.
8266 // The vector chain has to account for the combining cost.
8267 enum TargetTransformInfo::TargetCostKind CostKind =
8268 TargetTransformInfo::TCK_RecipThroughput;
8269 InstructionCost ScalarCost =
8270 TTI.getVectorInstrCost(I: *Transition, Val: PromotedType, CostKind, Index);
8271 InstructionCost VectorCost = StoreExtractCombineCost;
8272 for (const auto &Inst : InstsToBePromoted) {
8273 // Compute the cost.
8274 // By construction, all instructions being promoted are arithmetic ones.
8275 // Moreover, one argument is a constant that can be viewed as a splat
8276 // constant.
8277 Value *Arg0 = Inst->getOperand(i: 0);
8278 bool IsArg0Constant = isa<UndefValue>(Val: Arg0) || isa<ConstantInt>(Val: Arg0) ||
8279 isa<ConstantFP>(Val: Arg0);
8280 TargetTransformInfo::OperandValueInfo Arg0Info, Arg1Info;
8281 if (IsArg0Constant)
8282 Arg0Info.Kind = TargetTransformInfo::OK_UniformConstantValue;
8283 else
8284 Arg1Info.Kind = TargetTransformInfo::OK_UniformConstantValue;
8285
8286 ScalarCost += TTI.getArithmeticInstrCost(
8287 Opcode: Inst->getOpcode(), Ty: Inst->getType(), CostKind, Opd1Info: Arg0Info, Opd2Info: Arg1Info);
8288 VectorCost += TTI.getArithmeticInstrCost(Opcode: Inst->getOpcode(), Ty: PromotedType,
8289 CostKind, Opd1Info: Arg0Info, Opd2Info: Arg1Info);
8290 }
8291 LLVM_DEBUG(
8292 dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
8293 << ScalarCost << "\nVector: " << VectorCost << '\n');
8294 return ScalarCost > VectorCost;
8295 }
8296
8297 /// Generate a constant vector with \p Val with the same
8298 /// number of elements as the transition.
8299 /// \p UseSplat defines whether or not \p Val should be replicated
8300 /// across the whole vector.
8301 /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
8302 /// otherwise we generate a vector with as many poison as possible:
8303 /// <poison, ..., poison, Val, poison, ..., poison> where \p Val is only
8304 /// used at the index of the extract.
8305 Value *getConstantVector(Constant *Val, bool UseSplat) const {
8306 unsigned ExtractIdx = std::numeric_limits<unsigned>::max();
8307 if (!UseSplat) {
8308 // If we cannot determine where the constant must be, we have to
8309 // use a splat constant.
8310 Value *ValExtractIdx = Transition->getOperand(i: getTransitionIdx());
8311 if (ConstantInt *CstVal = dyn_cast<ConstantInt>(Val: ValExtractIdx))
8312 ExtractIdx = CstVal->getSExtValue();
8313 else
8314 UseSplat = true;
8315 }
8316
8317 ElementCount EC = cast<VectorType>(Val: getTransitionType())->getElementCount();
8318 if (UseSplat)
8319 return ConstantVector::getSplat(EC, Elt: Val);
8320
8321 if (!EC.isScalable()) {
8322 SmallVector<Constant *, 4> ConstVec;
8323 PoisonValue *PoisonVal = PoisonValue::get(T: Val->getType());
8324 for (unsigned Idx = 0; Idx != EC.getKnownMinValue(); ++Idx) {
8325 if (Idx == ExtractIdx)
8326 ConstVec.push_back(Elt: Val);
8327 else
8328 ConstVec.push_back(Elt: PoisonVal);
8329 }
8330 return ConstantVector::get(V: ConstVec);
8331 } else
8332 llvm_unreachable(
8333 "Generate scalable vector for non-splat is unimplemented");
8334 }
8335
8336 /// Check if promoting to a vector type an operand at \p OperandIdx
8337 /// in \p Use can trigger undefined behavior.
8338 static bool canCauseUndefinedBehavior(const Instruction *Use,
8339 unsigned OperandIdx) {
8340 // This is not safe to introduce undef when the operand is on
8341 // the right hand side of a division-like instruction.
8342 if (OperandIdx != 1)
8343 return false;
8344 switch (Use->getOpcode()) {
8345 default:
8346 return false;
8347 case Instruction::SDiv:
8348 case Instruction::UDiv:
8349 case Instruction::SRem:
8350 case Instruction::URem:
8351 return true;
8352 case Instruction::FDiv:
8353 case Instruction::FRem:
8354 return !Use->hasNoNaNs();
8355 }
8356 llvm_unreachable(nullptr);
8357 }
8358
8359public:
8360 VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
8361 const TargetTransformInfo &TTI, Instruction *Transition,
8362 unsigned CombineCost)
8363 : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
8364 StoreExtractCombineCost(CombineCost) {
8365 assert(Transition && "Do not know how to promote null");
8366 }
8367
8368 /// Check if we can promote \p ToBePromoted to \p Type.
8369 bool canPromote(const Instruction *ToBePromoted) const {
8370 // We could support CastInst too.
8371 return isa<BinaryOperator>(Val: ToBePromoted);
8372 }
8373
8374 /// Check if it is profitable to promote \p ToBePromoted
8375 /// by moving downward the transition through.
8376 bool shouldPromote(const Instruction *ToBePromoted) const {
8377 // Promote only if all the operands can be statically expanded.
8378 // Indeed, we do not want to introduce any new kind of transitions.
8379 for (const Use &U : ToBePromoted->operands()) {
8380 const Value *Val = U.get();
8381 if (Val == getEndOfTransition()) {
8382 // If the use is a division and the transition is on the rhs,
8383 // we cannot promote the operation, otherwise we may create a
8384 // division by zero.
8385 if (canCauseUndefinedBehavior(Use: ToBePromoted, OperandIdx: U.getOperandNo()))
8386 return false;
8387 continue;
8388 }
8389 if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
8390 !isa<ConstantFP>(Val))
8391 return false;
8392 }
8393 // Check that the resulting operation is legal.
8394 int ISDOpcode = TLI.InstructionOpcodeToISD(Opcode: ToBePromoted->getOpcode());
8395 if (!ISDOpcode)
8396 return false;
8397 return StressStoreExtract ||
8398 TLI.isOperationLegalOrCustom(
8399 Op: ISDOpcode, VT: TLI.getValueType(DL, Ty: getTransitionType(), AllowUnknown: true));
8400 }
8401
8402 /// Check whether or not \p Use can be combined
8403 /// with the transition.
8404 /// I.e., is it possible to do Use(Transition) => AnotherUse?
8405 bool canCombine(const Instruction *Use) { return isa<StoreInst>(Val: Use); }
8406
8407 /// Record \p ToBePromoted as part of the chain to be promoted.
8408 void enqueueForPromotion(Instruction *ToBePromoted) {
8409 InstsToBePromoted.push_back(Elt: ToBePromoted);
8410 }
8411
8412 /// Set the instruction that will be combined with the transition.
8413 void recordCombineInstruction(Instruction *ToBeCombined) {
8414 assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
8415 CombineInst = ToBeCombined;
8416 }
8417
8418 /// Promote all the instructions enqueued for promotion if it is
8419 /// is profitable.
8420 /// \return True if the promotion happened, false otherwise.
8421 bool promote() {
8422 // Check if there is something to promote.
8423 // Right now, if we do not have anything to combine with,
8424 // we assume the promotion is not profitable.
8425 if (InstsToBePromoted.empty() || !CombineInst)
8426 return false;
8427
8428 // Check cost.
8429 if (!StressStoreExtract && !isProfitableToPromote())
8430 return false;
8431
8432 // Promote.
8433 for (auto &ToBePromoted : InstsToBePromoted)
8434 promoteImpl(ToBePromoted);
8435 InstsToBePromoted.clear();
8436 return true;
8437 }
8438};
8439
8440} // end anonymous namespace
8441
8442void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
8443 // At this point, we know that all the operands of ToBePromoted but Def
8444 // can be statically promoted.
8445 // For Def, we need to use its parameter in ToBePromoted:
8446 // b = ToBePromoted ty1 a
8447 // Def = Transition ty1 b to ty2
8448 // Move the transition down.
8449 // 1. Replace all uses of the promoted operation by the transition.
8450 // = ... b => = ... Def.
8451 assert(ToBePromoted->getType() == Transition->getType() &&
8452 "The type of the result of the transition does not match "
8453 "the final type");
8454 ToBePromoted->replaceAllUsesWith(V: Transition);
8455 // 2. Update the type of the uses.
8456 // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
8457 Type *TransitionTy = getTransitionType();
8458 ToBePromoted->mutateType(Ty: TransitionTy);
8459 // 3. Update all the operands of the promoted operation with promoted
8460 // operands.
8461 // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
8462 for (Use &U : ToBePromoted->operands()) {
8463 Value *Val = U.get();
8464 Value *NewVal = nullptr;
8465 if (Val == Transition)
8466 NewVal = Transition->getOperand(i: getTransitionOriginalValueIdx());
8467 else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
8468 isa<ConstantFP>(Val)) {
8469 // Use a splat constant if it is not safe to use undef.
8470 NewVal = getConstantVector(
8471 Val: cast<Constant>(Val),
8472 UseSplat: isa<UndefValue>(Val) ||
8473 canCauseUndefinedBehavior(Use: ToBePromoted, OperandIdx: U.getOperandNo()));
8474 } else
8475 llvm_unreachable("Did you modified shouldPromote and forgot to update "
8476 "this?");
8477 ToBePromoted->setOperand(i: U.getOperandNo(), Val: NewVal);
8478 }
8479 Transition->moveAfter(MovePos: ToBePromoted);
8480 Transition->setOperand(i: getTransitionOriginalValueIdx(), Val: ToBePromoted);
8481}
8482
8483/// Some targets can do store(extractelement) with one instruction.
8484/// Try to push the extractelement towards the stores when the target
8485/// has this feature and this is profitable.
8486bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
8487 unsigned CombineCost = std::numeric_limits<unsigned>::max();
8488 if (DisableStoreExtract ||
8489 (!StressStoreExtract &&
8490 !TLI->canCombineStoreAndExtract(VectorTy: Inst->getOperand(i: 0)->getType(),
8491 Idx: Inst->getOperand(i: 1), Cost&: CombineCost)))
8492 return false;
8493
8494 // At this point we know that Inst is a vector to scalar transition.
8495 // Try to move it down the def-use chain, until:
8496 // - We can combine the transition with its single use
8497 // => we got rid of the transition.
8498 // - We escape the current basic block
8499 // => we would need to check that we are moving it at a cheaper place and
8500 // we do not do that for now.
8501 BasicBlock *Parent = Inst->getParent();
8502 LLVM_DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
8503 VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
8504 // If the transition has more than one use, assume this is not going to be
8505 // beneficial.
8506 while (Inst->hasOneUse()) {
8507 Instruction *ToBePromoted = cast<Instruction>(Val: *Inst->user_begin());
8508 LLVM_DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
8509
8510 if (ToBePromoted->getParent() != Parent) {
8511 LLVM_DEBUG(dbgs() << "Instruction to promote is in a different block ("
8512 << ToBePromoted->getParent()->getName()
8513 << ") than the transition (" << Parent->getName()
8514 << ").\n");
8515 return false;
8516 }
8517
8518 if (VPH.canCombine(Use: ToBePromoted)) {
8519 LLVM_DEBUG(dbgs() << "Assume " << *Inst << '\n'
8520 << "will be combined with: " << *ToBePromoted << '\n');
8521 VPH.recordCombineInstruction(ToBeCombined: ToBePromoted);
8522 bool Changed = VPH.promote();
8523 NumStoreExtractExposed += Changed;
8524 return Changed;
8525 }
8526
8527 LLVM_DEBUG(dbgs() << "Try promoting.\n");
8528 if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
8529 return false;
8530
8531 LLVM_DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
8532
8533 VPH.enqueueForPromotion(ToBePromoted);
8534 Inst = ToBePromoted;
8535 }
8536 return false;
8537}
8538
8539/// For the instruction sequence of store below, F and I values
8540/// are bundled together as an i64 value before being stored into memory.
8541/// Sometimes it is more efficient to generate separate stores for F and I,
8542/// which can remove the bitwise instructions or sink them to colder places.
8543///
8544/// (store (or (zext (bitcast F to i32) to i64),
8545/// (shl (zext I to i64), 32)), addr) -->
8546/// (store F, addr) and (store I, addr+4)
8547///
8548/// Similarly, splitting for other merged store can also be beneficial, like:
8549/// For pair of {i32, i32}, i64 store --> two i32 stores.
8550/// For pair of {i32, i16}, i64 store --> two i32 stores.
8551/// For pair of {i16, i16}, i32 store --> two i16 stores.
8552/// For pair of {i16, i8}, i32 store --> two i16 stores.
8553/// For pair of {i8, i8}, i16 store --> two i8 stores.
8554///
8555/// We allow each target to determine specifically which kind of splitting is
8556/// supported.
8557///
8558/// The store patterns are commonly seen from the simple code snippet below
8559/// if only std::make_pair(...) is sroa transformed before inlined into hoo.
8560/// void goo(const std::pair<int, float> &);
8561/// hoo() {
8562/// ...
8563/// goo(std::make_pair(tmp, ftmp));
8564/// ...
8565/// }
8566///
8567/// Although we already have similar splitting in DAG Combine, we duplicate
8568/// it in CodeGenPrepare to catch the case in which pattern is across
8569/// multiple BBs. The logic in DAG Combine is kept to catch case generated
8570/// during code expansion.
8571static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
8572 const TargetLowering &TLI) {
8573 // Handle simple but common cases only.
8574 Type *StoreType = SI.getValueOperand()->getType();
8575
8576 // The code below assumes shifting a value by <number of bits>,
8577 // whereas scalable vectors would have to be shifted by
8578 // <2log(vscale) + number of bits> in order to store the
8579 // low/high parts. Bailing out for now.
8580 if (StoreType->isScalableTy())
8581 return false;
8582
8583 if (!DL.typeSizeEqualsStoreSize(Ty: StoreType) ||
8584 DL.getTypeSizeInBits(Ty: StoreType) == 0)
8585 return false;
8586
8587 unsigned HalfValBitSize = DL.getTypeSizeInBits(Ty: StoreType) / 2;
8588 Type *SplitStoreType = Type::getIntNTy(C&: SI.getContext(), N: HalfValBitSize);
8589 if (!DL.typeSizeEqualsStoreSize(Ty: SplitStoreType))
8590 return false;
8591
8592 // Don't split the store if it is volatile or atomic.
8593 if (!SI.isSimple())
8594 return false;
8595
8596 // Match the following patterns:
8597 // (store (or (zext LValue to i64),
8598 // (shl (zext HValue to i64), 32)), HalfValBitSize)
8599 // or
8600 // (store (or (shl (zext HValue to i64), 32)), HalfValBitSize)
8601 // (zext LValue to i64),
8602 // Expect both operands of OR and the first operand of SHL have only
8603 // one use.
8604 Value *LValue, *HValue;
8605 if (!match(V: SI.getValueOperand(),
8606 P: m_c_Or(L: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: LValue))),
8607 R: m_OneUse(SubPattern: m_Shl(L: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: HValue))),
8608 R: m_SpecificInt(V: HalfValBitSize))))))
8609 return false;
8610
8611 // Check LValue and HValue are int with size less or equal than 32.
8612 if (!LValue->getType()->isIntegerTy() ||
8613 DL.getTypeSizeInBits(Ty: LValue->getType()) > HalfValBitSize ||
8614 !HValue->getType()->isIntegerTy() ||
8615 DL.getTypeSizeInBits(Ty: HValue->getType()) > HalfValBitSize)
8616 return false;
8617
8618 // If LValue/HValue is a bitcast instruction, use the EVT before bitcast
8619 // as the input of target query.
8620 auto *LBC = dyn_cast<BitCastInst>(Val: LValue);
8621 auto *HBC = dyn_cast<BitCastInst>(Val: HValue);
8622 EVT LowTy = LBC ? EVT::getEVT(Ty: LBC->getOperand(i_nocapture: 0)->getType())
8623 : EVT::getEVT(Ty: LValue->getType());
8624 EVT HighTy = HBC ? EVT::getEVT(Ty: HBC->getOperand(i_nocapture: 0)->getType())
8625 : EVT::getEVT(Ty: HValue->getType());
8626 if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LTy: LowTy, HTy: HighTy))
8627 return false;
8628
8629 // Start to split store.
8630 IRBuilder<> Builder(SI.getContext());
8631 Builder.SetInsertPoint(&SI);
8632
8633 // If LValue/HValue is a bitcast in another BB, create a new one in current
8634 // BB so it may be merged with the splitted stores by dag combiner.
8635 if (LBC && LBC->getParent() != SI.getParent())
8636 LValue = Builder.CreateBitCast(V: LBC->getOperand(i_nocapture: 0), DestTy: LBC->getType());
8637 if (HBC && HBC->getParent() != SI.getParent())
8638 HValue = Builder.CreateBitCast(V: HBC->getOperand(i_nocapture: 0), DestTy: HBC->getType());
8639
8640 bool IsLE = SI.getDataLayout().isLittleEndian();
8641 auto CreateSplitStore = [&](Value *V, bool Upper) {
8642 V = Builder.CreateZExtOrBitCast(V, DestTy: SplitStoreType);
8643 Value *Addr = SI.getPointerOperand();
8644 Align Alignment = SI.getAlign();
8645 const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper);
8646 if (IsOffsetStore) {
8647 Addr = Builder.CreateGEP(
8648 Ty: SplitStoreType, Ptr: Addr,
8649 IdxList: ConstantInt::get(Ty: Type::getInt32Ty(C&: SI.getContext()), V: 1));
8650
8651 // When splitting the store in half, naturally one half will retain the
8652 // alignment of the original wider store, regardless of whether it was
8653 // over-aligned or not, while the other will require adjustment.
8654 Alignment = commonAlignment(A: Alignment, Offset: HalfValBitSize / 8);
8655 }
8656 Builder.CreateAlignedStore(Val: V, Ptr: Addr, Align: Alignment);
8657 };
8658
8659 CreateSplitStore(LValue, false);
8660 CreateSplitStore(HValue, true);
8661
8662 // Delete the old store.
8663 SI.eraseFromParent();
8664 return true;
8665}
8666
8667// Return true if the GEP has two operands, the first operand is of a sequential
8668// type, and the second operand is a constant.
8669static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) {
8670 gep_type_iterator I = gep_type_begin(GEP: *GEP);
8671 return GEP->getNumOperands() == 2 && I.isSequential() &&
8672 isa<ConstantInt>(Val: GEP->getOperand(i_nocapture: 1));
8673}
8674
8675// Try unmerging GEPs to reduce liveness interference (register pressure) across
8676// IndirectBr edges. Since IndirectBr edges tend to touch on many blocks,
8677// reducing liveness interference across those edges benefits global register
8678// allocation. Currently handles only certain cases.
8679//
8680// For example, unmerge %GEPI and %UGEPI as below.
8681//
8682// ---------- BEFORE ----------
8683// SrcBlock:
8684// ...
8685// %GEPIOp = ...
8686// ...
8687// %GEPI = gep %GEPIOp, Idx
8688// ...
8689// indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ]
8690// (* %GEPI is alive on the indirectbr edges due to other uses ahead)
8691// (* %GEPIOp is alive on the indirectbr edges only because it is used by
8692// %UGEPI)
8693//
8694// DstB0: ... (there may be a gep similar to %UGEPI to be unmerged)
8695// DstB1: ... (there may be a gep similar to %UGEPI to be unmerged)
8696// ...
8697//
8698// DstBi:
8699// ...
8700// %UGEPI = gep %GEPIOp, UIdx
8701// ...
8702// ---------------------------
8703//
8704// ---------- AFTER ----------
8705// SrcBlock:
8706// ... (same as above)
8707// (* %GEPI is still alive on the indirectbr edges)
8708// (* %GEPIOp is no longer alive on the indirectbr edges as a result of the
8709// unmerging)
8710// ...
8711//
8712// DstBi:
8713// ...
8714// %UGEPI = gep %GEPI, (UIdx-Idx)
8715// ...
8716// ---------------------------
8717//
8718// The register pressure on the IndirectBr edges is reduced because %GEPIOp is
8719// no longer alive on them.
8720//
8721// We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting merging
8722// of GEPs in the first place in InstCombiner::visitGetElementPtrInst() so as
8723// not to disable further simplifications and optimizations as a result of GEP
8724// merging.
8725//
8726// Note this unmerging may increase the length of the data flow critical path
8727// (the path from %GEPIOp to %UGEPI would go through %GEPI), which is a tradeoff
8728// between the register pressure and the length of data-flow critical
8729// path. Restricting this to the uncommon IndirectBr case would minimize the
8730// impact of potentially longer critical path, if any, and the impact on compile
8731// time.
8732static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
8733 const TargetTransformInfo *TTI) {
8734 BasicBlock *SrcBlock = GEPI->getParent();
8735 // Check that SrcBlock ends with an IndirectBr. If not, give up. The common
8736 // (non-IndirectBr) cases exit early here.
8737 if (!isa<IndirectBrInst>(Val: SrcBlock->getTerminator()))
8738 return false;
8739 // Check that GEPI is a simple gep with a single constant index.
8740 if (!GEPSequentialConstIndexed(GEP: GEPI))
8741 return false;
8742 ConstantInt *GEPIIdx = cast<ConstantInt>(Val: GEPI->getOperand(i_nocapture: 1));
8743 // Check that GEPI is a cheap one.
8744 if (TTI->getIntImmCost(Imm: GEPIIdx->getValue(), Ty: GEPIIdx->getType(),
8745 CostKind: TargetTransformInfo::TCK_SizeAndLatency) >
8746 TargetTransformInfo::TCC_Basic)
8747 return false;
8748 Value *GEPIOp = GEPI->getOperand(i_nocapture: 0);
8749 // Check that GEPIOp is an instruction that's also defined in SrcBlock.
8750 if (!isa<Instruction>(Val: GEPIOp))
8751 return false;
8752 auto *GEPIOpI = cast<Instruction>(Val: GEPIOp);
8753 if (GEPIOpI->getParent() != SrcBlock)
8754 return false;
8755 // Check that GEP is used outside the block, meaning it's alive on the
8756 // IndirectBr edge(s).
8757 if (llvm::none_of(Range: GEPI->users(), P: [&](User *Usr) {
8758 if (auto *I = dyn_cast<Instruction>(Val: Usr)) {
8759 if (I->getParent() != SrcBlock) {
8760 return true;
8761 }
8762 }
8763 return false;
8764 }))
8765 return false;
8766 // The second elements of the GEP chains to be unmerged.
8767 std::vector<GetElementPtrInst *> UGEPIs;
8768 // Check each user of GEPIOp to check if unmerging would make GEPIOp not alive
8769 // on IndirectBr edges.
8770 for (User *Usr : GEPIOp->users()) {
8771 if (Usr == GEPI)
8772 continue;
8773 // Check if Usr is an Instruction. If not, give up.
8774 if (!isa<Instruction>(Val: Usr))
8775 return false;
8776 auto *UI = cast<Instruction>(Val: Usr);
8777 // Check if Usr in the same block as GEPIOp, which is fine, skip.
8778 if (UI->getParent() == SrcBlock)
8779 continue;
8780 // Check if Usr is a GEP. If not, give up.
8781 if (!isa<GetElementPtrInst>(Val: Usr))
8782 return false;
8783 auto *UGEPI = cast<GetElementPtrInst>(Val: Usr);
8784 // Check if UGEPI is a simple gep with a single constant index and GEPIOp is
8785 // the pointer operand to it. If so, record it in the vector. If not, give
8786 // up.
8787 if (!GEPSequentialConstIndexed(GEP: UGEPI))
8788 return false;
8789 if (UGEPI->getOperand(i_nocapture: 0) != GEPIOp)
8790 return false;
8791 if (UGEPI->getSourceElementType() != GEPI->getSourceElementType())
8792 return false;
8793 if (GEPIIdx->getType() !=
8794 cast<ConstantInt>(Val: UGEPI->getOperand(i_nocapture: 1))->getType())
8795 return false;
8796 ConstantInt *UGEPIIdx = cast<ConstantInt>(Val: UGEPI->getOperand(i_nocapture: 1));
8797 if (TTI->getIntImmCost(Imm: UGEPIIdx->getValue(), Ty: UGEPIIdx->getType(),
8798 CostKind: TargetTransformInfo::TCK_SizeAndLatency) >
8799 TargetTransformInfo::TCC_Basic)
8800 return false;
8801 UGEPIs.push_back(x: UGEPI);
8802 }
8803 if (UGEPIs.size() == 0)
8804 return false;
8805 // Check the materializing cost of (Uidx-Idx).
8806 for (GetElementPtrInst *UGEPI : UGEPIs) {
8807 ConstantInt *UGEPIIdx = cast<ConstantInt>(Val: UGEPI->getOperand(i_nocapture: 1));
8808 APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
8809 InstructionCost ImmCost = TTI->getIntImmCost(
8810 Imm: NewIdx, Ty: GEPIIdx->getType(), CostKind: TargetTransformInfo::TCK_SizeAndLatency);
8811 if (ImmCost > TargetTransformInfo::TCC_Basic)
8812 return false;
8813 }
8814 // Now unmerge between GEPI and UGEPIs.
8815 for (GetElementPtrInst *UGEPI : UGEPIs) {
8816 UGEPI->setOperand(i_nocapture: 0, Val_nocapture: GEPI);
8817 ConstantInt *UGEPIIdx = cast<ConstantInt>(Val: UGEPI->getOperand(i_nocapture: 1));
8818 auto NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
8819 Constant *NewUGEPIIdx = ConstantInt::get(Ty: GEPIIdx->getType(), V: NewIdx);
8820 UGEPI->setOperand(i_nocapture: 1, Val_nocapture: NewUGEPIIdx);
8821
8822 auto SourceFlags = GEPI->getNoWrapFlags();
8823 // Intersect flags to avoid UB in updated GEP.
8824 auto TargetFlags =
8825 UGEPI->getNoWrapFlags().intersectForOffsetAdd(Other: SourceFlags);
8826 // If UGEPI now has a negative index, drop the nuw flag.
8827 if (NewIdx.isNegative() && TargetFlags.hasNoUnsignedWrap())
8828 TargetFlags = TargetFlags.withoutNoUnsignedWrap();
8829 UGEPI->setNoWrapFlags(TargetFlags);
8830 }
8831 // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not
8832 // alive on IndirectBr edges).
8833 assert(llvm::none_of(GEPIOp->users(),
8834 [&](User *Usr) {
8835 return cast<Instruction>(Usr)->getParent() != SrcBlock;
8836 }) &&
8837 "GEPIOp is used outside SrcBlock");
8838 return true;
8839}
8840
8841static bool optimizeBranch(CondBrInst *Branch, const TargetLowering &TLI,
8842 SmallPtrSet<BasicBlock *, 32> &FreshBBs,
8843 bool IsHugeFunc) {
8844 // Try and convert
8845 // %c = icmp ult %x, 8
8846 // br %c, bla, blb
8847 // %tc = lshr %x, 3
8848 // to
8849 // %tc = lshr %x, 3
8850 // %c = icmp eq %tc, 0
8851 // br %c, bla, blb
8852 // Creating the cmp to zero can be better for the backend, especially if the
8853 // lshr produces flags that can be used automatically.
8854 if (!TLI.preferZeroCompareBranch())
8855 return false;
8856
8857 ICmpInst *Cmp = dyn_cast<ICmpInst>(Val: Branch->getCondition());
8858 if (!Cmp || !isa<ConstantInt>(Val: Cmp->getOperand(i_nocapture: 1)) || !Cmp->hasOneUse())
8859 return false;
8860
8861 Value *X = Cmp->getOperand(i_nocapture: 0);
8862 if (!X->hasUseList())
8863 return false;
8864
8865 APInt CmpC = cast<ConstantInt>(Val: Cmp->getOperand(i_nocapture: 1))->getValue();
8866
8867 for (auto *U : X->users()) {
8868 Instruction *UI = dyn_cast<Instruction>(Val: U);
8869 // A quick dominance check
8870 if (!UI ||
8871 (UI->getParent() != Branch->getParent() &&
8872 UI->getParent() != Branch->getSuccessor(i: 0) &&
8873 UI->getParent() != Branch->getSuccessor(i: 1)) ||
8874 (UI->getParent() != Branch->getParent() &&
8875 !UI->getParent()->getSinglePredecessor()))
8876 continue;
8877
8878 if (CmpC.isPowerOf2() && Cmp->getPredicate() == ICmpInst::ICMP_ULT &&
8879 match(V: UI, P: m_Shr(L: m_Specific(V: X), R: m_SpecificInt(V: CmpC.logBase2())))) {
8880 IRBuilder<> Builder(Branch);
8881 if (UI->getParent() != Branch->getParent())
8882 UI->moveBefore(InsertPos: Branch->getIterator());
8883 UI->dropPoisonGeneratingFlags();
8884 Value *NewCmp = Builder.CreateCmp(Pred: ICmpInst::ICMP_EQ, LHS: UI,
8885 RHS: ConstantInt::get(Ty: UI->getType(), V: 0));
8886 LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
8887 LLVM_DEBUG(dbgs() << " to compare on zero: " << *NewCmp << "\n");
8888 replaceAllUsesWith(Old: Cmp, New: NewCmp, FreshBBs, IsHuge: IsHugeFunc);
8889 return true;
8890 }
8891 if (Cmp->isEquality() &&
8892 (match(V: UI, P: m_Add(L: m_Specific(V: X), R: m_SpecificInt(V: -CmpC))) ||
8893 match(V: UI, P: m_Sub(L: m_Specific(V: X), R: m_SpecificInt(V: CmpC))) ||
8894 match(V: UI, P: m_Xor(L: m_Specific(V: X), R: m_SpecificInt(V: CmpC))))) {
8895 IRBuilder<> Builder(Branch);
8896 if (UI->getParent() != Branch->getParent())
8897 UI->moveBefore(InsertPos: Branch->getIterator());
8898 UI->dropPoisonGeneratingFlags();
8899 Value *NewCmp = Builder.CreateCmp(Pred: Cmp->getPredicate(), LHS: UI,
8900 RHS: ConstantInt::get(Ty: UI->getType(), V: 0));
8901 LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
8902 LLVM_DEBUG(dbgs() << " to compare on zero: " << *NewCmp << "\n");
8903 replaceAllUsesWith(Old: Cmp, New: NewCmp, FreshBBs, IsHuge: IsHugeFunc);
8904 return true;
8905 }
8906 }
8907 return false;
8908}
8909
8910bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
8911 bool AnyChange = false;
8912 AnyChange = fixupDbgVariableRecordsOnInst(I&: *I);
8913
8914 // Bail out if we inserted the instruction to prevent optimizations from
8915 // stepping on each other's toes.
8916 if (InsertedInsts.count(Ptr: I))
8917 return AnyChange;
8918
8919 // TODO: Move into the switch on opcode below here.
8920 if (PHINode *P = dyn_cast<PHINode>(Val: I)) {
8921 // It is possible for very late stage optimizations (such as SimplifyCFG)
8922 // to introduce PHI nodes too late to be cleaned up. If we detect such a
8923 // trivial PHI, go ahead and zap it here.
8924 if (Value *V = simplifyInstruction(I: P, Q: {*DL, TLInfo})) {
8925 LargeOffsetGEPMap.erase(Key: P);
8926 replaceAllUsesWith(Old: P, New: V, FreshBBs, IsHuge: IsHugeFunc);
8927 P->eraseFromParent();
8928 ++NumPHIsElim;
8929 return true;
8930 }
8931 return AnyChange;
8932 }
8933
8934 if (CastInst *CI = dyn_cast<CastInst>(Val: I)) {
8935 // If the source of the cast is a constant, then this should have
8936 // already been constant folded. The only reason NOT to constant fold
8937 // it is if something (e.g. LSR) was careful to place the constant
8938 // evaluation in a block other than then one that uses it (e.g. to hoist
8939 // the address of globals out of a loop). If this is the case, we don't
8940 // want to forward-subst the cast.
8941 if (isa<Constant>(Val: CI->getOperand(i_nocapture: 0)))
8942 return AnyChange;
8943
8944 if (OptimizeNoopCopyExpression(CI, TLI: *TLI, DL: *DL))
8945 return true;
8946
8947 if ((isa<UIToFPInst>(Val: I) || isa<SIToFPInst>(Val: I) || isa<FPToUIInst>(Val: I) ||
8948 isa<TruncInst>(Val: I)) &&
8949 TLI->optimizeExtendOrTruncateConversion(
8950 I, L: LI->getLoopFor(BB: I->getParent()), TTI: *TTI))
8951 return true;
8952
8953 if (isa<ZExtInst>(Val: I) || isa<SExtInst>(Val: I)) {
8954 /// Sink a zext or sext into its user blocks if the target type doesn't
8955 /// fit in one register
8956 if (TLI->getTypeAction(Context&: CI->getContext(),
8957 VT: TLI->getValueType(DL: *DL, Ty: CI->getType())) ==
8958 TargetLowering::TypeExpandInteger) {
8959 return SinkCast(CI);
8960 } else {
8961 if (TLI->optimizeExtendOrTruncateConversion(
8962 I, L: LI->getLoopFor(BB: I->getParent()), TTI: *TTI))
8963 return true;
8964
8965 bool MadeChange = optimizeExt(Inst&: I);
8966 return MadeChange | optimizeExtUses(I);
8967 }
8968 }
8969 return AnyChange;
8970 }
8971
8972 if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
8973 if (optimizeCmp(Cmp, ModifiedDT))
8974 return true;
8975
8976 if (match(V: I, P: m_URem(L: m_Value(), R: m_Value())))
8977 if (optimizeURem(Rem: I))
8978 return true;
8979
8980 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) {
8981 LI->setMetadata(KindID: LLVMContext::MD_invariant_group, Node: nullptr);
8982 bool Modified = optimizeLoadExt(Load: LI);
8983 unsigned AS = LI->getPointerAddressSpace();
8984 Modified |= optimizeMemoryInst(MemoryInst: I, Addr: I->getOperand(i: 0), AccessTy: LI->getType(), AddrSpace: AS);
8985 return Modified;
8986 }
8987
8988 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
8989 if (splitMergedValStore(SI&: *SI, DL: *DL, TLI: *TLI))
8990 return true;
8991 SI->setMetadata(KindID: LLVMContext::MD_invariant_group, Node: nullptr);
8992 unsigned AS = SI->getPointerAddressSpace();
8993 return optimizeMemoryInst(MemoryInst: I, Addr: SI->getOperand(i_nocapture: 1),
8994 AccessTy: SI->getOperand(i_nocapture: 0)->getType(), AddrSpace: AS);
8995 }
8996
8997 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: I)) {
8998 unsigned AS = RMW->getPointerAddressSpace();
8999 return optimizeMemoryInst(MemoryInst: I, Addr: RMW->getPointerOperand(), AccessTy: RMW->getType(), AddrSpace: AS);
9000 }
9001
9002 if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Val: I)) {
9003 unsigned AS = CmpX->getPointerAddressSpace();
9004 return optimizeMemoryInst(MemoryInst: I, Addr: CmpX->getPointerOperand(),
9005 AccessTy: CmpX->getCompareOperand()->getType(), AddrSpace: AS);
9006 }
9007
9008 BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Val: I);
9009
9010 if (BinOp && BinOp->getOpcode() == Instruction::And && EnableAndCmpSinking &&
9011 sinkAndCmp0Expression(AndI: BinOp, TLI: *TLI, InsertedInsts))
9012 return true;
9013
9014 // TODO: Move this into the switch on opcode - it handles shifts already.
9015 if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
9016 BinOp->getOpcode() == Instruction::LShr)) {
9017 ConstantInt *CI = dyn_cast<ConstantInt>(Val: BinOp->getOperand(i_nocapture: 1));
9018 if (CI && TLI->hasExtractBitsInsn())
9019 if (OptimizeExtractBits(ShiftI: BinOp, CI, TLI: *TLI, DL: *DL))
9020 return true;
9021 }
9022
9023 if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Val: I)) {
9024 if (GEPI->hasAllZeroIndices()) {
9025 /// The GEP operand must be a pointer, so must its result -> BitCast
9026 Instruction *NC = new BitCastInst(GEPI->getOperand(i_nocapture: 0), GEPI->getType(),
9027 GEPI->getName(), GEPI->getIterator());
9028 NC->setDebugLoc(GEPI->getDebugLoc());
9029 replaceAllUsesWith(Old: GEPI, New: NC, FreshBBs, IsHuge: IsHugeFunc);
9030 RecursivelyDeleteTriviallyDeadInstructions(
9031 V: GEPI, TLI: TLInfo, MSSAU: nullptr,
9032 AboutToDeleteCallback: [&](Value *V) { removeAllAssertingVHReferences(V); });
9033 ++NumGEPsElim;
9034 optimizeInst(I: NC, ModifiedDT);
9035 return true;
9036 }
9037 if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
9038 return true;
9039 }
9040 }
9041
9042 if (FreezeInst *FI = dyn_cast<FreezeInst>(Val: I)) {
9043 // freeze(icmp a, const)) -> icmp (freeze a), const
9044 // This helps generate efficient conditional jumps.
9045 Instruction *CmpI = nullptr;
9046 if (ICmpInst *II = dyn_cast<ICmpInst>(Val: FI->getOperand(i_nocapture: 0)))
9047 CmpI = II;
9048 else if (FCmpInst *F = dyn_cast<FCmpInst>(Val: FI->getOperand(i_nocapture: 0)))
9049 CmpI = F->getFastMathFlags().none() ? F : nullptr;
9050
9051 if (CmpI && CmpI->hasOneUse()) {
9052 auto Op0 = CmpI->getOperand(i: 0), Op1 = CmpI->getOperand(i: 1);
9053 bool Const0 = isa<ConstantInt>(Val: Op0) || isa<ConstantFP>(Val: Op0) ||
9054 isa<ConstantPointerNull>(Val: Op0);
9055 bool Const1 = isa<ConstantInt>(Val: Op1) || isa<ConstantFP>(Val: Op1) ||
9056 isa<ConstantPointerNull>(Val: Op1);
9057 if (Const0 || Const1) {
9058 if (!Const0 || !Const1) {
9059 auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI->getIterator());
9060 F->takeName(V: FI);
9061 CmpI->setOperand(i: Const0 ? 1 : 0, Val: F);
9062 }
9063 replaceAllUsesWith(Old: FI, New: CmpI, FreshBBs, IsHuge: IsHugeFunc);
9064 FI->eraseFromParent();
9065 return true;
9066 }
9067 }
9068 return AnyChange;
9069 }
9070
9071 if (tryToSinkFreeOperands(I))
9072 return true;
9073
9074 switch (I->getOpcode()) {
9075 case Instruction::Shl:
9076 case Instruction::LShr:
9077 case Instruction::AShr:
9078 return optimizeShiftInst(Shift: cast<BinaryOperator>(Val: I));
9079 case Instruction::Call:
9080 return optimizeCallInst(CI: cast<CallInst>(Val: I), ModifiedDT);
9081 case Instruction::Select:
9082 return optimizeSelectInst(SI: cast<SelectInst>(Val: I));
9083 case Instruction::ShuffleVector:
9084 return optimizeShuffleVectorInst(SVI: cast<ShuffleVectorInst>(Val: I));
9085 case Instruction::Switch:
9086 return optimizeSwitchInst(SI: cast<SwitchInst>(Val: I));
9087 case Instruction::ExtractElement:
9088 return optimizeExtractElementInst(Inst: cast<ExtractElementInst>(Val: I));
9089 case Instruction::CondBr:
9090 return optimizeBranch(Branch: cast<CondBrInst>(Val: I), TLI: *TLI, FreshBBs, IsHugeFunc);
9091 }
9092
9093 return AnyChange;
9094}
9095
9096/// Given an OR instruction, check to see if this is a bitreverse
9097/// idiom. If so, insert the new intrinsic and return true.
9098bool CodeGenPrepare::makeBitReverse(Instruction &I) {
9099 if (!I.getType()->isIntegerTy() ||
9100 !TLI->isOperationLegalOrCustom(Op: ISD::BITREVERSE,
9101 VT: TLI->getValueType(DL: *DL, Ty: I.getType(), AllowUnknown: true)))
9102 return false;
9103
9104 SmallVector<Instruction *, 4> Insts;
9105 if (!recognizeBSwapOrBitReverseIdiom(I: &I, MatchBSwaps: false, MatchBitReversals: true, InsertedInsts&: Insts))
9106 return false;
9107 Instruction *LastInst = Insts.back();
9108 replaceAllUsesWith(Old: &I, New: LastInst, FreshBBs, IsHuge: IsHugeFunc);
9109 RecursivelyDeleteTriviallyDeadInstructions(
9110 V: &I, TLI: TLInfo, MSSAU: nullptr,
9111 AboutToDeleteCallback: [&](Value *V) { removeAllAssertingVHReferences(V); });
9112 return true;
9113}
9114
9115// In this pass we look for GEP and cast instructions that are used
9116// across basic blocks and rewrite them to improve basic-block-at-a-time
9117// selection.
9118bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, ModifyDT &ModifiedDT) {
9119 SunkAddrs.clear();
9120 bool MadeChange = false;
9121
9122 do {
9123 CurInstIterator = BB.begin();
9124 ModifiedDT = ModifyDT::NotModifyDT;
9125 while (CurInstIterator != BB.end()) {
9126 MadeChange |= optimizeInst(I: &*CurInstIterator++, ModifiedDT);
9127 if (ModifiedDT != ModifyDT::NotModifyDT) {
9128 // For huge function we tend to quickly go though the inner optmization
9129 // opportunities in the BB. So we go back to the BB head to re-optimize
9130 // each instruction instead of go back to the function head.
9131 if (IsHugeFunc)
9132 break;
9133 return true;
9134 }
9135 }
9136 } while (ModifiedDT == ModifyDT::ModifyInstDT);
9137
9138 bool MadeBitReverse = true;
9139 while (MadeBitReverse) {
9140 MadeBitReverse = false;
9141 for (auto &I : reverse(C&: BB)) {
9142 if (makeBitReverse(I)) {
9143 MadeBitReverse = MadeChange = true;
9144 break;
9145 }
9146 }
9147 }
9148 MadeChange |= dupRetToEnableTailCallOpts(BB: &BB, ModifiedDT);
9149
9150 return MadeChange;
9151}
9152
9153bool CodeGenPrepare::fixupDbgVariableRecordsOnInst(Instruction &I) {
9154 bool AnyChange = false;
9155 for (DbgVariableRecord &DVR : filterDbgVars(R: I.getDbgRecordRange()))
9156 AnyChange |= fixupDbgVariableRecord(I&: DVR);
9157 return AnyChange;
9158}
9159
9160// FIXME: should updating debug-info really cause the "changed" flag to fire,
9161// which can cause a function to be reprocessed?
9162bool CodeGenPrepare::fixupDbgVariableRecord(DbgVariableRecord &DVR) {
9163 if (DVR.Type != DbgVariableRecord::LocationType::Value &&
9164 DVR.Type != DbgVariableRecord::LocationType::Assign)
9165 return false;
9166
9167 // Does this DbgVariableRecord refer to a sunk address calculation?
9168 bool AnyChange = false;
9169 SmallDenseSet<Value *> LocationOps(DVR.location_ops().begin(),
9170 DVR.location_ops().end());
9171 for (Value *Location : LocationOps) {
9172 WeakTrackingVH SunkAddrVH = SunkAddrs[Location];
9173 Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
9174 if (SunkAddr) {
9175 // Point dbg.value at locally computed address, which should give the best
9176 // opportunity to be accurately lowered. This update may change the type
9177 // of pointer being referred to; however this makes no difference to
9178 // debugging information, and we can't generate bitcasts that may affect
9179 // codegen.
9180 DVR.replaceVariableLocationOp(OldValue: Location, NewValue: SunkAddr);
9181 AnyChange = true;
9182 }
9183 }
9184 return AnyChange;
9185}
9186
9187static void DbgInserterHelper(DbgVariableRecord *DVR, BasicBlock::iterator VI) {
9188 DVR->removeFromParent();
9189 BasicBlock *VIBB = VI->getParent();
9190 if (isa<PHINode>(Val: VI))
9191 VIBB->insertDbgRecordBefore(DR: DVR, Here: VIBB->getFirstInsertionPt());
9192 else
9193 VIBB->insertDbgRecordAfter(DR: DVR, I: &*VI);
9194}
9195
9196// A llvm.dbg.value may be using a value before its definition, due to
9197// optimizations in this pass and others. Scan for such dbg.values, and rescue
9198// them by moving the dbg.value to immediately after the value definition.
9199// FIXME: Ideally this should never be necessary, and this has the potential
9200// to re-order dbg.value intrinsics.
9201bool CodeGenPrepare::placeDbgValues(Function &F) {
9202 bool MadeChange = false;
9203 DominatorTree &DT = getDT();
9204
9205 auto DbgProcessor = [&](auto *DbgItem, Instruction *Position) {
9206 SmallVector<Instruction *, 4> VIs;
9207 for (Value *V : DbgItem->location_ops())
9208 if (Instruction *VI = dyn_cast_or_null<Instruction>(Val: V))
9209 VIs.push_back(Elt: VI);
9210
9211 // This item may depend on multiple instructions, complicating any
9212 // potential sink. This block takes the defensive approach, opting to
9213 // "undef" the item if it has more than one instruction and any of them do
9214 // not dominate iem.
9215 for (Instruction *VI : VIs) {
9216 if (VI->isTerminator())
9217 continue;
9218
9219 // If VI is a phi in a block with an EHPad terminator, we can't insert
9220 // after it.
9221 if (isa<PHINode>(Val: VI) && VI->getParent()->getTerminator()->isEHPad())
9222 continue;
9223
9224 // If the defining instruction dominates the dbg.value, we do not need
9225 // to move the dbg.value.
9226 if (DT.dominates(Def: VI, User: Position))
9227 continue;
9228
9229 // If we depend on multiple instructions and any of them doesn't
9230 // dominate this DVI, we probably can't salvage it: moving it to
9231 // after any of the instructions could cause us to lose the others.
9232 if (VIs.size() > 1) {
9233 LLVM_DEBUG(
9234 dbgs()
9235 << "Unable to find valid location for Debug Value, undefing:\n"
9236 << *DbgItem);
9237 DbgItem->setKillLocation();
9238 break;
9239 }
9240
9241 LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n"
9242 << *DbgItem << ' ' << *VI);
9243 DbgInserterHelper(DbgItem, VI->getIterator());
9244 MadeChange = true;
9245 ++NumDbgValueMoved;
9246 }
9247 };
9248
9249 for (BasicBlock &BB : F) {
9250 for (Instruction &Insn : llvm::make_early_inc_range(Range&: BB)) {
9251 // Process any DbgVariableRecord records attached to this
9252 // instruction.
9253 for (DbgVariableRecord &DVR : llvm::make_early_inc_range(
9254 Range: filterDbgVars(R: Insn.getDbgRecordRange()))) {
9255 if (DVR.Type != DbgVariableRecord::LocationType::Value)
9256 continue;
9257 DbgProcessor(&DVR, &Insn);
9258 }
9259 }
9260 }
9261
9262 return MadeChange;
9263}
9264
9265// Group scattered pseudo probes in a block to favor SelectionDAG. Scattered
9266// probes can be chained dependencies of other regular DAG nodes and block DAG
9267// combine optimizations.
9268bool CodeGenPrepare::placePseudoProbes(Function &F) {
9269 bool MadeChange = false;
9270 for (auto &Block : F) {
9271 // Move the rest probes to the beginning of the block.
9272 auto FirstInst = Block.getFirstInsertionPt();
9273 while (FirstInst != Block.end() && FirstInst->isDebugOrPseudoInst())
9274 ++FirstInst;
9275 BasicBlock::iterator I(FirstInst);
9276 I++;
9277 while (I != Block.end()) {
9278 if (auto *II = dyn_cast<PseudoProbeInst>(Val: I++)) {
9279 II->moveBefore(InsertPos: FirstInst);
9280 MadeChange = true;
9281 }
9282 }
9283 }
9284 return MadeChange;
9285}
9286
9287/// Some targets prefer to split a conditional branch like:
9288/// \code
9289/// %0 = icmp ne i32 %a, 0
9290/// %1 = icmp ne i32 %b, 0
9291/// %or.cond = or i1 %0, %1
9292/// br i1 %or.cond, label %TrueBB, label %FalseBB
9293/// \endcode
9294/// into multiple branch instructions like:
9295/// \code
9296/// bb1:
9297/// %0 = icmp ne i32 %a, 0
9298/// br i1 %0, label %TrueBB, label %bb2
9299/// bb2:
9300/// %1 = icmp ne i32 %b, 0
9301/// br i1 %1, label %TrueBB, label %FalseBB
9302/// \endcode
9303/// This usually allows instruction selection to do even further optimizations
9304/// and combine the compare with the branch instruction. Currently this is
9305/// applied for targets which have "cheap" jump instructions.
9306///
9307/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
9308///
9309bool CodeGenPrepare::splitBranchCondition(Function &F) {
9310 if (!TM->Options.EnableFastISel || TLI->isJumpExpensive())
9311 return false;
9312
9313 bool MadeChange = false;
9314 for (auto &BB : F) {
9315 // Does this BB end with the following?
9316 // %cond1 = icmp|fcmp|binary instruction ...
9317 // %cond2 = icmp|fcmp|binary instruction ...
9318 // %cond.or = or|and i1 %cond1, cond2
9319 // br i1 %cond.or label %dest1, label %dest2"
9320 Instruction *LogicOp;
9321 BasicBlock *TBB, *FBB;
9322 if (!match(V: BB.getTerminator(),
9323 P: m_Br(C: m_OneUse(SubPattern: m_Instruction(I&: LogicOp)), T&: TBB, F&: FBB)))
9324 continue;
9325
9326 auto *Br1 = cast<CondBrInst>(Val: BB.getTerminator());
9327 if (Br1->getMetadata(KindID: LLVMContext::MD_unpredictable))
9328 continue;
9329
9330 // The merging of mostly empty BB can cause a degenerate branch.
9331 if (TBB == FBB)
9332 continue;
9333
9334 unsigned Opc;
9335 Value *Cond1, *Cond2;
9336 if (match(V: LogicOp,
9337 P: m_LogicalAnd(L: m_OneUse(SubPattern: m_Value(V&: Cond1)), R: m_OneUse(SubPattern: m_Value(V&: Cond2)))))
9338 Opc = Instruction::And;
9339 else if (match(V: LogicOp, P: m_LogicalOr(L: m_OneUse(SubPattern: m_Value(V&: Cond1)),
9340 R: m_OneUse(SubPattern: m_Value(V&: Cond2)))))
9341 Opc = Instruction::Or;
9342 else
9343 continue;
9344
9345 auto IsGoodCond = [](Value *Cond) {
9346 return match(
9347 V: Cond,
9348 P: m_CombineOr(Ps: m_Cmp(), Ps: m_CombineOr(Ps: m_LogicalAnd(L: m_Value(), R: m_Value()),
9349 Ps: m_LogicalOr(L: m_Value(), R: m_Value()))));
9350 };
9351 if (!IsGoodCond(Cond1) || !IsGoodCond(Cond2))
9352 continue;
9353
9354 LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
9355
9356 // Create a new BB.
9357 auto *TmpBB =
9358 BasicBlock::Create(Context&: BB.getContext(), Name: BB.getName() + ".cond.split",
9359 Parent: BB.getParent(), InsertBefore: BB.getNextNode());
9360 if (IsHugeFunc)
9361 FreshBBs.insert(Ptr: TmpBB);
9362
9363 // Update original basic block by using the first condition directly by the
9364 // branch instruction and removing the no longer needed and/or instruction.
9365 Br1->setCondition(Cond1);
9366 LogicOp->eraseFromParent();
9367
9368 // Depending on the condition we have to either replace the true or the
9369 // false successor of the original branch instruction.
9370 if (Opc == Instruction::And)
9371 Br1->setSuccessor(idx: 0, NewSucc: TmpBB);
9372 else
9373 Br1->setSuccessor(idx: 1, NewSucc: TmpBB);
9374
9375 // Fill in the new basic block.
9376 auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond: Cond2, True: TBB, False: FBB);
9377 if (auto *I = dyn_cast<Instruction>(Val: Cond2)) {
9378 I->removeFromParent();
9379 I->insertBefore(InsertPos: Br2->getIterator());
9380 }
9381
9382 // Update PHI nodes in both successors. The original BB needs to be
9383 // replaced in one successor's PHI nodes, because the branch comes now from
9384 // the newly generated BB (NewBB). In the other successor we need to add one
9385 // incoming edge to the PHI nodes, because both branch instructions target
9386 // now the same successor. Depending on the original branch condition
9387 // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
9388 // we perform the correct update for the PHI nodes.
9389 // This doesn't change the successor order of the just created branch
9390 // instruction (or any other instruction).
9391 if (Opc == Instruction::Or)
9392 std::swap(a&: TBB, b&: FBB);
9393
9394 // Replace the old BB with the new BB.
9395 TBB->replacePhiUsesWith(Old: &BB, New: TmpBB);
9396
9397 // Add another incoming edge from the new BB.
9398 for (PHINode &PN : FBB->phis()) {
9399 auto *Val = PN.getIncomingValueForBlock(BB: &BB);
9400 PN.addIncoming(V: Val, BB: TmpBB);
9401 }
9402
9403 if (Loop *L = LI->getLoopFor(BB: &BB))
9404 L->addBasicBlockToLoop(NewBB: TmpBB, LI&: *LI);
9405
9406 // The edge we need to delete starts at BB and ends at whatever TBB ends
9407 // up pointing to.
9408 DTU->applyUpdates(Updates: {{DominatorTree::Insert, &BB, TmpBB},
9409 {DominatorTree::Insert, TmpBB, TBB},
9410 {DominatorTree::Insert, TmpBB, FBB},
9411 {DominatorTree::Delete, &BB, TBB}});
9412
9413 // Update the branch weights (from SelectionDAGBuilder::
9414 // FindMergedConditions).
9415 if (Opc == Instruction::Or) {
9416 // Codegen X | Y as:
9417 // BB1:
9418 // jmp_if_X TBB
9419 // jmp TmpBB
9420 // TmpBB:
9421 // jmp_if_Y TBB
9422 // jmp FBB
9423 //
9424
9425 // We have flexibility in setting Prob for BB1 and Prob for NewBB.
9426 // The requirement is that
9427 // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
9428 // = TrueProb for original BB.
9429 // Assuming the original weights are A and B, one choice is to set BB1's
9430 // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
9431 // assumes that
9432 // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
9433 // Another choice is to assume TrueProb for BB1 equals to TrueProb for
9434 // TmpBB, but the math is more complicated.
9435 uint64_t TrueWeight, FalseWeight;
9436 if (extractBranchWeights(I: *Br1, TrueVal&: TrueWeight, FalseVal&: FalseWeight)) {
9437 uint64_t NewTrueWeight = TrueWeight;
9438 uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
9439 setFittedBranchWeights(I&: *Br1, Weights: {NewTrueWeight, NewFalseWeight},
9440 IsExpected: hasBranchWeightOrigin(I: *Br1));
9441
9442 NewTrueWeight = TrueWeight;
9443 NewFalseWeight = 2 * FalseWeight;
9444 setFittedBranchWeights(I&: *Br2, Weights: {NewTrueWeight, NewFalseWeight},
9445 /*IsExpected=*/false);
9446 }
9447 } else {
9448 // Codegen X & Y as:
9449 // BB1:
9450 // jmp_if_X TmpBB
9451 // jmp FBB
9452 // TmpBB:
9453 // jmp_if_Y TBB
9454 // jmp FBB
9455 //
9456 // This requires creation of TmpBB after CurBB.
9457
9458 // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
9459 // The requirement is that
9460 // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
9461 // = FalseProb for original BB.
9462 // Assuming the original weights are A and B, one choice is to set BB1's
9463 // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
9464 // assumes that
9465 // FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
9466 uint64_t TrueWeight, FalseWeight;
9467 if (extractBranchWeights(I: *Br1, TrueVal&: TrueWeight, FalseVal&: FalseWeight)) {
9468 uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
9469 uint64_t NewFalseWeight = FalseWeight;
9470 setFittedBranchWeights(I&: *Br1, Weights: {NewTrueWeight, NewFalseWeight},
9471 /*IsExpected=*/false);
9472
9473 NewTrueWeight = 2 * TrueWeight;
9474 NewFalseWeight = FalseWeight;
9475 setFittedBranchWeights(I&: *Br2, Weights: {NewTrueWeight, NewFalseWeight},
9476 /*IsExpected=*/false);
9477 }
9478 }
9479
9480 MadeChange = true;
9481
9482 LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
9483 TmpBB->dump());
9484 }
9485 return MadeChange;
9486}
9487