1//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass performs a simple dominator tree walk that eliminates trivially
10// redundant instructions.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Transforms/Scalar/EarlyCSE.h"
15#include "llvm/ADT/DenseMapInfo.h"
16#include "llvm/ADT/Hashing.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopedHashTable.h"
19#include "llvm/ADT/SmallVector.h"
20#include "llvm/ADT/Statistic.h"
21#include "llvm/Analysis/AssumptionCache.h"
22#include "llvm/Analysis/GlobalsModRef.h"
23#include "llvm/Analysis/GuardUtils.h"
24#include "llvm/Analysis/InstructionSimplify.h"
25#include "llvm/Analysis/MemorySSA.h"
26#include "llvm/Analysis/MemorySSAUpdater.h"
27#include "llvm/Analysis/TargetLibraryInfo.h"
28#include "llvm/Analysis/TargetTransformInfo.h"
29#include "llvm/Analysis/ValueTracking.h"
30#include "llvm/IR/BasicBlock.h"
31#include "llvm/IR/Constants.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/InstrTypes.h"
35#include "llvm/IR/Instruction.h"
36#include "llvm/IR/Instructions.h"
37#include "llvm/IR/IntrinsicInst.h"
38#include "llvm/IR/LLVMContext.h"
39#include "llvm/IR/PassManager.h"
40#include "llvm/IR/PatternMatch.h"
41#include "llvm/IR/Type.h"
42#include "llvm/IR/Value.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Support/Allocator.h"
46#include "llvm/Support/AtomicOrdering.h"
47#include "llvm/Support/Casting.h"
48#include "llvm/Support/Debug.h"
49#include "llvm/Support/DebugCounter.h"
50#include "llvm/Support/RecyclingAllocator.h"
51#include "llvm/Support/raw_ostream.h"
52#include "llvm/Transforms/Scalar.h"
53#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
54#include "llvm/Transforms/Utils/Local.h"
55#include <cassert>
56#include <deque>
57#include <memory>
58#include <utility>
59
60using namespace llvm;
61using namespace llvm::PatternMatch;
62
63#define DEBUG_TYPE "early-cse"
64
65STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
66STATISTIC(NumCSE, "Number of instructions CSE'd");
67STATISTIC(NumCSECVP, "Number of compare instructions CVP'd");
68STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
69STATISTIC(NumCSECall, "Number of call instructions CSE'd");
70STATISTIC(NumCSEGEP, "Number of GEP instructions CSE'd");
71STATISTIC(NumDSE, "Number of trivial dead stores removed");
72
73DEBUG_COUNTER(CSECounter, "early-cse",
74 "Controls which instructions are removed");
75
76static cl::opt<unsigned> EarlyCSEMssaOptCap(
77 "earlycse-mssa-optimization-cap", cl::init(Val: 500), cl::Hidden,
78 cl::desc("Enable imprecision in EarlyCSE in pathological cases, in exchange "
79 "for faster compile. Caps the MemorySSA clobbering calls."));
80
81static cl::opt<bool> EarlyCSEDebugHash(
82 "earlycse-debug-hash", cl::init(Val: false), cl::Hidden,
83 cl::desc("Perform extra assertion checking to verify that SimpleValue's hash "
84 "function is well-behaved w.r.t. its isEqual predicate"));
85
86//===----------------------------------------------------------------------===//
87// SimpleValue
88//===----------------------------------------------------------------------===//
89
90namespace {
91
92/// Struct representing the available values in the scoped hash table.
93struct SimpleValue {
94 Instruction *Inst;
95
96 SimpleValue(Instruction *I) : Inst(I) {
97 assert(canHandle(I) && "Inst can't be handled!");
98 }
99
100 static bool canHandle(Instruction *Inst) {
101 // This can only handle non-void readnone functions.
102 // Also handled are constrained intrinsic that look like the types
103 // of instruction handled below (UnaryOperator, etc.).
104 if (CallInst *CI = dyn_cast<CallInst>(Val: Inst)) {
105 if (Function *F = CI->getCalledFunction()) {
106 switch (F->getIntrinsicID()) {
107 case Intrinsic::experimental_constrained_fadd:
108 case Intrinsic::experimental_constrained_fsub:
109 case Intrinsic::experimental_constrained_fmul:
110 case Intrinsic::experimental_constrained_fdiv:
111 case Intrinsic::experimental_constrained_frem:
112 case Intrinsic::experimental_constrained_fptosi:
113 case Intrinsic::experimental_constrained_sitofp:
114 case Intrinsic::experimental_constrained_fptoui:
115 case Intrinsic::experimental_constrained_uitofp:
116 case Intrinsic::experimental_constrained_fcmp:
117 case Intrinsic::experimental_constrained_fcmps: {
118 auto *CFP = cast<ConstrainedFPIntrinsic>(Val: CI);
119 if (CFP->getExceptionBehavior() &&
120 CFP->getExceptionBehavior() == fp::ebStrict)
121 return false;
122 // Since we CSE across function calls we must not allow
123 // the rounding mode to change.
124 if (CFP->getRoundingMode() &&
125 CFP->getRoundingMode() == RoundingMode::Dynamic)
126 return false;
127 return true;
128 }
129 }
130 }
131 return CI->doesNotAccessMemory() &&
132 // FIXME: Currently the calls which may access the thread id may
133 // be considered as not accessing the memory. But this is
134 // problematic for coroutines, since coroutines may resume in a
135 // different thread. So we disable the optimization here for the
136 // correctness. However, it may block many other correct
137 // optimizations. Revert this one when we detect the memory
138 // accessing kind more precisely.
139 !CI->getFunction()->isPresplitCoroutine();
140 }
141 return isa<CastInst>(Val: Inst) || isa<UnaryOperator>(Val: Inst) ||
142 isa<BinaryOperator>(Val: Inst) || isa<CmpInst>(Val: Inst) ||
143 isa<SelectInst>(Val: Inst) || isa<ExtractElementInst>(Val: Inst) ||
144 isa<InsertElementInst>(Val: Inst) || isa<ShuffleVectorInst>(Val: Inst) ||
145 isa<ExtractValueInst>(Val: Inst) || isa<InsertValueInst>(Val: Inst) ||
146 isa<FreezeInst>(Val: Inst);
147 }
148};
149
150} // end anonymous namespace
151
152template <> struct llvm::DenseMapInfo<SimpleValue> {
153 static unsigned getHashValue(SimpleValue Val);
154 static bool isEqual(SimpleValue LHS, SimpleValue RHS);
155};
156
157/// Match a 'select' including an optional 'not's of the condition.
158static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A,
159 Value *&B,
160 SelectPatternFlavor &Flavor) {
161 // Return false if V is not even a select.
162 if (!match(V, P: m_Select(C: m_Value(V&: Cond), L: m_Value(V&: A), R: m_Value(V&: B))))
163 return false;
164
165 // Look through a 'not' of the condition operand by swapping A/B.
166 Value *CondNot;
167 if (match(V: Cond, P: m_Not(V: m_Value(V&: CondNot)))) {
168 Cond = CondNot;
169 std::swap(a&: A, b&: B);
170 }
171
172 // Match canonical forms of min/max. We are not using ValueTracking's
173 // more powerful matchSelectPattern() because it may rely on instruction flags
174 // such as "nsw". That would be incompatible with the current hashing
175 // mechanism that may remove flags to increase the likelihood of CSE.
176
177 Flavor = SPF_UNKNOWN;
178 CmpPredicate Pred;
179
180 if (!match(V: Cond, P: m_ICmp(Pred, L: m_Specific(V: A), R: m_Specific(V: B)))) {
181 // Check for commuted variants of min/max by swapping predicate.
182 // If we do not match the standard or commuted patterns, this is not a
183 // recognized form of min/max, but it is still a select, so return true.
184 if (!match(V: Cond, P: m_ICmp(Pred, L: m_Specific(V: B), R: m_Specific(V: A))))
185 return true;
186 Pred = ICmpInst::getSwappedPredicate(pred: Pred);
187 }
188
189 switch (Pred) {
190 case CmpInst::ICMP_UGT: Flavor = SPF_UMAX; break;
191 case CmpInst::ICMP_ULT: Flavor = SPF_UMIN; break;
192 case CmpInst::ICMP_SGT: Flavor = SPF_SMAX; break;
193 case CmpInst::ICMP_SLT: Flavor = SPF_SMIN; break;
194 // Non-strict inequalities.
195 case CmpInst::ICMP_ULE: Flavor = SPF_UMIN; break;
196 case CmpInst::ICMP_UGE: Flavor = SPF_UMAX; break;
197 case CmpInst::ICMP_SLE: Flavor = SPF_SMIN; break;
198 case CmpInst::ICMP_SGE: Flavor = SPF_SMAX; break;
199 default: break;
200 }
201
202 return true;
203}
204
205static unsigned hashCallInst(CallInst *CI) {
206 // Don't CSE convergent calls in different basic blocks, because they
207 // implicitly depend on the set of threads that is currently executing.
208 if (CI->isConvergent()) {
209 return hash_combine(args: CI->getOpcode(), args: CI->getParent(),
210 args: hash_combine_range(R: CI->operand_values()));
211 }
212 return hash_combine(args: CI->getOpcode(),
213 args: hash_combine_range(R: CI->operand_values()));
214}
215
216static unsigned getHashValueImpl(SimpleValue Val) {
217 Instruction *Inst = Val.Inst;
218 // Hash in all of the operands as pointers.
219 if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Val: Inst)) {
220 Value *LHS = BinOp->getOperand(i_nocapture: 0);
221 Value *RHS = BinOp->getOperand(i_nocapture: 1);
222 if (BinOp->isCommutative() && BinOp->getOperand(i_nocapture: 0) > BinOp->getOperand(i_nocapture: 1))
223 std::swap(a&: LHS, b&: RHS);
224
225 return hash_combine(args: BinOp->getOpcode(), args: LHS, args: RHS);
226 }
227
228 if (CmpInst *CI = dyn_cast<CmpInst>(Val: Inst)) {
229 // Compares can be commuted by swapping the comparands and
230 // updating the predicate. Choose the form that has the
231 // comparands in sorted order, or in the case of a tie, the
232 // one with the lower predicate.
233 Value *LHS = CI->getOperand(i_nocapture: 0);
234 Value *RHS = CI->getOperand(i_nocapture: 1);
235 CmpInst::Predicate Pred = CI->getPredicate();
236 CmpInst::Predicate SwappedPred = CI->getSwappedPredicate();
237 if (std::tie(args&: LHS, args&: Pred) > std::tie(args&: RHS, args&: SwappedPred)) {
238 std::swap(a&: LHS, b&: RHS);
239 Pred = SwappedPred;
240 }
241 return hash_combine(args: Inst->getOpcode(), args: Pred, args: LHS, args: RHS);
242 }
243
244 // Hash general selects to allow matching commuted true/false operands.
245 SelectPatternFlavor SPF;
246 Value *Cond, *A, *B;
247 if (matchSelectWithOptionalNotCond(V: Inst, Cond, A, B, Flavor&: SPF)) {
248 // Hash min/max (cmp + select) to allow for commuted operands.
249 // Min/max may also have non-canonical compare predicate (eg, the compare for
250 // smin may use 'sgt' rather than 'slt'), and non-canonical operands in the
251 // compare.
252 // TODO: We should also detect FP min/max.
253 if (SPF == SPF_SMIN || SPF == SPF_SMAX ||
254 SPF == SPF_UMIN || SPF == SPF_UMAX) {
255 if (A > B)
256 std::swap(a&: A, b&: B);
257 return hash_combine(args: Inst->getOpcode(), args: SPF, args: A, args: B);
258 }
259
260 // Hash general selects to allow matching commuted true/false operands.
261
262 // If we do not have a compare as the condition, just hash in the condition.
263 CmpPredicate Pred;
264 Value *X, *Y;
265 if (!match(V: Cond, P: m_Cmp(Pred, L: m_Value(V&: X), R: m_Value(V&: Y))))
266 return hash_combine(args: Inst->getOpcode(), args: Cond, args: A, args: B);
267
268 // Similar to cmp normalization (above) - canonicalize the predicate value:
269 // select (icmp Pred, X, Y), A, B --> select (icmp InvPred, X, Y), B, A
270 if (CmpInst::getInversePredicate(pred: Pred) < Pred) {
271 Pred = CmpInst::getInversePredicate(pred: Pred);
272 std::swap(a&: A, b&: B);
273 }
274 return hash_combine(args: Inst->getOpcode(),
275 args: static_cast<CmpInst::Predicate>(Pred), args: X, args: Y, args: A, args: B);
276 }
277
278 if (CastInst *CI = dyn_cast<CastInst>(Val: Inst))
279 return hash_combine(args: CI->getOpcode(), args: CI->getType(), args: CI->getOperand(i_nocapture: 0));
280
281 if (FreezeInst *FI = dyn_cast<FreezeInst>(Val: Inst))
282 return hash_combine(args: FI->getOpcode(), args: FI->getOperand(i_nocapture: 0));
283
284 if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Val: Inst))
285 return hash_combine(args: EVI->getOpcode(), args: EVI->getOperand(i_nocapture: 0),
286 args: hash_combine_range(R: EVI->indices()));
287
288 if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Val: Inst))
289 return hash_combine(args: IVI->getOpcode(), args: IVI->getOperand(i_nocapture: 0),
290 args: IVI->getOperand(i_nocapture: 1), args: hash_combine_range(R: IVI->indices()));
291
292 assert((isa<CallInst>(Inst) || isa<ExtractElementInst>(Inst) ||
293 isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
294 isa<UnaryOperator>(Inst) || isa<FreezeInst>(Inst)) &&
295 "Invalid/unknown instruction");
296
297 // Handle intrinsics with commutative operands.
298 auto *II = dyn_cast<IntrinsicInst>(Val: Inst);
299 if (II && II->isCommutative() && II->arg_size() >= 2) {
300 Value *LHS = II->getArgOperand(i: 0), *RHS = II->getArgOperand(i: 1);
301 if (LHS > RHS)
302 std::swap(a&: LHS, b&: RHS);
303 return hash_combine(
304 args: II->getOpcode(), args: LHS, args: RHS,
305 args: hash_combine_range(R: drop_begin(RangeOrContainer: II->operand_values(), N: 2)));
306 }
307
308 // gc.relocate is 'special' call: its second and third operands are
309 // not real values, but indices into statepoint's argument list.
310 // Get values they point to.
311 if (const GCRelocateInst *GCR = dyn_cast<GCRelocateInst>(Val: Inst))
312 return hash_combine(args: GCR->getOpcode(), args: GCR->getOperand(i_nocapture: 0),
313 args: GCR->getBasePtr(), args: GCR->getDerivedPtr());
314
315 // Don't CSE convergent calls in different basic blocks, because they
316 // implicitly depend on the set of threads that is currently executing.
317 if (CallInst *CI = dyn_cast<CallInst>(Val: Inst))
318 return hashCallInst(CI);
319
320 // Mix in the opcode.
321 return hash_combine(args: Inst->getOpcode(),
322 args: hash_combine_range(R: Inst->operand_values()));
323}
324
325unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
326#ifndef NDEBUG
327 // If -earlycse-debug-hash was specified, return a constant -- this
328 // will force all hashing to collide, so we'll exhaustively search
329 // the table for a match, and the assertion in isEqual will fire if
330 // there's a bug causing equal keys to hash differently.
331 if (EarlyCSEDebugHash)
332 return 0;
333#endif
334 return getHashValueImpl(Val);
335}
336
337static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
338 Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
339
340 if (LHSI->getOpcode() != RHSI->getOpcode())
341 return false;
342 if (LHSI->isIdenticalToWhenDefined(I: RHSI, /*IntersectAttrs=*/true)) {
343 // Convergent calls implicitly depend on the set of threads that is
344 // currently executing, so conservatively return false if they are in
345 // different basic blocks.
346 if (CallInst *CI = dyn_cast<CallInst>(Val: LHSI);
347 CI && CI->isConvergent() && LHSI->getParent() != RHSI->getParent())
348 return false;
349
350 return true;
351 }
352
353 // If we're not strictly identical, we still might be a commutable instruction
354 if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(Val: LHSI)) {
355 if (!LHSBinOp->isCommutative())
356 return false;
357
358 assert(isa<BinaryOperator>(RHSI) &&
359 "same opcode, but different instruction type?");
360 BinaryOperator *RHSBinOp = cast<BinaryOperator>(Val: RHSI);
361
362 // Commuted equality
363 return LHSBinOp->getOperand(i_nocapture: 0) == RHSBinOp->getOperand(i_nocapture: 1) &&
364 LHSBinOp->getOperand(i_nocapture: 1) == RHSBinOp->getOperand(i_nocapture: 0);
365 }
366 if (CmpInst *LHSCmp = dyn_cast<CmpInst>(Val: LHSI)) {
367 assert(isa<CmpInst>(RHSI) &&
368 "same opcode, but different instruction type?");
369 CmpInst *RHSCmp = cast<CmpInst>(Val: RHSI);
370 // Commuted equality
371 return LHSCmp->getOperand(i_nocapture: 0) == RHSCmp->getOperand(i_nocapture: 1) &&
372 LHSCmp->getOperand(i_nocapture: 1) == RHSCmp->getOperand(i_nocapture: 0) &&
373 LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
374 }
375
376 auto *LII = dyn_cast<IntrinsicInst>(Val: LHSI);
377 auto *RII = dyn_cast<IntrinsicInst>(Val: RHSI);
378 if (LII && RII && LII->getIntrinsicID() == RII->getIntrinsicID() &&
379 LII->isCommutative() && LII->arg_size() >= 2) {
380 return LII->getArgOperand(i: 0) == RII->getArgOperand(i: 1) &&
381 LII->getArgOperand(i: 1) == RII->getArgOperand(i: 0) &&
382 std::equal(first1: LII->arg_begin() + 2, last1: LII->arg_end(),
383 first2: RII->arg_begin() + 2, last2: RII->arg_end()) &&
384 LII->hasSameSpecialState(I2: RII, /*IgnoreAlignment=*/false,
385 /*IntersectAttrs=*/true);
386 }
387
388 // See comment above in `getHashValue()`.
389 if (const GCRelocateInst *GCR1 = dyn_cast<GCRelocateInst>(Val: LHSI))
390 if (const GCRelocateInst *GCR2 = dyn_cast<GCRelocateInst>(Val: RHSI))
391 return GCR1->getOperand(i_nocapture: 0) == GCR2->getOperand(i_nocapture: 0) &&
392 GCR1->getBasePtr() == GCR2->getBasePtr() &&
393 GCR1->getDerivedPtr() == GCR2->getDerivedPtr();
394
395 // Min/max can occur with commuted operands, non-canonical predicates,
396 // and/or non-canonical operands.
397 // Selects can be non-trivially equivalent via inverted conditions and swaps.
398 SelectPatternFlavor LSPF, RSPF;
399 Value *CondL, *CondR, *LHSA, *RHSA, *LHSB, *RHSB;
400 if (matchSelectWithOptionalNotCond(V: LHSI, Cond&: CondL, A&: LHSA, B&: LHSB, Flavor&: LSPF) &&
401 matchSelectWithOptionalNotCond(V: RHSI, Cond&: CondR, A&: RHSA, B&: RHSB, Flavor&: RSPF)) {
402 if (LSPF == RSPF) {
403 // TODO: We should also detect FP min/max.
404 if (LSPF == SPF_SMIN || LSPF == SPF_SMAX ||
405 LSPF == SPF_UMIN || LSPF == SPF_UMAX)
406 return ((LHSA == RHSA && LHSB == RHSB) ||
407 (LHSA == RHSB && LHSB == RHSA));
408
409 // select Cond, A, B <--> select not(Cond), B, A
410 if (CondL == CondR && LHSA == RHSA && LHSB == RHSB)
411 return true;
412 }
413
414 // If the true/false operands are swapped and the conditions are compares
415 // with inverted predicates, the selects are equal:
416 // select (icmp Pred, X, Y), A, B <--> select (icmp InvPred, X, Y), B, A
417 //
418 // This also handles patterns with a double-negation in the sense of not +
419 // inverse, because we looked through a 'not' in the matching function and
420 // swapped A/B:
421 // select (cmp Pred, X, Y), A, B <--> select (not (cmp InvPred, X, Y)), B, A
422 //
423 // This intentionally does NOT handle patterns with a double-negation in
424 // the sense of not + not, because doing so could result in values
425 // comparing
426 // as equal that hash differently in the min/max cases like:
427 // select (cmp slt, X, Y), X, Y <--> select (not (not (cmp slt, X, Y))), X, Y
428 // ^ hashes as min ^ would not hash as min
429 // In the context of the EarlyCSE pass, however, such cases never reach
430 // this code, as we simplify the double-negation before hashing the second
431 // select (and so still succeed at CSEing them).
432 if (LHSA == RHSB && LHSB == RHSA) {
433 CmpPredicate PredL, PredR;
434 Value *X, *Y;
435 if (match(V: CondL, P: m_Cmp(Pred&: PredL, L: m_Value(V&: X), R: m_Value(V&: Y))) &&
436 match(V: CondR, P: m_Cmp(Pred&: PredR, L: m_Specific(V: X), R: m_Specific(V: Y))) &&
437 CmpInst::getInversePredicate(pred: PredL) == PredR)
438 return true;
439 }
440 }
441
442 return false;
443}
444
445bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
446 // These comparisons are nontrivial, so assert that equality implies
447 // hash equality (DenseMap demands this as an invariant).
448 bool Result = isEqualImpl(LHS, RHS);
449 assert(!Result || getHashValueImpl(LHS) == getHashValueImpl(RHS));
450 return Result;
451}
452
453//===----------------------------------------------------------------------===//
454// CallValue
455//===----------------------------------------------------------------------===//
456
457namespace {
458
459/// Struct representing the available call values in the scoped hash
460/// table.
461struct CallValue {
462 Instruction *Inst;
463
464 CallValue(Instruction *I) : Inst(I) {
465 assert(canHandle(I) && "Inst can't be handled!");
466 }
467
468 static bool canHandle(Instruction *Inst) {
469 CallInst *CI = dyn_cast<CallInst>(Val: Inst);
470 if (!CI || (!CI->onlyReadsMemory() && !CI->onlyWritesMemory()) ||
471 // FIXME: Currently the calls which may access the thread id may
472 // be considered as not accessing the memory. But this is
473 // problematic for coroutines, since coroutines may resume in a
474 // different thread. So we disable the optimization here for the
475 // correctness. However, it may block many other correct
476 // optimizations. Revert this one when we detect the memory
477 // accessing kind more precisely.
478 CI->getFunction()->isPresplitCoroutine())
479 return false;
480 return true;
481 }
482};
483
484} // end anonymous namespace
485
486template <> struct llvm::DenseMapInfo<CallValue> {
487 static unsigned getHashValue(CallValue Val);
488 static bool isEqual(CallValue LHS, CallValue RHS);
489};
490
491unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
492 Instruction *Inst = Val.Inst;
493
494 // Hash all of the operands as pointers and mix in the opcode.
495 return hashCallInst(CI: cast<CallInst>(Val: Inst));
496}
497
498bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
499 CallInst *LHSI = cast<CallInst>(Val: LHS.Inst);
500 CallInst *RHSI = cast<CallInst>(Val: RHS.Inst);
501
502 // Convergent calls implicitly depend on the set of threads that is
503 // currently executing, so conservatively return false if they are in
504 // different basic blocks.
505 if (LHSI->isConvergent() && LHSI->getParent() != RHSI->getParent())
506 return false;
507
508 return LHSI->isIdenticalToWhenDefined(I: RHSI, /*IntersectAttrs=*/true);
509}
510
511//===----------------------------------------------------------------------===//
512// GEPValue
513//===----------------------------------------------------------------------===//
514
515namespace {
516
517struct GEPValue {
518 Instruction *Inst;
519 std::optional<int64_t> ConstantOffset;
520
521 GEPValue(Instruction *I) : Inst(I) {
522 assert(canHandle(I) && "Inst can't be handled!");
523 }
524
525 GEPValue(Instruction *I, std::optional<int64_t> ConstantOffset)
526 : Inst(I), ConstantOffset(ConstantOffset) {
527 assert(canHandle(I) && "Inst can't be handled!");
528 }
529
530 static bool canHandle(Instruction *Inst) {
531 return isa<GetElementPtrInst>(Val: Inst);
532 }
533};
534
535} // namespace
536
537template <> struct llvm::DenseMapInfo<GEPValue> {
538 static unsigned getHashValue(const GEPValue &Val);
539 static bool isEqual(const GEPValue &LHS, const GEPValue &RHS);
540};
541
542unsigned DenseMapInfo<GEPValue>::getHashValue(const GEPValue &Val) {
543 auto *GEP = cast<GetElementPtrInst>(Val: Val.Inst);
544 if (Val.ConstantOffset.has_value())
545 return hash_combine(args: GEP->getOpcode(), args: GEP->getPointerOperand(),
546 args: Val.ConstantOffset.value());
547 return hash_combine(args: GEP->getOpcode(),
548 args: hash_combine_range(R: GEP->operand_values()));
549}
550
551bool DenseMapInfo<GEPValue>::isEqual(const GEPValue &LHS, const GEPValue &RHS) {
552 auto *LGEP = cast<GetElementPtrInst>(Val: LHS.Inst);
553 auto *RGEP = cast<GetElementPtrInst>(Val: RHS.Inst);
554 if (LGEP->getPointerOperand() != RGEP->getPointerOperand())
555 return false;
556 if (LHS.ConstantOffset.has_value() && RHS.ConstantOffset.has_value())
557 return LHS.ConstantOffset.value() == RHS.ConstantOffset.value();
558 return LGEP->isIdenticalToWhenDefined(I: RGEP);
559}
560
561//===----------------------------------------------------------------------===//
562// EarlyCSE implementation
563//===----------------------------------------------------------------------===//
564
565namespace {
566
567/// A simple and fast domtree-based CSE pass.
568///
569/// This pass does a simple depth-first walk over the dominator tree,
570/// eliminating trivially redundant instructions and using instsimplify to
571/// canonicalize things as it goes. It is intended to be fast and catch obvious
572/// cases so that instcombine and other passes are more effective. It is
573/// expected that a later pass of GVN will catch the interesting/hard cases.
574class EarlyCSE {
575public:
576 const TargetLibraryInfo &TLI;
577 const TargetTransformInfo &TTI;
578 DominatorTree &DT;
579 AssumptionCache &AC;
580 const SimplifyQuery SQ;
581 MemorySSA *MSSA;
582 std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
583
584 using AllocatorTy =
585 RecyclingAllocator<BumpPtrAllocator,
586 ScopedHashTableVal<SimpleValue, Value *>>;
587 using ScopedHTType =
588 ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
589 AllocatorTy>;
590
591 /// A scoped hash table of the current values of all of our simple
592 /// scalar expressions.
593 ///
594 /// As we walk down the domtree, we look to see if instructions are in this:
595 /// if so, we replace them with what we find, otherwise we insert them so
596 /// that dominated values can succeed in their lookup.
597 ScopedHTType AvailableValues;
598
599 /// A scoped hash table of the current values of previously encountered
600 /// memory locations.
601 ///
602 /// This allows us to get efficient access to dominating loads or stores when
603 /// we have a fully redundant load. In addition to the most recent load, we
604 /// keep track of a generation count of the read, which is compared against
605 /// the current generation count. The current generation count is incremented
606 /// after every possibly writing memory operation, which ensures that we only
607 /// CSE loads with other loads that have no intervening store. Ordering
608 /// events (such as fences or atomic instructions) increment the generation
609 /// count as well; essentially, we model these as writes to all possible
610 /// locations. Note that atomic and/or volatile loads and stores can be
611 /// present the table; it is the responsibility of the consumer to inspect
612 /// the atomicity/volatility if needed.
613 struct LoadValue {
614 Instruction *DefInst = nullptr;
615 unsigned Generation = 0;
616 int MatchingId = -1;
617 bool IsAtomic = false;
618 bool IsLoad = false;
619
620 LoadValue() = default;
621 LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
622 bool IsAtomic, bool IsLoad)
623 : DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
624 IsAtomic(IsAtomic), IsLoad(IsLoad) {}
625 };
626
627 using LoadMapAllocator =
628 RecyclingAllocator<BumpPtrAllocator,
629 ScopedHashTableVal<Value *, LoadValue>>;
630 using LoadHTType =
631 ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
632 LoadMapAllocator>;
633
634 LoadHTType AvailableLoads;
635
636 // A scoped hash table mapping memory locations (represented as typed
637 // addresses) to generation numbers at which that memory location became
638 // (henceforth indefinitely) invariant.
639 using InvariantMapAllocator =
640 RecyclingAllocator<BumpPtrAllocator,
641 ScopedHashTableVal<MemoryLocation, unsigned>>;
642 using InvariantHTType =
643 ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
644 InvariantMapAllocator>;
645 InvariantHTType AvailableInvariants;
646
647 /// A scoped hash table of the current values of read-only call
648 /// values.
649 ///
650 /// It uses the same generation count as loads.
651 using CallHTType =
652 ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
653 CallHTType AvailableCalls;
654
655 using GEPMapAllocatorTy =
656 RecyclingAllocator<BumpPtrAllocator,
657 ScopedHashTableVal<GEPValue, Value *>>;
658 using GEPHTType = ScopedHashTable<GEPValue, Value *, DenseMapInfo<GEPValue>,
659 GEPMapAllocatorTy>;
660 GEPHTType AvailableGEPs;
661
662 /// This is the current generation of the memory value.
663 unsigned CurrentGeneration = 0;
664
665 /// Set up the EarlyCSE runner for a particular function.
666 EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
667 const TargetTransformInfo &TTI, DominatorTree &DT,
668 AssumptionCache &AC, MemorySSA *MSSA)
669 : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA),
670 MSSAUpdater(std::make_unique<MemorySSAUpdater>(args&: MSSA)) {}
671
672 bool run();
673
674private:
675 unsigned ClobberCounter = 0;
676 // Almost a POD, but needs to call the constructors for the scoped hash
677 // tables so that a new scope gets pushed on. These are RAII so that the
678 // scope gets popped when the NodeScope is destroyed.
679 class NodeScope {
680 public:
681 NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
682 InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
683 GEPHTType &AvailableGEPs)
684 : Scope(AvailableValues), LoadScope(AvailableLoads),
685 InvariantScope(AvailableInvariants), CallScope(AvailableCalls),
686 GEPScope(AvailableGEPs) {}
687 NodeScope(const NodeScope &) = delete;
688 NodeScope &operator=(const NodeScope &) = delete;
689
690 private:
691 ScopedHTType::ScopeTy Scope;
692 LoadHTType::ScopeTy LoadScope;
693 InvariantHTType::ScopeTy InvariantScope;
694 CallHTType::ScopeTy CallScope;
695 GEPHTType::ScopeTy GEPScope;
696 };
697
698 // Contains all the needed information to create a stack for doing a depth
699 // first traversal of the tree. This includes scopes for values, loads, and
700 // calls as well as the generation. There is a child iterator so that the
701 // children do not need to be store separately.
702 class StackNode {
703 public:
704 StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
705 InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
706 GEPHTType &AvailableGEPs, unsigned cg, DomTreeNode *n,
707 DomTreeNode::const_iterator child,
708 DomTreeNode::const_iterator end)
709 : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
710 EndIter(end),
711 Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
712 AvailableCalls, AvailableGEPs) {}
713 StackNode(const StackNode &) = delete;
714 StackNode &operator=(const StackNode &) = delete;
715
716 // Accessors.
717 unsigned currentGeneration() const { return CurrentGeneration; }
718 unsigned childGeneration() const { return ChildGeneration; }
719 void childGeneration(unsigned generation) { ChildGeneration = generation; }
720 DomTreeNode *node() { return Node; }
721 DomTreeNode::const_iterator childIter() const { return ChildIter; }
722
723 DomTreeNode *nextChild() {
724 DomTreeNode *child = *ChildIter;
725 ++ChildIter;
726 return child;
727 }
728
729 DomTreeNode::const_iterator end() const { return EndIter; }
730 bool isProcessed() const { return Processed; }
731 void process() { Processed = true; }
732
733 private:
734 unsigned CurrentGeneration;
735 unsigned ChildGeneration;
736 DomTreeNode *Node;
737 DomTreeNode::const_iterator ChildIter;
738 DomTreeNode::const_iterator EndIter;
739 NodeScope Scopes;
740 bool Processed = false;
741 };
742
743 /// Wrapper class to handle memory instructions, including loads,
744 /// stores and intrinsic loads and stores defined by the target.
745 class ParseMemoryInst {
746 public:
747 ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
748 : Inst(Inst) {
749 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: Inst)) {
750 IntrID = II->getIntrinsicID();
751 if (TTI.getTgtMemIntrinsic(Inst: II, Info))
752 return;
753 if (isHandledNonTargetIntrinsic(ID: IntrID)) {
754 switch (IntrID) {
755 case Intrinsic::masked_load:
756 Info.PtrVal = Inst->getOperand(i: 0);
757 Info.MatchingId = Intrinsic::masked_load;
758 Info.ReadMem = true;
759 Info.WriteMem = false;
760 Info.IsVolatile = false;
761 break;
762 case Intrinsic::masked_store:
763 Info.PtrVal = Inst->getOperand(i: 1);
764 // Use the ID of masked load as the "matching id". This will
765 // prevent matching non-masked loads/stores with masked ones
766 // (which could be done), but at the moment, the code here
767 // does not support matching intrinsics with non-intrinsics,
768 // so keep the MatchingIds specific to masked instructions
769 // for now (TODO).
770 Info.MatchingId = Intrinsic::masked_load;
771 Info.ReadMem = false;
772 Info.WriteMem = true;
773 Info.IsVolatile = false;
774 break;
775 }
776 } else if (auto *MI = dyn_cast<MemSetInst>(Val: Inst)) {
777 Info.PtrVal = MI->getDest();
778 Info.MatchingId = 0;
779 Info.ReadMem = false;
780 Info.WriteMem = true;
781 Info.IsVolatile = MI->isVolatile();
782 }
783 }
784 }
785
786 Instruction *get() { return Inst; }
787 const Instruction *get() const { return Inst; }
788
789 bool isLoad() const {
790 if (IntrID != 0)
791 return Info.ReadMem;
792 return isa<LoadInst>(Val: Inst);
793 }
794
795 bool isStore() const {
796 if (IntrID != 0)
797 return Info.WriteMem;
798 return isa<StoreInst>(Val: Inst);
799 }
800
801 bool isAtomic() const {
802 if (IntrID != 0)
803 return Info.Ordering != AtomicOrdering::NotAtomic;
804 return Inst->isAtomic();
805 }
806
807 bool isUnordered() const {
808 if (IntrID != 0)
809 return Info.isUnordered();
810
811 if (LoadInst *LI = dyn_cast<LoadInst>(Val: Inst)) {
812 return LI->isUnordered();
813 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: Inst)) {
814 return SI->isUnordered();
815 }
816 // Conservative answer
817 return !Inst->isAtomic();
818 }
819
820 bool isVolatile() const {
821 if (IntrID != 0)
822 return Info.IsVolatile;
823
824 if (LoadInst *LI = dyn_cast<LoadInst>(Val: Inst)) {
825 return LI->isVolatile();
826 } else if (StoreInst *SI = dyn_cast<StoreInst>(Val: Inst)) {
827 return SI->isVolatile();
828 }
829 // Conservative answer
830 return true;
831 }
832
833 bool isInvariantLoad() const {
834 if (auto *LI = dyn_cast<LoadInst>(Val: Inst))
835 return LI->hasMetadata(KindID: LLVMContext::MD_invariant_load);
836 return false;
837 }
838
839 bool isValid() const { return getPointerOperand() != nullptr; }
840
841 // For regular (non-intrinsic) loads/stores, this is set to -1. For
842 // intrinsic loads/stores, the id is retrieved from the corresponding
843 // field in the MemIntrinsicInfo structure. That field contains
844 // non-negative values only.
845 int getMatchingId() const {
846 if (IntrID != 0)
847 return Info.MatchingId;
848 return -1;
849 }
850
851 Value *getPointerOperand() const {
852 if (IntrID != 0)
853 return Info.PtrVal;
854 return getLoadStorePointerOperand(V: Inst);
855 }
856
857 Type *getValueType() const {
858 // TODO: handle target-specific intrinsics.
859 return Inst->getAccessType();
860 }
861
862 bool mayReadFromMemory() const {
863 if (IntrID != 0)
864 return Info.ReadMem;
865 return Inst->mayReadFromMemory();
866 }
867
868 bool mayWriteToMemory() const {
869 if (IntrID != 0)
870 return Info.WriteMem;
871 return Inst->mayWriteToMemory();
872 }
873
874 private:
875 Intrinsic::ID IntrID = 0;
876 MemIntrinsicInfo Info;
877 Instruction *Inst;
878 };
879
880 // This function is to prevent accidentally passing a non-target
881 // intrinsic ID to TargetTransformInfo.
882 static bool isHandledNonTargetIntrinsic(Intrinsic::ID ID) {
883 switch (ID) {
884 case Intrinsic::masked_load:
885 case Intrinsic::masked_store:
886 return true;
887 }
888 return false;
889 }
890 static bool isHandledNonTargetIntrinsic(const Value *V) {
891 if (auto *II = dyn_cast<IntrinsicInst>(Val: V))
892 return isHandledNonTargetIntrinsic(ID: II->getIntrinsicID());
893 return false;
894 }
895
896 bool processNode(DomTreeNode *Node);
897
898 bool handleBranchCondition(Instruction *CondInst, const CondBrInst *BI,
899 const BasicBlock *BB, const BasicBlock *Pred);
900
901 Value *getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
902 unsigned CurrentGeneration);
903
904 bool overridingStores(const ParseMemoryInst &Earlier,
905 const ParseMemoryInst &Later);
906
907 Value *getOrCreateResult(Instruction *Inst, Type *ExpectedType,
908 bool CanCreate) const {
909 // TODO: We could insert relevant casts on type mismatch.
910 // The load or the store's first operand.
911 Value *V;
912 if (auto *II = dyn_cast<IntrinsicInst>(Val: Inst)) {
913 switch (II->getIntrinsicID()) {
914 case Intrinsic::masked_load:
915 V = II;
916 break;
917 case Intrinsic::masked_store:
918 V = II->getOperand(i_nocapture: 0);
919 break;
920 default:
921 return TTI.getOrCreateResultFromMemIntrinsic(Inst: II, ExpectedType,
922 CanCreate);
923 }
924 } else {
925 V = isa<LoadInst>(Val: Inst) ? Inst : cast<StoreInst>(Val: Inst)->getValueOperand();
926 }
927
928 return V->getType() == ExpectedType ? V : nullptr;
929 }
930
931 /// Return true if the instruction is known to only operate on memory
932 /// provably invariant in the given "generation".
933 bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
934
935 bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
936 Instruction *EarlierInst, Instruction *LaterInst);
937
938 bool isNonTargetIntrinsicMatch(const IntrinsicInst *Earlier,
939 const IntrinsicInst *Later) {
940 auto IsSubmask = [](const Value *Mask0, const Value *Mask1) {
941 // Is Mask0 a submask of Mask1?
942 if (Mask0 == Mask1)
943 return true;
944 if (isa<UndefValue>(Val: Mask0) || isa<UndefValue>(Val: Mask1))
945 return false;
946 auto *Vec0 = dyn_cast<ConstantVector>(Val: Mask0);
947 auto *Vec1 = dyn_cast<ConstantVector>(Val: Mask1);
948 if (!Vec0 || !Vec1)
949 return false;
950 if (Vec0->getType() != Vec1->getType())
951 return false;
952 for (int i = 0, e = Vec0->getNumOperands(); i != e; ++i) {
953 Constant *Elem0 = Vec0->getOperand(i_nocapture: i);
954 Constant *Elem1 = Vec1->getOperand(i_nocapture: i);
955 auto *Int0 = dyn_cast<ConstantInt>(Val: Elem0);
956 if (Int0 && Int0->isZero())
957 continue;
958 auto *Int1 = dyn_cast<ConstantInt>(Val: Elem1);
959 if (Int1 && !Int1->isZero())
960 continue;
961 if (isa<UndefValue>(Val: Elem0) || isa<UndefValue>(Val: Elem1))
962 return false;
963 if (Elem0 == Elem1)
964 continue;
965 return false;
966 }
967 return true;
968 };
969 auto PtrOp = [](const IntrinsicInst *II) {
970 if (II->getIntrinsicID() == Intrinsic::masked_load)
971 return II->getOperand(i_nocapture: 0);
972 if (II->getIntrinsicID() == Intrinsic::masked_store)
973 return II->getOperand(i_nocapture: 1);
974 llvm_unreachable("Unexpected IntrinsicInst");
975 };
976 auto MaskOp = [](const IntrinsicInst *II) {
977 if (II->getIntrinsicID() == Intrinsic::masked_load)
978 return II->getOperand(i_nocapture: 1);
979 if (II->getIntrinsicID() == Intrinsic::masked_store)
980 return II->getOperand(i_nocapture: 2);
981 llvm_unreachable("Unexpected IntrinsicInst");
982 };
983 auto ThruOp = [](const IntrinsicInst *II) {
984 if (II->getIntrinsicID() == Intrinsic::masked_load)
985 return II->getOperand(i_nocapture: 2);
986 llvm_unreachable("Unexpected IntrinsicInst");
987 };
988
989 if (PtrOp(Earlier) != PtrOp(Later))
990 return false;
991
992 Intrinsic::ID IDE = Earlier->getIntrinsicID();
993 Intrinsic::ID IDL = Later->getIntrinsicID();
994 // We could really use specific intrinsic classes for masked loads
995 // and stores in IntrinsicInst.h.
996 if (IDE == Intrinsic::masked_load && IDL == Intrinsic::masked_load) {
997 // Trying to replace later masked load with the earlier one.
998 // Check that the pointers are the same, and
999 // - masks and pass-throughs are the same, or
1000 // - replacee's pass-through is "undef" and replacer's mask is a
1001 // super-set of the replacee's mask.
1002 if (MaskOp(Earlier) == MaskOp(Later) && ThruOp(Earlier) == ThruOp(Later))
1003 return true;
1004 if (!isa<UndefValue>(Val: ThruOp(Later)))
1005 return false;
1006 return IsSubmask(MaskOp(Later), MaskOp(Earlier));
1007 }
1008 if (IDE == Intrinsic::masked_store && IDL == Intrinsic::masked_load) {
1009 // Trying to replace a load of a stored value with the store's value.
1010 // Check that the pointers are the same, and
1011 // - load's mask is a subset of store's mask, and
1012 // - load's pass-through is "undef".
1013 if (!IsSubmask(MaskOp(Later), MaskOp(Earlier)))
1014 return false;
1015 return isa<UndefValue>(Val: ThruOp(Later));
1016 }
1017 if (IDE == Intrinsic::masked_load && IDL == Intrinsic::masked_store) {
1018 // Trying to remove a store of the loaded value.
1019 // Check that the pointers are the same, and
1020 // - store's mask is a subset of the load's mask.
1021 return IsSubmask(MaskOp(Later), MaskOp(Earlier));
1022 }
1023 if (IDE == Intrinsic::masked_store && IDL == Intrinsic::masked_store) {
1024 // Trying to remove a dead store (earlier).
1025 // Check that the pointers are the same,
1026 // - the to-be-removed store's mask is a subset of the other store's
1027 // mask.
1028 return IsSubmask(MaskOp(Earlier), MaskOp(Later));
1029 }
1030 return false;
1031 }
1032
1033 void removeMSSA(Instruction &Inst) {
1034 if (!MSSA)
1035 return;
1036 if (VerifyMemorySSA)
1037 MSSA->verifyMemorySSA();
1038 // Removing a store here can leave MemorySSA in an unoptimized state by
1039 // creating MemoryPhis that have identical arguments and by creating
1040 // MemoryUses whose defining access is not an actual clobber. The phi case
1041 // is handled by MemorySSA when passing OptimizePhis = true to
1042 // removeMemoryAccess. The non-optimized MemoryUse case is lazily updated
1043 // by MemorySSA's getClobberingMemoryAccess.
1044 MSSAUpdater->removeMemoryAccess(I: &Inst, OptimizePhis: true);
1045 }
1046};
1047
1048} // end anonymous namespace
1049
1050/// Determine if the memory referenced by LaterInst is from the same heap
1051/// version as EarlierInst.
1052/// This is currently called in two scenarios:
1053///
1054/// load p
1055/// ...
1056/// load p
1057///
1058/// and
1059///
1060/// x = load p
1061/// ...
1062/// store x, p
1063///
1064/// in both cases we want to verify that there are no possible writes to the
1065/// memory referenced by p between the earlier and later instruction.
1066bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
1067 unsigned LaterGeneration,
1068 Instruction *EarlierInst,
1069 Instruction *LaterInst) {
1070 // Check the simple memory generation tracking first.
1071 if (EarlierGeneration == LaterGeneration)
1072 return true;
1073
1074 if (!MSSA)
1075 return false;
1076
1077 // If MemorySSA has determined that one of EarlierInst or LaterInst does not
1078 // read/write memory, then we can safely return true here.
1079 // FIXME: We could be more aggressive when checking doesNotAccessMemory(),
1080 // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass
1081 // by also checking the MemorySSA MemoryAccess on the instruction. Initial
1082 // experiments suggest this isn't worthwhile, at least for C/C++ code compiled
1083 // with the default optimization pipeline.
1084 auto *EarlierMA = MSSA->getMemoryAccess(I: EarlierInst);
1085 if (!EarlierMA)
1086 return true;
1087 auto *LaterMA = MSSA->getMemoryAccess(I: LaterInst);
1088 if (!LaterMA)
1089 return true;
1090
1091 // Since we know LaterDef dominates LaterInst and EarlierInst dominates
1092 // LaterInst, if LaterDef dominates EarlierInst then it can't occur between
1093 // EarlierInst and LaterInst and neither can any other write that potentially
1094 // clobbers LaterInst.
1095 MemoryAccess *LaterDef;
1096 if (ClobberCounter < EarlyCSEMssaOptCap) {
1097 LaterDef = MSSA->getWalker()->getClobberingMemoryAccess(I: LaterInst);
1098 ClobberCounter++;
1099 } else
1100 LaterDef = LaterMA->getDefiningAccess();
1101
1102 return MSSA->dominates(A: LaterDef, B: EarlierMA);
1103}
1104
1105bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
1106 // A location loaded from with an invariant_load is assumed to *never* change
1107 // within the visible scope of the compilation.
1108 if (auto *LI = dyn_cast<LoadInst>(Val: I))
1109 if (LI->hasMetadata(KindID: LLVMContext::MD_invariant_load))
1110 return true;
1111
1112 auto MemLocOpt = MemoryLocation::getOrNone(Inst: I);
1113 if (!MemLocOpt)
1114 // "target" intrinsic forms of loads aren't currently known to
1115 // MemoryLocation::get. TODO
1116 return false;
1117 MemoryLocation MemLoc = *MemLocOpt;
1118 if (!AvailableInvariants.count(Key: MemLoc))
1119 return false;
1120
1121 // Is the generation at which this became invariant older than the
1122 // current one?
1123 return AvailableInvariants.lookup(Key: MemLoc) <= GenAt;
1124}
1125
1126bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
1127 const CondBrInst *BI, const BasicBlock *BB,
1128 const BasicBlock *Pred) {
1129 assert(BI->getCondition() == CondInst && "Wrong condition?");
1130 assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
1131 auto *TorF = (BI->getSuccessor(i: 0) == BB)
1132 ? ConstantInt::getTrue(Context&: BB->getContext())
1133 : ConstantInt::getFalse(Context&: BB->getContext());
1134 auto MatchBinOp = [](Instruction *I, unsigned Opcode, Value *&LHS,
1135 Value *&RHS) {
1136 if (Opcode == Instruction::And &&
1137 match(V: I, P: m_LogicalAnd(L: m_Value(V&: LHS), R: m_Value(V&: RHS))))
1138 return true;
1139 else if (Opcode == Instruction::Or &&
1140 match(V: I, P: m_LogicalOr(L: m_Value(V&: LHS), R: m_Value(V&: RHS))))
1141 return true;
1142 return false;
1143 };
1144 // If the condition is AND operation, we can propagate its operands into the
1145 // true branch. If it is OR operation, we can propagate them into the false
1146 // branch.
1147 unsigned PropagateOpcode =
1148 (BI->getSuccessor(i: 0) == BB) ? Instruction::And : Instruction::Or;
1149
1150 bool MadeChanges = false;
1151 SmallVector<Instruction *, 4> WorkList;
1152 SmallPtrSet<Instruction *, 4> Visited;
1153 WorkList.push_back(Elt: CondInst);
1154 while (!WorkList.empty()) {
1155 Instruction *Curr = WorkList.pop_back_val();
1156
1157 AvailableValues.insert(Key: Curr, Val: TorF);
1158 LLVM_DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
1159 << Curr->getName() << "' as " << *TorF << " in "
1160 << BB->getName() << "\n");
1161 if (!DebugCounter::shouldExecute(Counter&: CSECounter)) {
1162 LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
1163 } else {
1164 // Replace all dominated uses with the known value.
1165 if (unsigned Count = replaceDominatedUsesWith(From: Curr, To: TorF, DT,
1166 Edge: BasicBlockEdge(Pred, BB))) {
1167 NumCSECVP += Count;
1168 MadeChanges = true;
1169 }
1170 }
1171
1172 Value *LHS, *RHS;
1173 if (MatchBinOp(Curr, PropagateOpcode, LHS, RHS))
1174 for (auto *Op : { LHS, RHS })
1175 if (Instruction *OPI = dyn_cast<Instruction>(Val: Op))
1176 if (SimpleValue::canHandle(Inst: OPI) && Visited.insert(Ptr: OPI).second)
1177 WorkList.push_back(Elt: OPI);
1178 }
1179
1180 return MadeChanges;
1181}
1182
1183Value *EarlyCSE::getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
1184 unsigned CurrentGeneration) {
1185 if (InVal.DefInst == nullptr)
1186 return nullptr;
1187 if (auto *MSI = dyn_cast<MemSetInst>(Val: InVal.DefInst)) {
1188 if (!MemInst.isLoad() || MemInst.isVolatile() || !MemInst.isUnordered() ||
1189 MemInst.getMatchingId() != -1)
1190 return nullptr;
1191 if (MSI->isVolatile())
1192 return nullptr;
1193 auto *Val = dyn_cast<ConstantInt>(Val: MSI->getValue());
1194 if (!Val || !Val->isZero())
1195 return nullptr;
1196 auto Len = MSI->getLengthInBytes();
1197 if (!Len)
1198 return nullptr;
1199 Type *InstType = MemInst.getValueType();
1200 if (!InstType)
1201 return nullptr;
1202 TypeSize LoadSize = SQ.DL.getTypeStoreSize(Ty: InstType);
1203 if (LoadSize.isScalable() || Len->ult(RHS: LoadSize.getFixedValue()))
1204 return nullptr;
1205 if (!isOperatingOnInvariantMemAt(I: MemInst.get(), GenAt: InVal.Generation) &&
1206 !isSameMemGeneration(EarlierGeneration: InVal.Generation, LaterGeneration: CurrentGeneration, EarlierInst: InVal.DefInst,
1207 LaterInst: MemInst.get()))
1208 return nullptr;
1209 return Constant::getNullValue(Ty: MemInst.getValueType());
1210 }
1211 if (InVal.MatchingId != MemInst.getMatchingId())
1212 return nullptr;
1213 // We don't yet handle removing loads with ordering of any kind.
1214 if (MemInst.isVolatile() || !MemInst.isUnordered())
1215 return nullptr;
1216 // We can't replace an atomic load with one which isn't also atomic.
1217 if (MemInst.isLoad() && !InVal.IsAtomic && MemInst.isAtomic())
1218 return nullptr;
1219 // The value V returned from this function is used differently depending
1220 // on whether MemInst is a load or a store. If it's a load, we will replace
1221 // MemInst with V, if it's a store, we will check if V is the same as the
1222 // available value.
1223 bool MemInstMatching = !MemInst.isLoad();
1224 Instruction *Matching = MemInstMatching ? MemInst.get() : InVal.DefInst;
1225 Instruction *Other = MemInstMatching ? InVal.DefInst : MemInst.get();
1226
1227 // For stores check the result values before checking memory generation
1228 // (otherwise isSameMemGeneration may crash).
1229 Value *Result =
1230 MemInst.isStore()
1231 ? getOrCreateResult(Inst: Matching, ExpectedType: Other->getType(), /*CanCreate=*/false)
1232 : nullptr;
1233 if (MemInst.isStore() && InVal.DefInst != Result)
1234 return nullptr;
1235
1236 // Deal with non-target memory intrinsics.
1237 bool MatchingNTI = isHandledNonTargetIntrinsic(V: Matching);
1238 bool OtherNTI = isHandledNonTargetIntrinsic(V: Other);
1239 if (OtherNTI != MatchingNTI)
1240 return nullptr;
1241 if (OtherNTI && MatchingNTI) {
1242 if (!isNonTargetIntrinsicMatch(Earlier: cast<IntrinsicInst>(Val: InVal.DefInst),
1243 Later: cast<IntrinsicInst>(Val: MemInst.get())))
1244 return nullptr;
1245 }
1246
1247 if (!isOperatingOnInvariantMemAt(I: MemInst.get(), GenAt: InVal.Generation) &&
1248 !isSameMemGeneration(EarlierGeneration: InVal.Generation, LaterGeneration: CurrentGeneration, EarlierInst: InVal.DefInst,
1249 LaterInst: MemInst.get()))
1250 return nullptr;
1251
1252 if (!Result)
1253 Result = getOrCreateResult(Inst: Matching, ExpectedType: Other->getType(), /*CanCreate=*/true);
1254 return Result;
1255}
1256
1257static void combineIRFlags(Instruction &From, Value *To) {
1258 if (auto *I = dyn_cast<Instruction>(Val: To)) {
1259 // If I being poison triggers UB, there is no need to drop those
1260 // flags. Otherwise, only retain flags present on both I and Inst.
1261 // TODO: Currently some fast-math flags are not treated as
1262 // poison-generating even though they should. Until this is fixed,
1263 // always retain flags present on both I and Inst for floating point
1264 // instructions.
1265 if (isa<FPMathOperator>(Val: I) ||
1266 (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(Inst: I)))
1267 I->andIRFlags(V: &From);
1268 }
1269 if (isa<CallBase>(Val: &From) && isa<CallBase>(Val: To)) {
1270 // NB: Intersection of attrs between InVal.first and Inst is overly
1271 // conservative. Since we only CSE readonly functions that have the same
1272 // memory state, we can preserve (or possibly in some cases combine)
1273 // more attributes. Likewise this implies when checking equality of
1274 // callsite for CSEing, we can probably ignore more attributes.
1275 // Generally poison generating attributes need to be handled with more
1276 // care as they can create *new* UB if preserved/combined and violated.
1277 // Attributes that imply immediate UB on the other hand would have been
1278 // violated either way.
1279 bool Success =
1280 cast<CallBase>(Val: To)->tryIntersectAttributes(Other: cast<CallBase>(Val: &From));
1281 assert(Success && "Failed to intersect attributes in callsites that "
1282 "passed identical check");
1283 // For NDEBUG Compile.
1284 (void)Success;
1285 }
1286}
1287
1288bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier,
1289 const ParseMemoryInst &Later) {
1290 // Can we remove Earlier store because of Later store?
1291
1292 assert(Earlier.isUnordered() && !Earlier.isVolatile() &&
1293 "Violated invariant");
1294 if (Earlier.getPointerOperand() != Later.getPointerOperand())
1295 return false;
1296 if (!Earlier.getValueType() || !Later.getValueType() ||
1297 Earlier.getValueType() != Later.getValueType())
1298 return false;
1299 if (Earlier.getMatchingId() != Later.getMatchingId())
1300 return false;
1301 // At the moment, we don't remove ordered stores, but do remove
1302 // unordered atomic stores. There's no special requirement (for
1303 // unordered atomics) about removing atomic stores only in favor of
1304 // other atomic stores since we were going to execute the non-atomic
1305 // one anyway and the atomic one might never have become visible.
1306 if (!Earlier.isUnordered() || !Later.isUnordered())
1307 return false;
1308
1309 // Deal with non-target memory intrinsics.
1310 bool ENTI = isHandledNonTargetIntrinsic(V: Earlier.get());
1311 bool LNTI = isHandledNonTargetIntrinsic(V: Later.get());
1312 if (ENTI && LNTI)
1313 return isNonTargetIntrinsicMatch(Earlier: cast<IntrinsicInst>(Val: Earlier.get()),
1314 Later: cast<IntrinsicInst>(Val: Later.get()));
1315
1316 // Because of the check above, at least one of them is false.
1317 // For now disallow matching intrinsics with non-intrinsics,
1318 // so assume that the stores match if neither is an intrinsic.
1319 return ENTI == LNTI;
1320}
1321
1322bool EarlyCSE::processNode(DomTreeNode *Node) {
1323 bool Changed = false;
1324 BasicBlock *BB = Node->getBlock();
1325
1326 // If this block has a single predecessor, then the predecessor is the parent
1327 // of the domtree node and all of the live out memory values are still current
1328 // in this block. If this block has multiple predecessors, then they could
1329 // have invalidated the live-out memory values of our parent value. For now,
1330 // just be conservative and invalidate memory if this block has multiple
1331 // predecessors.
1332 if (!BB->getSinglePredecessor())
1333 ++CurrentGeneration;
1334
1335 // If this node has a single predecessor which ends in a conditional branch,
1336 // we can infer the value of the branch condition given that we took this
1337 // path. We need the single predecessor to ensure there's not another path
1338 // which reaches this block where the condition might hold a different
1339 // value. Since we're adding this to the scoped hash table (like any other
1340 // def), it will have been popped if we encounter a future merge block.
1341 if (BasicBlock *Pred = BB->getSinglePredecessor()) {
1342 if (auto *BI = dyn_cast<CondBrInst>(Val: Pred->getTerminator())) {
1343 auto *CondInst = dyn_cast<Instruction>(Val: BI->getCondition());
1344 if (CondInst && SimpleValue::canHandle(Inst: CondInst))
1345 Changed |= handleBranchCondition(CondInst, BI, BB, Pred);
1346 }
1347 }
1348
1349 /// LastStore - Keep track of the last non-volatile store that we saw... for
1350 /// as long as there in no instruction that reads memory. If we see a store
1351 /// to the same location, we delete the dead store. This zaps trivial dead
1352 /// stores which can occur in bitfield code among other things.
1353 Instruction *LastStore = nullptr;
1354
1355 // See if any instructions in the block can be eliminated. If so, do it. If
1356 // not, add them to AvailableValues.
1357 for (Instruction &Inst : make_early_inc_range(Range&: *BB)) {
1358 // Dead instructions should just be removed.
1359 if (isInstructionTriviallyDead(I: &Inst, TLI: &TLI)) {
1360 LLVM_DEBUG(dbgs() << "EarlyCSE DCE: " << Inst << '\n');
1361 if (!DebugCounter::shouldExecute(Counter&: CSECounter)) {
1362 LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
1363 continue;
1364 }
1365
1366 salvageKnowledge(I: &Inst, AC: &AC);
1367 salvageDebugInfo(I&: Inst);
1368 removeMSSA(Inst);
1369 Inst.eraseFromParent();
1370 Changed = true;
1371 ++NumSimplify;
1372 continue;
1373 }
1374
1375 // Skip assume intrinsics, they don't really have side effects (although
1376 // they're marked as such to ensure preservation of control dependencies),
1377 // and this pass will not bother with its removal. However, we should mark
1378 // its condition as true for all dominated blocks.
1379 if (auto *Assume = dyn_cast<AssumeInst>(Val: &Inst)) {
1380 auto *CondI = dyn_cast<Instruction>(Val: Assume->getArgOperand(i: 0));
1381 if (CondI && SimpleValue::canHandle(Inst: CondI)) {
1382 LLVM_DEBUG(dbgs() << "EarlyCSE considering assumption: " << Inst
1383 << '\n');
1384 AvailableValues.insert(Key: CondI, Val: ConstantInt::getTrue(Context&: BB->getContext()));
1385 } else
1386 LLVM_DEBUG(dbgs() << "EarlyCSE skipping assumption: " << Inst << '\n');
1387 continue;
1388 }
1389
1390 // Likewise, noalias intrinsics don't actually write.
1391 if (match(V: &Inst,
1392 P: m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) {
1393 LLVM_DEBUG(dbgs() << "EarlyCSE skipping noalias intrinsic: " << Inst
1394 << '\n');
1395 continue;
1396 }
1397
1398 // Skip sideeffect intrinsics, for the same reason as assume intrinsics.
1399 if (match(V: &Inst, P: m_Intrinsic<Intrinsic::sideeffect>())) {
1400 LLVM_DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << Inst << '\n');
1401 continue;
1402 }
1403
1404 // Skip pseudoprobe intrinsics, for the same reason as assume intrinsics.
1405 if (match(V: &Inst, P: m_Intrinsic<Intrinsic::pseudoprobe>())) {
1406 LLVM_DEBUG(dbgs() << "EarlyCSE skipping pseudoprobe: " << Inst << '\n');
1407 continue;
1408 }
1409
1410 // We can skip all invariant.start intrinsics since they only read memory,
1411 // and we can forward values across it. For invariant starts without
1412 // invariant ends, we can use the fact that the invariantness never ends to
1413 // start a scope in the current generaton which is true for all future
1414 // generations. Also, we dont need to consume the last store since the
1415 // semantics of invariant.start allow us to perform DSE of the last
1416 // store, if there was a store following invariant.start. Consider:
1417 //
1418 // store 30, i8* p
1419 // invariant.start(p)
1420 // store 40, i8* p
1421 // We can DSE the store to 30, since the store 40 to invariant location p
1422 // causes undefined behaviour.
1423 if (match(V: &Inst, P: m_Intrinsic<Intrinsic::invariant_start>())) {
1424 // If there are any uses, the scope might end.
1425 if (!Inst.use_empty())
1426 continue;
1427 MemoryLocation MemLoc =
1428 MemoryLocation::getForArgument(Call: &cast<CallInst>(Val&: Inst), ArgIdx: 1, TLI);
1429 // Don't start a scope if we already have a better one pushed
1430 if (!AvailableInvariants.count(Key: MemLoc))
1431 AvailableInvariants.insert(Key: MemLoc, Val: CurrentGeneration);
1432 continue;
1433 }
1434
1435 if (isGuard(U: &Inst)) {
1436 if (auto *CondI =
1437 dyn_cast<Instruction>(Val: cast<CallInst>(Val&: Inst).getArgOperand(i: 0))) {
1438 if (SimpleValue::canHandle(Inst: CondI)) {
1439 // Do we already know the actual value of this condition?
1440 if (auto *KnownCond = AvailableValues.lookup(Key: CondI)) {
1441 // Is the condition known to be true?
1442 if (isa<ConstantInt>(Val: KnownCond) &&
1443 cast<ConstantInt>(Val: KnownCond)->isOne()) {
1444 LLVM_DEBUG(dbgs()
1445 << "EarlyCSE removing guard: " << Inst << '\n');
1446 salvageKnowledge(I: &Inst, AC: &AC);
1447 removeMSSA(Inst);
1448 Inst.eraseFromParent();
1449 Changed = true;
1450 continue;
1451 } else
1452 // Use the known value if it wasn't true.
1453 cast<CallInst>(Val&: Inst).setArgOperand(i: 0, v: KnownCond);
1454 }
1455 // The condition we're on guarding here is true for all dominated
1456 // locations.
1457 AvailableValues.insert(Key: CondI, Val: ConstantInt::getTrue(Context&: BB->getContext()));
1458 }
1459 }
1460
1461 // Guard intrinsics read all memory, but don't write any memory.
1462 // Accordingly, don't update the generation but consume the last store (to
1463 // avoid an incorrect DSE).
1464 LastStore = nullptr;
1465 continue;
1466 }
1467
1468 // If the instruction can be simplified (e.g. X+0 = X) then replace it with
1469 // its simpler value.
1470 if (Value *V = simplifyInstruction(I: &Inst, Q: SQ)) {
1471 LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << Inst << " to: " << *V
1472 << '\n');
1473 if (!DebugCounter::shouldExecute(Counter&: CSECounter)) {
1474 LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
1475 } else {
1476 bool Killed = false;
1477 if (!Inst.use_empty()) {
1478 Inst.replaceAllUsesWith(V);
1479 Changed = true;
1480 }
1481 if (isInstructionTriviallyDead(I: &Inst, TLI: &TLI)) {
1482 salvageKnowledge(I: &Inst, AC: &AC);
1483 removeMSSA(Inst);
1484 Inst.eraseFromParent();
1485 Changed = true;
1486 Killed = true;
1487 }
1488 if (Changed)
1489 ++NumSimplify;
1490 if (Killed)
1491 continue;
1492 }
1493 }
1494
1495 // Make sure stores prior to a potential unwind are not removed, as the
1496 // caller may read the memory.
1497 if (Inst.mayThrow())
1498 LastStore = nullptr;
1499
1500 // If this is a simple instruction that we can value number, process it.
1501 if (SimpleValue::canHandle(Inst: &Inst)) {
1502 if ([[maybe_unused]] auto *CI = dyn_cast<ConstrainedFPIntrinsic>(Val: &Inst)) {
1503 assert(CI->getExceptionBehavior() != fp::ebStrict &&
1504 "Unexpected ebStrict from SimpleValue::canHandle()");
1505 assert((!CI->getRoundingMode() ||
1506 CI->getRoundingMode() != RoundingMode::Dynamic) &&
1507 "Unexpected dynamic rounding from SimpleValue::canHandle()");
1508 }
1509 // See if the instruction has an available value. If so, use it.
1510 if (Value *V = AvailableValues.lookup(Key: &Inst)) {
1511 LLVM_DEBUG(dbgs() << "EarlyCSE CSE: " << Inst << " to: " << *V
1512 << '\n');
1513 if (!DebugCounter::shouldExecute(Counter&: CSECounter)) {
1514 LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
1515 continue;
1516 }
1517 combineIRFlags(From&: Inst, To: V);
1518 Inst.replaceAllUsesWith(V);
1519 salvageKnowledge(I: &Inst, AC: &AC);
1520 removeMSSA(Inst);
1521 Inst.eraseFromParent();
1522 Changed = true;
1523 ++NumCSE;
1524 continue;
1525 }
1526
1527 // Otherwise, just remember that this value is available.
1528 AvailableValues.insert(Key: &Inst, Val: &Inst);
1529 continue;
1530 }
1531
1532 ParseMemoryInst MemInst(&Inst, TTI);
1533 // If this is a non-volatile load, process it.
1534 if (MemInst.isValid() && MemInst.isLoad()) {
1535 // (conservatively) we can't peak past the ordering implied by this
1536 // operation, but we can add this load to our set of available values
1537 if (MemInst.isVolatile() || !MemInst.isUnordered()) {
1538 LastStore = nullptr;
1539 ++CurrentGeneration;
1540 }
1541
1542 if (MemInst.isInvariantLoad()) {
1543 // If we pass an invariant load, we know that memory location is
1544 // indefinitely constant from the moment of first dereferenceability.
1545 // We conservatively treat the invariant_load as that moment. If we
1546 // pass a invariant load after already establishing a scope, don't
1547 // restart it since we want to preserve the earliest point seen.
1548 auto MemLoc = MemoryLocation::get(Inst: &Inst);
1549 if (!AvailableInvariants.count(Key: MemLoc))
1550 AvailableInvariants.insert(Key: MemLoc, Val: CurrentGeneration);
1551 }
1552
1553 // If we have an available version of this load, and if it is the right
1554 // generation or the load is known to be from an invariant location,
1555 // replace this instruction.
1556 //
1557 // If either the dominating load or the current load are invariant, then
1558 // we can assume the current load loads the same value as the dominating
1559 // load.
1560 LoadValue InVal = AvailableLoads.lookup(Key: MemInst.getPointerOperand());
1561 if (Value *Op = getMatchingValue(InVal, MemInst, CurrentGeneration)) {
1562 LLVM_DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << Inst
1563 << " to: " << *InVal.DefInst << '\n');
1564 if (!DebugCounter::shouldExecute(Counter&: CSECounter)) {
1565 LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
1566 continue;
1567 }
1568 if (InVal.IsLoad)
1569 if (auto *I = dyn_cast<Instruction>(Val: Op))
1570 combineMetadataForCSE(K: I, J: &Inst, DoesKMove: false);
1571 if (!Inst.use_empty())
1572 Inst.replaceAllUsesWith(V: Op);
1573 salvageKnowledge(I: &Inst, AC: &AC);
1574 removeMSSA(Inst);
1575 Inst.eraseFromParent();
1576 Changed = true;
1577 ++NumCSELoad;
1578 continue;
1579 }
1580
1581 // Otherwise, remember that we have this instruction.
1582 AvailableLoads.insert(Key: MemInst.getPointerOperand(),
1583 Val: LoadValue(&Inst, CurrentGeneration,
1584 MemInst.getMatchingId(),
1585 MemInst.isAtomic(),
1586 MemInst.isLoad()));
1587 LastStore = nullptr;
1588 continue;
1589 }
1590
1591 // If this instruction may read from memory, forget LastStore. Load/store
1592 // intrinsics will indicate both a read and a write to memory. The target
1593 // may override this (e.g. so that a store intrinsic does not read from
1594 // memory, and thus will be treated the same as a regular store for
1595 // commoning purposes).
1596 if (Inst.mayReadFromMemory() &&
1597 !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
1598 LastStore = nullptr;
1599
1600 // If this is a read-only or write-only call, process it. Skip store
1601 // MemInsts, as they will be more precisely handled later on. Also skip
1602 // memsets, as DSE may be able to optimize them better by removing the
1603 // earlier rather than later store.
1604 if (CallValue::canHandle(Inst: &Inst) &&
1605 (!MemInst.isValid() || !MemInst.isStore()) && !isa<MemSetInst>(Val: &Inst)) {
1606 // If we have an available version of this call, and if it is the right
1607 // generation, replace this instruction.
1608 std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(Key: &Inst);
1609 if (InVal.first != nullptr &&
1610 isSameMemGeneration(EarlierGeneration: InVal.second, LaterGeneration: CurrentGeneration, EarlierInst: InVal.first,
1611 LaterInst: &Inst) &&
1612 InVal.first->mayReadFromMemory() == Inst.mayReadFromMemory()) {
1613 LLVM_DEBUG(dbgs() << "EarlyCSE CSE CALL: " << Inst
1614 << " to: " << *InVal.first << '\n');
1615 if (!DebugCounter::shouldExecute(Counter&: CSECounter)) {
1616 LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
1617 continue;
1618 }
1619 combineIRFlags(From&: Inst, To: InVal.first);
1620 if (!Inst.use_empty())
1621 Inst.replaceAllUsesWith(V: InVal.first);
1622 salvageKnowledge(I: &Inst, AC: &AC);
1623 removeMSSA(Inst);
1624 Inst.eraseFromParent();
1625 Changed = true;
1626 ++NumCSECall;
1627 continue;
1628 }
1629
1630 // Increase memory generation for writes. Do this before inserting
1631 // the call, so it has the generation after the write occurred.
1632 if (Inst.mayWriteToMemory())
1633 ++CurrentGeneration;
1634
1635 // Otherwise, remember that we have this instruction.
1636 AvailableCalls.insert(Key: &Inst, Val: std::make_pair(x: &Inst, y&: CurrentGeneration));
1637 continue;
1638 }
1639
1640 // Compare GEP instructions based on offset.
1641 if (GEPValue::canHandle(Inst: &Inst)) {
1642 auto *GEP = cast<GetElementPtrInst>(Val: &Inst);
1643 APInt Offset = APInt(SQ.DL.getIndexTypeSizeInBits(Ty: GEP->getType()), 0);
1644 GEPValue GEPVal(GEP, GEP->accumulateConstantOffset(DL: SQ.DL, Offset)
1645 ? Offset.trySExtValue()
1646 : std::nullopt);
1647 if (Value *V = AvailableGEPs.lookup(Key: GEPVal)) {
1648 LLVM_DEBUG(dbgs() << "EarlyCSE CSE GEP: " << Inst << " to: " << *V
1649 << '\n');
1650 combineIRFlags(From&: Inst, To: V);
1651 Inst.replaceAllUsesWith(V);
1652 salvageKnowledge(I: &Inst, AC: &AC);
1653 removeMSSA(Inst);
1654 Inst.eraseFromParent();
1655 Changed = true;
1656 ++NumCSEGEP;
1657 continue;
1658 }
1659
1660 // Otherwise, just remember that we have this GEP.
1661 AvailableGEPs.insert(Key: GEPVal, Val: &Inst);
1662 continue;
1663 }
1664
1665 // A release fence requires that all stores complete before it, but does
1666 // not prevent the reordering of following loads 'before' the fence. As a
1667 // result, we don't need to consider it as writing to memory and don't need
1668 // to advance the generation. We do need to prevent DSE across the fence,
1669 // but that's handled above.
1670 if (auto *FI = dyn_cast<FenceInst>(Val: &Inst))
1671 if (FI->getOrdering() == AtomicOrdering::Release) {
1672 assert(Inst.mayReadFromMemory() && "relied on to prevent DSE above");
1673 continue;
1674 }
1675
1676 // write back DSE - If we write back the same value we just loaded from
1677 // the same location and haven't passed any intervening writes or ordering
1678 // operations, we can remove the write. The primary benefit is in allowing
1679 // the available load table to remain valid and value forward past where
1680 // the store originally was.
1681 if (MemInst.isValid() && MemInst.isStore()) {
1682 LoadValue InVal = AvailableLoads.lookup(Key: MemInst.getPointerOperand());
1683 if (InVal.DefInst &&
1684 InVal.DefInst ==
1685 getMatchingValue(InVal, MemInst, CurrentGeneration)) {
1686 LLVM_DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << Inst << '\n');
1687 if (!DebugCounter::shouldExecute(Counter&: CSECounter)) {
1688 LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
1689 continue;
1690 }
1691 salvageKnowledge(I: &Inst, AC: &AC);
1692 removeMSSA(Inst);
1693 Inst.eraseFromParent();
1694 Changed = true;
1695 ++NumDSE;
1696 // We can avoid incrementing the generation count since we were able
1697 // to eliminate this store.
1698 continue;
1699 }
1700 }
1701
1702 // Okay, this isn't something we can CSE at all. Check to see if it is
1703 // something that could modify memory. If so, our available memory values
1704 // cannot be used so bump the generation count.
1705 if (Inst.mayWriteToMemory()) {
1706 ++CurrentGeneration;
1707
1708 if (MemInst.isValid() && MemInst.isStore()) {
1709 // We do a trivial form of DSE if there are two stores to the same
1710 // location with no intervening loads. Delete the earlier store.
1711 if (LastStore) {
1712 if (overridingStores(Earlier: ParseMemoryInst(LastStore, TTI), Later: MemInst)) {
1713 LLVM_DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
1714 << " due to: " << Inst << '\n');
1715 if (!DebugCounter::shouldExecute(Counter&: CSECounter)) {
1716 LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
1717 } else {
1718 salvageKnowledge(I: &Inst, AC: &AC);
1719 removeMSSA(Inst&: *LastStore);
1720 LastStore->eraseFromParent();
1721 Changed = true;
1722 ++NumDSE;
1723 LastStore = nullptr;
1724 }
1725 }
1726 // fallthrough - we can exploit information about this store
1727 }
1728
1729 // Okay, we just invalidated anything we knew about loaded values. Try
1730 // to salvage *something* by remembering that the stored value is a live
1731 // version of the pointer. It is safe to forward from volatile stores
1732 // to non-volatile loads, so we don't have to check for volatility of
1733 // the store.
1734 AvailableLoads.insert(Key: MemInst.getPointerOperand(),
1735 Val: LoadValue(&Inst, CurrentGeneration,
1736 MemInst.getMatchingId(),
1737 MemInst.isAtomic(),
1738 MemInst.isLoad()));
1739
1740 // Remember that this was the last unordered store we saw for DSE. We
1741 // don't yet handle DSE on ordered or volatile stores since we don't
1742 // have a good way to model the ordering requirement for following
1743 // passes once the store is removed. We could insert a fence, but
1744 // since fences are slightly stronger than stores in their ordering,
1745 // it's not clear this is a profitable transform. Another option would
1746 // be to merge the ordering with that of the post dominating store.
1747 if (MemInst.isUnordered() && !MemInst.isVolatile())
1748 LastStore = &Inst;
1749 else
1750 LastStore = nullptr;
1751 }
1752 }
1753 }
1754
1755 return Changed;
1756}
1757
1758bool EarlyCSE::run() {
1759 // Note, deque is being used here because there is significant performance
1760 // gains over vector when the container becomes very large due to the
1761 // specific access patterns. For more information see the mailing list
1762 // discussion on this:
1763 // http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
1764 std::deque<StackNode *> nodesToProcess;
1765
1766 bool Changed = false;
1767
1768 // Process the root node.
1769 nodesToProcess.push_back(x: new StackNode(
1770 AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
1771 AvailableGEPs, CurrentGeneration, DT.getRootNode(),
1772 DT.getRootNode()->begin(), DT.getRootNode()->end()));
1773
1774 assert(!CurrentGeneration && "Create a new EarlyCSE instance to rerun it.");
1775
1776 // Process the stack.
1777 while (!nodesToProcess.empty()) {
1778 // Grab the first item off the stack. Set the current generation, remove
1779 // the node from the stack, and process it.
1780 StackNode *NodeToProcess = nodesToProcess.back();
1781
1782 // Initialize class members.
1783 CurrentGeneration = NodeToProcess->currentGeneration();
1784
1785 // Check if the node needs to be processed.
1786 if (!NodeToProcess->isProcessed()) {
1787 // Process the node.
1788 Changed |= processNode(Node: NodeToProcess->node());
1789 NodeToProcess->childGeneration(generation: CurrentGeneration);
1790 NodeToProcess->process();
1791 } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
1792 // Push the next child onto the stack.
1793 DomTreeNode *child = NodeToProcess->nextChild();
1794 nodesToProcess.push_back(x: new StackNode(
1795 AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
1796 AvailableGEPs, NodeToProcess->childGeneration(), child,
1797 child->begin(), child->end()));
1798 } else {
1799 // It has been processed, and there are no more children to process,
1800 // so delete it and pop it off the stack.
1801 delete NodeToProcess;
1802 nodesToProcess.pop_back();
1803 }
1804 } // while (!nodes...)
1805
1806 return Changed;
1807}
1808
1809PreservedAnalyses EarlyCSEPass::run(Function &F,
1810 FunctionAnalysisManager &AM) {
1811 auto &TLI = AM.getResult<TargetLibraryAnalysis>(IR&: F);
1812 auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F);
1813 auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
1814 auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
1815 auto *MSSA =
1816 UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(IR&: F).getMSSA() : nullptr;
1817
1818 EarlyCSE CSE(F.getDataLayout(), TLI, TTI, DT, AC, MSSA);
1819
1820 if (!CSE.run())
1821 return PreservedAnalyses::all();
1822
1823 PreservedAnalyses PA;
1824 PA.preserveSet<CFGAnalyses>();
1825 if (UseMemorySSA)
1826 PA.preserve<MemorySSAAnalysis>();
1827 return PA;
1828}
1829
1830void EarlyCSEPass::printPipeline(
1831 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
1832 static_cast<PassInfoMixin<EarlyCSEPass> *>(this)->printPipeline(
1833 OS, MapClassName2PassName);
1834 OS << '<';
1835 if (UseMemorySSA)
1836 OS << "memssa";
1837 OS << '>';
1838}
1839
1840namespace {
1841
1842/// A simple and fast domtree-based CSE pass.
1843///
1844/// This pass does a simple depth-first walk over the dominator tree,
1845/// eliminating trivially redundant instructions and using instsimplify to
1846/// canonicalize things as it goes. It is intended to be fast and catch obvious
1847/// cases so that instcombine and other passes are more effective. It is
1848/// expected that a later pass of GVN will catch the interesting/hard cases.
1849template<bool UseMemorySSA>
1850class EarlyCSELegacyCommonPass : public FunctionPass {
1851public:
1852 static char ID;
1853
1854 EarlyCSELegacyCommonPass() : FunctionPass(ID) {
1855 if (UseMemorySSA)
1856 initializeEarlyCSEMemSSALegacyPassPass(*PassRegistry::getPassRegistry());
1857 else
1858 initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
1859 }
1860
1861 bool runOnFunction(Function &F) override {
1862 if (skipFunction(F))
1863 return false;
1864
1865 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
1866 auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1867 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1868 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1869 auto *MSSA =
1870 UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr;
1871
1872 EarlyCSE CSE(F.getDataLayout(), TLI, TTI, DT, AC, MSSA);
1873
1874 return CSE.run();
1875 }
1876
1877 void getAnalysisUsage(AnalysisUsage &AU) const override {
1878 AU.addRequired<AssumptionCacheTracker>();
1879 AU.addRequired<DominatorTreeWrapperPass>();
1880 AU.addRequired<TargetLibraryInfoWrapperPass>();
1881 AU.addRequired<TargetTransformInfoWrapperPass>();
1882 if (UseMemorySSA) {
1883 AU.addRequired<AAResultsWrapperPass>();
1884 AU.addRequired<MemorySSAWrapperPass>();
1885 AU.addPreserved<MemorySSAWrapperPass>();
1886 }
1887 AU.addPreserved<GlobalsAAWrapperPass>();
1888 AU.addPreserved<AAResultsWrapperPass>();
1889 AU.setPreservesCFG();
1890 }
1891};
1892
1893} // end anonymous namespace
1894
1895using EarlyCSELegacyPass = EarlyCSELegacyCommonPass</*UseMemorySSA=*/false>;
1896
1897template<>
1898char EarlyCSELegacyPass::ID = 0;
1899
1900INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
1901 false)
1902INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1903INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1904INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
1905INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
1906INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
1907
1908using EarlyCSEMemSSALegacyPass =
1909 EarlyCSELegacyCommonPass</*UseMemorySSA=*/true>;
1910
1911template<>
1912char EarlyCSEMemSSALegacyPass::ID = 0;
1913
1914FunctionPass *llvm::createEarlyCSEPass(bool UseMemorySSA) {
1915 if (UseMemorySSA)
1916 return new EarlyCSEMemSSALegacyPass();
1917 else
1918 return new EarlyCSELegacyPass();
1919}
1920
1921INITIALIZE_PASS_BEGIN(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
1922 "Early CSE w/ MemorySSA", false, false)
1923INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1924INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1925INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
1926INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
1927INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
1928INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
1929INITIALIZE_PASS_END(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
1930 "Early CSE w/ MemorySSA", false, false)
1931