1//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Loop unrolling may create many similar GEPs for array accesses.
10// e.g., a 2-level loop
11//
12// float a[32][32]; // global variable
13//
14// for (int i = 0; i < 2; ++i) {
15// for (int j = 0; j < 2; ++j) {
16// ...
17// ... = a[x + i][y + j];
18// ...
19// }
20// }
21//
22// will probably be unrolled to:
23//
24// gep %a, 0, %x, %y; load
25// gep %a, 0, %x, %y + 1; load
26// gep %a, 0, %x + 1, %y; load
27// gep %a, 0, %x + 1, %y + 1; load
28//
29// LLVM's GVN does not use partial redundancy elimination yet, and is thus
30// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs
31// significant slowdown in targets with limited addressing modes. For instance,
32// because the PTX target does not support the reg+reg addressing mode, the
33// NVPTX backend emits PTX code that literally computes the pointer address of
34// each GEP, wasting tons of registers. It emits the following PTX for the
35// first load and similar PTX for other loads.
36//
37// mov.u32 %r1, %x;
38// mov.u32 %r2, %y;
39// mul.wide.u32 %rl2, %r1, 128;
40// mov.u64 %rl3, a;
41// add.s64 %rl4, %rl3, %rl2;
42// mul.wide.u32 %rl5, %r2, 4;
43// add.s64 %rl6, %rl4, %rl5;
44// ld.global.f32 %f1, [%rl6];
45//
46// To reduce the register pressure, the optimization implemented in this file
47// merges the common part of a group of GEPs, so we can compute each pointer
48// address by adding a simple offset to the common part, saving many registers.
49//
50// It works by splitting each GEP into a variadic base and a constant offset.
51// The variadic base can be computed once and reused by multiple GEPs, and the
52// constant offsets can be nicely folded into the reg+immediate addressing mode
53// (supported by most targets) without using any extra register.
54//
55// For instance, we transform the four GEPs and four loads in the above example
56// into:
57//
58// base = gep a, 0, x, y
59// load base
60// load base + 1 * sizeof(float)
61// load base + 32 * sizeof(float)
62// load base + 33 * sizeof(float)
63//
64// Given the transformed IR, a backend that supports the reg+immediate
65// addressing mode can easily fold the pointer arithmetics into the loads. For
66// example, the NVPTX backend can easily fold the pointer arithmetics into the
67// ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
68//
69// mov.u32 %r1, %tid.x;
70// mov.u32 %r2, %tid.y;
71// mul.wide.u32 %rl2, %r1, 128;
72// mov.u64 %rl3, a;
73// add.s64 %rl4, %rl3, %rl2;
74// mul.wide.u32 %rl5, %r2, 4;
75// add.s64 %rl6, %rl4, %rl5;
76// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
77// ld.global.f32 %f2, [%rl6+4]; // much better
78// ld.global.f32 %f3, [%rl6+128]; // much better
79// ld.global.f32 %f4, [%rl6+132]; // much better
80//
81// Another improvement enabled by the LowerGEP flag is to lower a GEP with
82// multiple indices to multiple GEPs with a single index.
83// Such transformation can have following benefits:
84// (1) It can always extract constants in the indices of structure type.
85// (2) After such Lowering, there are more optimization opportunities such as
86// CSE, LICM and CGP.
87//
88// E.g. The following GEPs have multiple indices:
89// BB1:
90// %p = getelementptr [10 x %struct], ptr %ptr, i64 %i, i64 %j1, i32 3
91// load %p
92// ...
93// BB2:
94// %p2 = getelementptr [10 x %struct], ptr %ptr, i64 %i, i64 %j1, i32 2
95// load %p2
96// ...
97//
98// We can not do CSE to the common part related to index "i64 %i". Lowering
99// GEPs can achieve such goals.
100//
101// This pass will lower a GEP with multiple indices into multiple GEPs with a
102// single index:
103// BB1:
104// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
105// %3 = getelementptr i8, ptr %ptr, i64 %2 ; CSE opportunity
106// %4 = mul i64 %j1, length_of_struct
107// %5 = getelementptr i8, ptr %3, i64 %4
108// %p = getelementptr i8, ptr %5, struct_field_3 ; Constant offset
109// load %p
110// ...
111// BB2:
112// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
113// %9 = getelementptr i8, ptr %ptr, i64 %8 ; CSE opportunity
114// %10 = mul i64 %j2, length_of_struct
115// %11 = getelementptr i8, ptr %9, i64 %10
116// %p2 = getelementptr i8, ptr %11, struct_field_2 ; Constant offset
117// load %p2
118// ...
119//
120// Lowering GEPs can also benefit other passes such as LICM and CGP.
121// LICM (Loop Invariant Code Motion) can not hoist/sink a GEP of multiple
122// indices if one of the index is variant. If we lower such GEP into invariant
123// parts and variant parts, LICM can hoist/sink those invariant parts.
124// CGP (CodeGen Prepare) tries to sink address calculations that match the
125// target's addressing modes. A GEP with multiple indices may not match and will
126// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of
127// them. So we end up with a better addressing mode.
128//
129//===----------------------------------------------------------------------===//
130
131#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
132#include "llvm/ADT/APInt.h"
133#include "llvm/ADT/DenseMap.h"
134#include "llvm/ADT/DepthFirstIterator.h"
135#include "llvm/ADT/SmallVector.h"
136#include "llvm/Analysis/LoopInfo.h"
137#include "llvm/Analysis/MemoryBuiltins.h"
138#include "llvm/Analysis/TargetLibraryInfo.h"
139#include "llvm/Analysis/TargetTransformInfo.h"
140#include "llvm/Analysis/ValueTracking.h"
141#include "llvm/IR/BasicBlock.h"
142#include "llvm/IR/Constant.h"
143#include "llvm/IR/Constants.h"
144#include "llvm/IR/DataLayout.h"
145#include "llvm/IR/DerivedTypes.h"
146#include "llvm/IR/Dominators.h"
147#include "llvm/IR/Function.h"
148#include "llvm/IR/GetElementPtrTypeIterator.h"
149#include "llvm/IR/IRBuilder.h"
150#include "llvm/IR/InstrTypes.h"
151#include "llvm/IR/Instruction.h"
152#include "llvm/IR/Instructions.h"
153#include "llvm/IR/Module.h"
154#include "llvm/IR/PassManager.h"
155#include "llvm/IR/PatternMatch.h"
156#include "llvm/IR/Type.h"
157#include "llvm/IR/User.h"
158#include "llvm/IR/Value.h"
159#include "llvm/InitializePasses.h"
160#include "llvm/Pass.h"
161#include "llvm/Support/Casting.h"
162#include "llvm/Support/CommandLine.h"
163#include "llvm/Support/ErrorHandling.h"
164#include "llvm/Support/KnownBits.h"
165#include "llvm/Support/raw_ostream.h"
166#include "llvm/Transforms/Scalar.h"
167#include "llvm/Transforms/Utils/Local.h"
168#include <cassert>
169#include <cstdint>
170#include <string>
171
172using namespace llvm;
173using namespace llvm::PatternMatch;
174
175static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
176 "disable-separate-const-offset-from-gep", cl::init(Val: false),
177 cl::desc("Do not separate the constant offset from a GEP instruction"),
178 cl::Hidden);
179
180// Setting this flag may emit false positives when the input module already
181// contains dead instructions. Therefore, we set it only in unit tests that are
182// free of dead code.
183static cl::opt<bool>
184 VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(Val: false),
185 cl::desc("Verify this pass produces no dead code"),
186 cl::Hidden);
187
188namespace {
189
190/// A helper class for separating a constant offset from a GEP index.
191///
192/// In real programs, a GEP index may be more complicated than a simple addition
193/// of something and a constant integer which can be trivially splitted. For
194/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
195/// constant offset, so that we can separate the index to (a << 3) + b and 5.
196///
197/// Therefore, this class looks into the expression that computes a given GEP
198/// index, and tries to find a constant integer that can be hoisted to the
199/// outermost level of the expression as an addition. Not every constant in an
200/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
201/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case,
202/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
203class ConstantOffsetExtractor {
204public:
205 /// Extracts a constant offset from the given GEP index. It returns the
206 /// new index representing the remainder (equal to the original index minus
207 /// the constant offset), or nullptr if we cannot extract a constant offset.
208 /// \p Idx The given GEP index
209 /// \p GEP The given GEP
210 /// \p UserChainTail Outputs the tail of UserChain so that we can
211 /// garbage-collect unused instructions in UserChain.
212 /// \p PreservesNUW Outputs whether the extraction allows preserving the
213 /// GEP's nuw flag, if it has one.
214 static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
215 User *&UserChainTail, bool &PreservesNUW);
216
217 /// Looks for a constant offset from the given GEP index without extracting
218 /// it. It returns the numeric value of the extracted constant offset (0 if
219 /// failed). The meaning of the arguments are the same as Extract.
220 static APInt Find(Value *Idx, GetElementPtrInst *GEP);
221
222private:
223 ConstantOffsetExtractor(BasicBlock::iterator InsertionPt)
224 : IP(InsertionPt), DL(InsertionPt->getDataLayout()), SQ(DL) {}
225
226 /// Searches the expression that computes V for a non-zero constant C s.t.
227 /// V can be reassociated into the form V' + C. If the searching is
228 /// successful, returns C and update UserChain as a def-use chain from C to V;
229 /// otherwise, UserChain is empty.
230 ///
231 /// \p V The given expression
232 /// \p GEP The base GEP instruction, used for determining relevant
233 /// types, flags, and non-negativity needed for safe
234 /// reassociation
235 /// \p Idx The original index of the GEP
236 /// \p SignExtended Whether V will be sign-extended in the computation of
237 /// the GEP index
238 /// \p ZeroExtended Whether V will be zero-extended in the computation of
239 /// the GEP index
240 APInt find(Value *V, GetElementPtrInst *GEP, Value *Idx, bool SignExtended,
241 bool ZeroExtended);
242
243 /// A helper function to look into both operands of a binary operator.
244 APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
245 bool ZeroExtended);
246
247 /// After finding the constant offset C from the GEP index I, we build a new
248 /// index I' s.t. I' + C = I. This function builds and returns the new
249 /// index I' according to UserChain produced by function "find".
250 ///
251 /// The building conceptually takes two steps:
252 /// 1) iteratively distribute sext/zext/trunc towards the leaves of the
253 /// expression tree that computes I
254 /// 2) reassociate the expression tree to the form I' + C.
255 ///
256 /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
257 /// sext to a, b and 5 so that we have
258 /// sext(a) + (sext(b) + 5).
259 /// Then, we reassociate it to
260 /// (sext(a) + sext(b)) + 5.
261 /// Given this form, we know I' is sext(a) + sext(b).
262 Value *rebuildWithoutConstOffset();
263
264 /// After the first step of rebuilding the GEP index without the constant
265 /// offset, distribute sext/zext/trunc to the operands of all operators in
266 /// UserChain. e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
267 /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
268 ///
269 /// The function also updates UserChain to point to new subexpressions after
270 /// distributing sext/zext/trunc. e.g., the old UserChain of the above example
271 /// is
272 /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
273 /// and the new UserChain is
274 /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
275 /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
276 ///
277 /// \p ChainIndex The index to UserChain. ChainIndex is initially
278 /// UserChain.size() - 1, and is decremented during
279 /// the recursion.
280 Value *distributeCastsAndCloneChain(unsigned ChainIndex);
281
282 /// Reassociates the GEP index to the form I' + C and returns I'.
283 Value *removeConstOffset(unsigned ChainIndex);
284
285 /// A helper function to apply CastInsts, a list of sext/zext/trunc, to value
286 /// V. e.g., if CastInsts = [sext i32 to i64, zext i16 to i32], this function
287 /// returns "sext i32 (zext i16 V to i32) to i64".
288 Value *applyCasts(Value *V);
289
290 /// A helper function that returns whether we can trace into the operands
291 /// of binary operator BO for a constant offset.
292 ///
293 /// \p SignExtended Whether BO is surrounded by sext
294 /// \p ZeroExtended Whether BO is surrounded by zext
295 /// \p GEP The base GEP instruction, used for determining relevant
296 /// types and flags needed for safe reassociation.
297 /// \p Idx The original index of the GEP
298 bool canTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
299 GetElementPtrInst *GEP, Value *Idx);
300
301 /// Analyze a xor expression, and identify the bits in the constant operand
302 /// that are disjoint from the base operand's known set bits. For these
303 /// disjoint bits, a xor is equivalent to an addition, which allows us to
304 /// extract them as constant offsets that can be folded into the immediate
305 /// field of addressing operations. The transformation is the following one:
306 ///
307 /// Base ^ Const becomes (Base ^ NonDisjointBits) + DisjointBits
308 ///
309 /// where DisjointBits = Const & KnownZeros(Base) and
310 /// NonDisjointBits = Const & ~DisjointBits.
311 ///
312 /// Example with ptr having known-zero low bit:
313 /// Original: `xor %ptr, 3` ; 3 = 0b11
314 /// Analysis: DisjointBits = 3 & KnownZeros(%ptr) = 0b11 & 0b01 = 0b01
315 /// Result: `(xor %ptr, 2) + 1` where 1 can be folded into address mode
316 ///
317 /// \param XorInst The XOR binary operator to analyze
318 /// \return Returns the disjoint bits (the extractable offset), or zero if
319 /// none exist. On success, stores NonDisjointBits in
320 /// NonDisjointXorConstantBits.
321 APInt extractDisjointBitsFromXor(BinaryOperator *XorInst);
322
323 /// The non-disjoint bits remaining after xor decomposition in
324 /// `extractDisjointBitsFromXor`, which are later used while replacing the
325 /// original xor constant operand.
326 ConstantInt *NonDisjointXorConstantBits = nullptr;
327
328 /// The path from the constant offset to the old GEP index. e.g., if the GEP
329 /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
330 /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
331 /// UserChain[2] will be the entire expression "a * b + (c + 5)".
332 ///
333 /// This path helps to rebuild the new GEP index.
334 SmallVector<User *, 8> UserChain;
335
336 /// A data structure used in rebuildWithoutConstOffset. Contains all
337 /// sext/zext/trunc instructions along UserChain.
338 SmallVector<CastInst *, 16> CastInsts;
339
340 /// Insertion position of cloned instructions.
341 BasicBlock::iterator IP;
342
343 const DataLayout &DL;
344 const SimplifyQuery SQ;
345};
346
347/// A pass that tries to split every GEP in the function into a variadic
348/// base and a constant offset. It is a FunctionPass because searching for the
349/// constant offset may inspect other basic blocks.
350class SeparateConstOffsetFromGEPLegacyPass : public FunctionPass {
351public:
352 static char ID;
353
354 SeparateConstOffsetFromGEPLegacyPass(bool LowerGEP = false)
355 : FunctionPass(ID), LowerGEP(LowerGEP) {
356 initializeSeparateConstOffsetFromGEPLegacyPassPass(
357 *PassRegistry::getPassRegistry());
358 }
359
360 void getAnalysisUsage(AnalysisUsage &AU) const override {
361 AU.addRequired<DominatorTreeWrapperPass>();
362 AU.addRequired<TargetTransformInfoWrapperPass>();
363 AU.addRequired<LoopInfoWrapperPass>();
364 AU.setPreservesCFG();
365 AU.addRequired<TargetLibraryInfoWrapperPass>();
366 }
367
368 bool runOnFunction(Function &F) override;
369
370private:
371 bool LowerGEP;
372};
373
374/// A pass that tries to split every GEP in the function into a variadic
375/// base and a constant offset. It is a FunctionPass because searching for the
376/// constant offset may inspect other basic blocks.
377class SeparateConstOffsetFromGEP {
378public:
379 SeparateConstOffsetFromGEP(
380 DominatorTree *DT, LoopInfo *LI, TargetLibraryInfo *TLI,
381 function_ref<TargetTransformInfo &(Function &)> GetTTI, bool LowerGEP)
382 : DT(DT), LI(LI), TLI(TLI), GetTTI(GetTTI), LowerGEP(LowerGEP) {}
383
384 bool run(Function &F);
385
386private:
387 /// Track the operands of an add or sub.
388 using ExprKey = std::pair<Value *, Value *>;
389
390 /// Create a pair for use as a map key for a commutable operation.
391 static ExprKey createNormalizedCommutablePair(Value *A, Value *B) {
392 if (A < B)
393 return {A, B};
394 return {B, A};
395 }
396
397 /// Tries to split the given GEP into a variadic base and a constant offset,
398 /// and returns true if the splitting succeeds.
399 bool splitGEP(GetElementPtrInst *GEP);
400
401 /// Tries to reorder the given GEP with the GEP that produces the base if
402 /// doing so results in producing a constant offset as the outermost
403 /// index.
404 bool reorderGEP(GetElementPtrInst *GEP, TargetTransformInfo &TTI);
405
406 /// Lower a GEP with multiple indices into multiple GEPs with a single index.
407 /// Function splitGEP already split the original GEP into a variadic part and
408 /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
409 /// variadic part into a set of GEPs with a single index and applies
410 /// AccumulativeByteOffset to it.
411 /// \p Variadic The variadic part of the original GEP.
412 /// \p AccumulativeByteOffset The constant offset.
413 void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
414 const APInt &AccumulativeByteOffset);
415
416 /// Finds the constant offset within each index and accumulates them. If
417 /// LowerGEP is true, it finds in indices of both sequential and structure
418 /// types, otherwise it only finds in sequential indices. The output
419 /// NeedsExtraction indicates whether we successfully find a non-zero constant
420 /// offset, and SignedOverflow indicates if there was signed overflow in
421 /// offset calculation.
422 APInt accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction,
423 bool &SignedOverflow);
424
425 /// Canonicalize array indices to pointer-size integers. This helps to
426 /// simplify the logic of splitting a GEP. For example, if a + b is a
427 /// pointer-size integer, we have
428 /// gep base, a + b = gep (gep base, a), b
429 /// However, this equality may not hold if the size of a + b is smaller than
430 /// the pointer size, because LLVM conceptually sign-extends GEP indices to
431 /// pointer size before computing the address
432 /// (http://llvm.org/docs/LangRef.html#id181).
433 ///
434 /// This canonicalization is very likely already done in clang and
435 /// instcombine. Therefore, the program will probably remain the same.
436 ///
437 /// Returns true if the module changes.
438 ///
439 /// Verified in @i32_add in split-gep.ll
440 bool canonicalizeArrayIndicesToIndexSize(GetElementPtrInst *GEP);
441
442 /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
443 /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
444 /// the constant offset. After extraction, it becomes desirable to reunion the
445 /// distributed sexts. For example,
446 ///
447 /// &a[sext(i +nsw (j +nsw 5)]
448 /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)]
449 /// => constant extraction &a[sext(i) + sext(j)] + 5
450 /// => reunion &a[sext(i +nsw j)] + 5
451 bool reuniteExts(Function &F);
452
453 /// A helper that reunites sexts in an instruction.
454 bool reuniteExts(Instruction *I);
455
456 /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
457 Instruction *findClosestMatchingDominator(
458 ExprKey Key, Instruction *Dominatee,
459 DenseMap<ExprKey, SmallVector<Instruction *, 2>> &DominatingExprs);
460
461 /// Verify F is free of dead code.
462 void verifyNoDeadCode(Function &F);
463
464 bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
465
466 // Swap the index operand of two GEP.
467 void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
468
469 // Check if it is safe to swap operand of two GEP.
470 bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
471 Loop *CurLoop);
472
473 const DataLayout *DL = nullptr;
474 DominatorTree *DT = nullptr;
475 LoopInfo *LI;
476 TargetLibraryInfo *TLI;
477 // Retrieved lazily since not always used.
478 function_ref<TargetTransformInfo &(Function &)> GetTTI;
479
480 /// Whether to lower a GEP with multiple indices into arithmetic operations or
481 /// multiple GEPs with a single index.
482 bool LowerGEP;
483
484 DenseMap<ExprKey, SmallVector<Instruction *, 2>> DominatingAdds;
485 DenseMap<ExprKey, SmallVector<Instruction *, 2>> DominatingSubs;
486};
487
488} // end anonymous namespace
489
490char SeparateConstOffsetFromGEPLegacyPass::ID = 0;
491
492INITIALIZE_PASS_BEGIN(
493 SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
494 "Split GEPs to a variadic base and a constant offset for better CSE", false,
495 false)
496INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
497INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
498INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
499INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
500INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
501INITIALIZE_PASS_END(
502 SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
503 "Split GEPs to a variadic base and a constant offset for better CSE", false,
504 false)
505
506FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
507 return new SeparateConstOffsetFromGEPLegacyPass(LowerGEP);
508}
509
510// Checks if it is safe to reorder an add/sext result used in a GEP.
511//
512// An inbounds GEP does not guarantee that the index is non-negative.
513// This helper checks first if the index is known non-negative. If the index is
514// non-negative, the transform is always safe.
515// Second, it checks whether the GEP is inbounds and directly based on a global
516// or an alloca, which are required to prove futher transform validity.
517// If the GEP:
518// - Has a zero offset from the base, the index is non-negative (any negative
519// value would produce poison/UB)
520// - Has ObjectSize < (2^(N-1) - C + 1) * stride, where C is a constant from the
521// add, stride is the element size of Idx, and N is bitwidth of Idx.
522// This is because with this pattern:
523// %add = add iN %val, C
524// %sext = sext iN %add to i64
525// %gep = getelementptr inbounds TYPE, %sext
526// The worst-case is when %val sign-flips to produce the smallest magnitude
527// negative value, at 2^(N-1)-1. In this case, the add/sext is -(2^(N-1)-C+1),
528// and the sext/add is 2^(N-1)+C-1 (2^N difference). The original add/sext
529// only produces a defined GEP when -(2^(N-1)-C+1) is inbounds. So, if
530// ObjectSize < (2^(N-1) - C + 1) * stride, it is impossible for the
531// worst-case sign-flip to be defined.
532// Note that in this case the GEP is not neccesarily non-negative, but any
533// negative results will still produce the same behavior in the reordered
534// version with a defined GEP.
535// This can also work for negative C, but the threshold is instead
536// (2^(N-1)+C)*stride, since the sign-flip is done in reverse and is instead
537// producing a large positive value that still needs to be inbounds to the
538// object size. If C is negative, we cannot make any useful assumptions based
539// on the offset, since it would need to be extremely large.
540static bool canReorderAddSextToGEP(const GetElementPtrInst *GEP,
541 const Value *Idx, const BinaryOperator *Add,
542 const DataLayout &DL) {
543 if (isKnownNonNegative(V: Idx, SQ: DL))
544 return true;
545
546 if (!GEP->isInBounds())
547 return false;
548
549 const Value *Ptr = GEP->getPointerOperand();
550 int64_t Offset = 0;
551 const Value *Base =
552 GetPointerBaseWithConstantOffset(Ptr: const_cast<Value *>(Ptr), Offset, DL);
553
554 // We need one of the operands to be a constant to be able to trace into the
555 // operator.
556 const ConstantInt *CI = dyn_cast<ConstantInt>(Val: Add->getOperand(i_nocapture: 0));
557 if (!CI)
558 CI = dyn_cast<ConstantInt>(Val: Add->getOperand(i_nocapture: 1));
559 if (!CI)
560 return false;
561 // Calculate the threshold
562 APInt Threshold;
563 unsigned N = Add->getType()->getIntegerBitWidth();
564 TypeSize ElemSize = DL.getTypeAllocSize(Ty: GEP->getSourceElementType());
565 if (ElemSize.isScalable())
566 return false;
567 uint64_t Stride = ElemSize.getFixedValue();
568 if (!CI->isNegative()) {
569 // (2^(N-1) - C + 1) * stride
570 Threshold = (APInt::getSignedMinValue(numBits: N).zext(width: 128) -
571 CI->getValue().zextOrTrunc(width: 128) + 1) *
572 APInt(128, Stride);
573 } else {
574 // (2^(N-1) + C) * stride
575 Threshold = (APInt::getSignedMinValue(numBits: N).zext(width: 128) +
576 CI->getValue().sextOrTrunc(width: 128)) *
577 APInt(128, Stride);
578 }
579
580 if (Base && (isa<AllocaInst>(Val: Base) || isa<GlobalObject>(Val: Base)) &&
581 !CI->isNegative()) {
582 // If the offset is zero from an alloca or global, inbounds is sufficient to
583 // prove non-negativity if one add operand is non-negative
584 if (Offset == 0)
585 return true;
586
587 // Check if the Offset < Threshold (positive CI only) otherwise
588 if (Offset < 0)
589 return true;
590 if (APInt(128, (uint64_t)Offset).ult(RHS: Threshold))
591 return true;
592 } else {
593 // If we can't determine the offset from the base object, we can still use
594 // the underlying object and type size constraints
595 Base = getUnderlyingObject(V: Ptr);
596 // Can only prove non-negativity if the base object is known
597 if (!(isa<AllocaInst>(Val: Base) || isa<GlobalObject>(Val: Base)))
598 return false;
599 }
600
601 // Check if the ObjectSize < Threshold (for both positive or negative C)
602 uint64_t ObjSize = 0;
603 if (const auto *AI = dyn_cast<AllocaInst>(Val: Base)) {
604 if (auto AllocSize = AI->getAllocationSize(DL))
605 if (!AllocSize->isScalable())
606 ObjSize = AllocSize->getFixedValue();
607 } else if (const auto *GV = dyn_cast<GlobalVariable>(Val: Base)) {
608 TypeSize GVSize = DL.getTypeAllocSize(Ty: GV->getValueType());
609 if (!GVSize.isScalable())
610 ObjSize = GVSize.getFixedValue();
611 }
612 if (ObjSize > 0 && APInt(128, ObjSize).ult(RHS: Threshold))
613 return true;
614
615 return false;
616}
617
618bool ConstantOffsetExtractor::canTraceInto(bool SignExtended, bool ZeroExtended,
619 BinaryOperator *BO,
620 GetElementPtrInst *GEP, Value *Idx) {
621 // We only consider ADD, SUB and OR, because a non-zero constant found in
622 // expressions composed of these operations can be easily hoisted as a
623 // constant offset by reassociation.
624 if (BO->getOpcode() != Instruction::Add &&
625 BO->getOpcode() != Instruction::Sub &&
626 BO->getOpcode() != Instruction::Or) {
627 return false;
628 }
629
630 // Do not trace into "or" unless it is equivalent to "add nuw nsw".
631 // This is the case if the or's disjoint flag is set.
632 if (BO->getOpcode() == Instruction::Or &&
633 !cast<PossiblyDisjointInst>(Val: BO)->isDisjoint())
634 return false;
635
636 // FIXME: We don't currently support constants from the RHS of subs,
637 // when we are zero-extended, because we need a way to zero-extended
638 // them before they are negated.
639 if (ZeroExtended && !SignExtended && BO->getOpcode() == Instruction::Sub)
640 return false;
641
642 // In addition, tracing into BO requires that its surrounding sext/zext/trunc
643 // (if any) is distributable to both operands.
644 //
645 // Suppose BO = A op B.
646 // SignExtended | ZeroExtended | Distributable?
647 // --------------+--------------+----------------------------------
648 // 0 | 0 | true because no s/zext exists
649 // 0 | 1 | zext(BO) == zext(A) op zext(B)
650 // 1 | 0 | sext(BO) == sext(A) op sext(B)
651 // 1 | 1 | zext(sext(BO)) ==
652 // | | zext(sext(A)) op zext(sext(B))
653 if (BO->getOpcode() == Instruction::Add && !ZeroExtended && GEP) {
654 // If a + b >= 0 and (a >= 0 or b >= 0), then
655 // sext(a + b) = sext(a) + sext(b)
656 // even if the addition is not marked nsw.
657 //
658 // Leveraging this invariant, we can trace into an sext'ed inbound GEP
659 // index under certain conditions (see canReorderAddSextToGEP).
660 //
661 // Verified in @sext_add in split-gep.ll.
662 if (canReorderAddSextToGEP(GEP, Idx, Add: BO, DL))
663 return true;
664 }
665
666 // For a sext(add nuw), allow tracing through when the enclosing GEP is both
667 // inbounds and nuw.
668 bool GEPInboundsNUW =
669 GEP ? (GEP->isInBounds() && GEP->hasNoUnsignedWrap()) : false;
670 if (BO->getOpcode() == Instruction::Add && SignExtended && !ZeroExtended &&
671 GEPInboundsNUW && BO->hasNoUnsignedWrap())
672 return true;
673
674 // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
675 // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
676 if (BO->getOpcode() == Instruction::Add ||
677 BO->getOpcode() == Instruction::Sub) {
678 if (SignExtended && !BO->hasNoSignedWrap())
679 return false;
680 if (ZeroExtended && !BO->hasNoUnsignedWrap())
681 return false;
682 }
683
684 return true;
685}
686
687APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
688 bool SignExtended,
689 bool ZeroExtended) {
690 // Save off the current height of the chain, in case we need to restore it.
691 size_t ChainLength = UserChain.size();
692
693 // BO cannot use information from the base GEP at this point, so clear it.
694 APInt ConstantOffset =
695 find(V: BO->getOperand(i_nocapture: 0), GEP: nullptr, Idx: nullptr, SignExtended, ZeroExtended);
696 // If we found a constant offset in the left operand, stop and return that.
697 // This shortcut might cause us to miss opportunities of combining the
698 // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
699 // However, such cases are probably already handled by -instcombine,
700 // given this pass runs after the standard optimizations.
701 if (ConstantOffset != 0) return ConstantOffset;
702
703 // Reset the chain back to where it was when we started exploring this node,
704 // since visiting the LHS didn't pan out.
705 UserChain.resize(N: ChainLength);
706
707 ConstantOffset =
708 find(V: BO->getOperand(i_nocapture: 1), GEP: nullptr, Idx: nullptr, SignExtended, ZeroExtended);
709 // If U is a sub operator, negate the constant offset found in the right
710 // operand.
711 if (BO->getOpcode() == Instruction::Sub)
712 ConstantOffset = -ConstantOffset;
713
714 // If RHS wasn't a suitable candidate either, reset the chain again.
715 if (ConstantOffset == 0)
716 UserChain.resize(N: ChainLength);
717
718 return ConstantOffset;
719}
720
721APInt ConstantOffsetExtractor::find(Value *V, GetElementPtrInst *GEP,
722 Value *Idx, bool SignExtended,
723 bool ZeroExtended) {
724 // TODO(jingyue): We could trace into integer/pointer casts, such as
725 // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
726 // integers because it gives good enough results for our benchmarks.
727 unsigned BitWidth = cast<IntegerType>(Val: V->getType())->getBitWidth();
728
729 // We cannot do much with Values that are not a User, such as an Argument.
730 User *U = dyn_cast<User>(Val: V);
731 if (U == nullptr) return APInt(BitWidth, 0);
732
733 APInt ConstantOffset(BitWidth, 0);
734 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: V)) {
735 // Hooray, we found it!
736 ConstantOffset = CI->getValue();
737 } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Val: V)) {
738 // Trace into subexpressions for more hoisting opportunities.
739 if (canTraceInto(SignExtended, ZeroExtended, BO, GEP, Idx))
740 ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
741 else if (BO->getOpcode() == Instruction::Xor)
742 ConstantOffset = extractDisjointBitsFromXor(XorInst: BO);
743 } else if (isa<TruncInst>(Val: V)) {
744 ConstantOffset =
745 find(V: U->getOperand(i: 0), GEP, Idx, SignExtended, ZeroExtended)
746 .trunc(width: BitWidth);
747 } else if (isa<SExtInst>(Val: V)) {
748 ConstantOffset =
749 find(V: U->getOperand(i: 0), GEP, Idx, /* SignExtended */ true, ZeroExtended)
750 .sext(width: BitWidth);
751 } else if (isa<ZExtInst>(Val: V)) {
752 // As an optimization, we can clear the SignExtended flag because
753 // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
754 ConstantOffset = find(V: U->getOperand(i: 0), GEP, Idx, /* SignExtended */ false,
755 /* ZeroExtended */ true)
756 .zext(width: BitWidth);
757 }
758
759 // If we found a non-zero constant offset, add it to the path for
760 // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
761 // help this optimization.
762 if (ConstantOffset != 0)
763 UserChain.push_back(Elt: U);
764 return ConstantOffset;
765}
766
767Value *ConstantOffsetExtractor::applyCasts(Value *V) {
768 Value *Current = V;
769 // CastInsts is built in the use-def order. Therefore, we apply them to V
770 // in the reversed order.
771 for (CastInst *I : llvm::reverse(C&: CastInsts)) {
772 if (Constant *C = dyn_cast<Constant>(Val: Current)) {
773 // Try to constant fold the cast.
774 Current = ConstantFoldCastOperand(Opcode: I->getOpcode(), C, DestTy: I->getType(), DL);
775 if (Current)
776 continue;
777 }
778
779 Instruction *Cast = I->clone();
780 Cast->setOperand(i: 0, Val: Current);
781 // In ConstantOffsetExtractor::find we do not analyze nuw/nsw for trunc, so
782 // we assume that it is ok to redistribute trunc over add/sub/or. But for
783 // example (add (trunc nuw A), (trunc nuw B)) is more poisonous than (trunc
784 // nuw (add A, B))). To make such redistributions legal we drop all the
785 // poison generating flags from cloned trunc instructions here.
786 if (isa<TruncInst>(Val: Cast))
787 Cast->dropPoisonGeneratingFlags();
788 Cast->insertBefore(BB&: *IP->getParent(), InsertPos: IP);
789 Current = Cast;
790 }
791 return Current;
792}
793
794Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
795 distributeCastsAndCloneChain(ChainIndex: UserChain.size() - 1);
796 // Remove all nullptrs (used to be sext/zext/trunc) from UserChain.
797 unsigned NewSize = 0;
798 for (User *I : UserChain) {
799 if (I != nullptr) {
800 UserChain[NewSize] = I;
801 NewSize++;
802 }
803 }
804 UserChain.resize(N: NewSize);
805 return removeConstOffset(ChainIndex: UserChain.size() - 1);
806}
807
808Value *
809ConstantOffsetExtractor::distributeCastsAndCloneChain(unsigned ChainIndex) {
810 User *U = UserChain[ChainIndex];
811 if (ChainIndex == 0) {
812 assert(isa<ConstantInt>(U));
813 // If U is a ConstantInt, applyCasts will return a ConstantInt as well.
814 return UserChain[ChainIndex] = cast<ConstantInt>(Val: applyCasts(V: U));
815 }
816
817 if (CastInst *Cast = dyn_cast<CastInst>(Val: U)) {
818 assert(
819 (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
820 "Only following instructions can be traced: sext, zext & trunc");
821 CastInsts.push_back(Elt: Cast);
822 UserChain[ChainIndex] = nullptr;
823 return distributeCastsAndCloneChain(ChainIndex: ChainIndex - 1);
824 }
825
826 // Function find only trace into BinaryOperator and CastInst.
827 BinaryOperator *BO = cast<BinaryOperator>(Val: U);
828 // OpNo = which operand of BO is UserChain[ChainIndex - 1]
829 unsigned OpNo = (BO->getOperand(i_nocapture: 0) == UserChain[ChainIndex - 1] ? 0 : 1);
830 Value *TheOther = applyCasts(V: BO->getOperand(i_nocapture: 1 - OpNo));
831 Value *NextInChain = distributeCastsAndCloneChain(ChainIndex: ChainIndex - 1);
832
833 BinaryOperator *NewBO = nullptr;
834 if (OpNo == 0) {
835 NewBO = BinaryOperator::Create(Op: BO->getOpcode(), S1: NextInChain, S2: TheOther,
836 Name: BO->getName(), InsertBefore: IP);
837 } else {
838 NewBO = BinaryOperator::Create(Op: BO->getOpcode(), S1: TheOther, S2: NextInChain,
839 Name: BO->getName(), InsertBefore: IP);
840 }
841 return UserChain[ChainIndex] = NewBO;
842}
843
844Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
845 if (ChainIndex == 0) {
846 assert(isa<ConstantInt>(UserChain[ChainIndex]));
847 return ConstantInt::getNullValue(Ty: UserChain[ChainIndex]->getType());
848 }
849
850 BinaryOperator *BO = cast<BinaryOperator>(Val: UserChain[ChainIndex]);
851 assert((BO->use_empty() || BO->hasOneUse()) &&
852 "distributeCastsAndCloneChain clones each BinaryOperator in "
853 "UserChain, so no one should be used more than "
854 "once");
855
856 unsigned OpNo = (BO->getOperand(i_nocapture: 0) == UserChain[ChainIndex - 1] ? 0 : 1);
857 assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
858 Value *NextInChain = removeConstOffset(ChainIndex: ChainIndex - 1);
859 Value *TheOther = BO->getOperand(i_nocapture: 1 - OpNo);
860
861 // When rewriting xor(TheOther, NextInChain) expressions, the original
862 // constant operand is replaced with the non-disjoints bits, which are the
863 // non-extractable bits, i.e., those that must remain in the xor (the other
864 // bits have already compounded the GEP offset).
865 if (BO->getOpcode() == Instruction::Xor) {
866 // The non-disjoint bits are cached in NonDisjointXorConstantBits, which is
867 // always up-to-date.
868 assert(NonDisjointXorConstantBits &&
869 "XOR in UserChain without recorded non-disjoint bits");
870 // Only casts can happen to be distributed among the xor operands.
871 NextInChain = applyCasts(V: NonDisjointXorConstantBits);
872 }
873
874 // If NextInChain is 0 and not the LHS of a sub, we can simplify the
875 // sub-expression to be just TheOther.
876 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: NextInChain)) {
877 if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
878 return TheOther;
879 }
880
881 BinaryOperator::BinaryOps NewOp = BO->getOpcode();
882 if (BO->getOpcode() == Instruction::Or) {
883 // Rebuild "or" as "add", because "or" may be invalid for the new
884 // expression.
885 //
886 // For instance, given
887 // a | (b + 5) where a and b + 5 have no common bits,
888 // we can extract 5 as the constant offset.
889 //
890 // However, reusing the "or" in the new index would give us
891 // (a | b) + 5
892 // which does not equal a | (b + 5).
893 //
894 // Replacing the "or" with "add" is fine, because
895 // a | (b + 5) = a + (b + 5) = (a + b) + 5
896 NewOp = Instruction::Add;
897 }
898
899 BinaryOperator *NewBO;
900 if (OpNo == 0) {
901 NewBO = BinaryOperator::Create(Op: NewOp, S1: NextInChain, S2: TheOther, Name: "", InsertBefore: IP);
902 } else {
903 NewBO = BinaryOperator::Create(Op: NewOp, S1: TheOther, S2: NextInChain, Name: "", InsertBefore: IP);
904 }
905 NewBO->takeName(V: BO);
906 return NewBO;
907}
908
909APInt ConstantOffsetExtractor::extractDisjointBitsFromXor(
910 BinaryOperator *XorInst) {
911 assert(XorInst && XorInst->getOpcode() == Instruction::Xor &&
912 "Expected XOR instruction");
913
914 unsigned BitWidth = XorInst->getType()->getScalarSizeInBits();
915 Value *BaseOp;
916 ConstantInt *XorConstantOp;
917
918 if (!match(V: XorInst, P: m_Xor(L: m_Value(V&: BaseOp), R: m_ConstantInt(CI&: XorConstantOp))))
919 return APInt::getZero(numBits: BitWidth);
920
921 const KnownBits BaseKnownBits = computeKnownBits(V: BaseOp, Q: SQ);
922 const APInt &ConstantValue = XorConstantOp->getValue();
923
924 // Compute the disjoint bits, i.e., those bits of the constant operand that
925 // are known-zero in the base. These disjoint bits will contribute to the
926 // final GEP offset. If there are no disjoint bits, there isn't any offset to
927 // extract from the xor.
928 const APInt DisjointBits = ConstantValue & BaseKnownBits.Zero;
929 if (DisjointBits.isZero())
930 return DisjointBits;
931
932 // Avoid a pessimizing rewrite if the disjoint bits include the sign bit.
933 if (DisjointBits.isSignBitSet())
934 return APInt::getZero(numBits: BitWidth);
935
936 // Compute the remaining bits, i.e., the non-disjoint ones, which are those
937 // that must be preserved in the xor.
938 const APInt NonDisjointBits = ConstantValue & ~DisjointBits;
939 NonDisjointXorConstantBits =
940 ConstantInt::get(Context&: XorInst->getContext(), V: NonDisjointBits);
941
942 // UserChain maintains a path from the constant up to the GEP index. Push the
943 // xor constant operand, which is the constant leaf of the chain (which is
944 // also what `distributeCastsAndCloneChain` expects). Such a chained operand
945 // is the one to be replaced with the non-disjoint bits, while rebuilding the
946 // xor afterwards. The xor instruction itself is pushed upon returning.
947 UserChain.push_back(Elt: XorConstantOp);
948
949 return DisjointBits;
950}
951
952/// A helper function to check if reassociating through an entry in the user
953/// chain would invalidate the GEP's nuw flag.
954static bool allowsPreservingNUW(const User *U) {
955 if (const BinaryOperator *BO = dyn_cast<BinaryOperator>(Val: U)) {
956 // Binary operations need to be effectively add nuw.
957 auto Opcode = BO->getOpcode();
958 if (Opcode == BinaryOperator::Or) {
959 // Ors are only considered here if they are disjoint. The addition that
960 // they represent in this case is NUW.
961 assert(cast<PossiblyDisjointInst>(BO)->isDisjoint());
962 return true;
963 }
964 return Opcode == BinaryOperator::Add && BO->hasNoUnsignedWrap();
965 }
966 // UserChain can only contain ConstantInt, CastInst, or BinaryOperator.
967 // Among the possible CastInsts, only trunc without nuw is a problem: If it
968 // is distributed through an add nuw, wrapping may occur:
969 // "add nuw trunc(a), trunc(b)" is more poisonous than "trunc(add nuw a, b)"
970 if (const TruncInst *TI = dyn_cast<TruncInst>(Val: U))
971 return TI->hasNoUnsignedWrap();
972 assert((isa<CastInst>(U) || isa<ConstantInt>(U)) && "Unexpected User.");
973 return true;
974}
975
976Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
977 User *&UserChainTail,
978 bool &PreservesNUW) {
979 ConstantOffsetExtractor Extractor(GEP->getIterator());
980 // Find a non-zero constant offset first.
981 APInt ConstantOffset = Extractor.find(V: Idx, GEP, Idx, /* SignExtended */ false,
982 /* ZeroExtended */ false);
983 if (ConstantOffset == 0) {
984 UserChainTail = nullptr;
985 PreservesNUW = true;
986 return nullptr;
987 }
988
989 PreservesNUW = all_of(Range&: Extractor.UserChain, P: allowsPreservingNUW);
990
991 // Separates the constant offset from the GEP index.
992 Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
993 UserChainTail = Extractor.UserChain.back();
994 return IdxWithoutConstOffset;
995}
996
997APInt ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) {
998 return ConstantOffsetExtractor(GEP->getIterator())
999 .find(V: Idx, GEP, Idx, /* SignExtended */ false, /* ZeroExtended */ false);
1000}
1001
1002bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToIndexSize(
1003 GetElementPtrInst *GEP) {
1004 bool Changed = false;
1005 Type *PtrIdxTy = DL->getIndexType(PtrTy: GEP->getType());
1006 gep_type_iterator GTI = gep_type_begin(GEP: *GEP);
1007 for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
1008 I != E; ++I, ++GTI) {
1009 // Skip struct member indices which must be i32.
1010 if (GTI.isSequential()) {
1011 if ((*I)->getType() != PtrIdxTy) {
1012 *I = CastInst::CreateIntegerCast(S: *I, Ty: PtrIdxTy, isSigned: true, Name: "idxprom",
1013 InsertBefore: GEP->getIterator());
1014 Changed = true;
1015 }
1016 }
1017 }
1018 return Changed;
1019}
1020
1021APInt SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
1022 bool &NeedsExtraction,
1023 bool &SignedOverflow) {
1024 NeedsExtraction = false;
1025 SignedOverflow = false;
1026 unsigned IdxWidth = DL->getIndexTypeSizeInBits(Ty: GEP->getType());
1027 APInt AccumulativeByteOffset(IdxWidth, 0);
1028 gep_type_iterator GTI = gep_type_begin(GEP: *GEP);
1029 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
1030 if (GTI.isSequential()) {
1031 // Constant offsets of scalable types are not really constant.
1032 if (GTI.getIndexedType()->isScalableTy())
1033 continue;
1034
1035 // Tries to extract a constant offset from this GEP index.
1036 APInt ConstantOffset =
1037 ConstantOffsetExtractor::Find(Idx: GEP->getOperand(i_nocapture: I), GEP)
1038 .sextOrTrunc(width: IdxWidth);
1039 if (ConstantOffset != 0) {
1040 NeedsExtraction = true;
1041 // A GEP may have multiple indices. We accumulate the extracted
1042 // constant offset to a byte offset, and later offset the remainder of
1043 // the original GEP with this byte offset.
1044 bool Overflow;
1045 auto ByteOffset = ConstantOffset.smul_ov(
1046 RHS: APInt(IdxWidth, GTI.getSequentialElementStride(DL: *DL),
1047 /*IsSigned=*/true, /*ImplicitTrunc=*/true),
1048 Overflow);
1049 SignedOverflow |= Overflow;
1050 AccumulativeByteOffset =
1051 AccumulativeByteOffset.sadd_ov(RHS: ByteOffset, Overflow);
1052 SignedOverflow |= Overflow;
1053 }
1054 } else if (LowerGEP) {
1055 StructType *StTy = GTI.getStructType();
1056 uint64_t Field = cast<ConstantInt>(Val: GEP->getOperand(i_nocapture: I))->getZExtValue();
1057 // Skip field 0 as the offset is always 0.
1058 if (Field != 0) {
1059 NeedsExtraction = true;
1060 AccumulativeByteOffset +=
1061 APInt(IdxWidth, DL->getStructLayout(Ty: StTy)->getElementOffset(Idx: Field),
1062 /*IsSigned=*/true, /*ImplicitTrunc=*/true);
1063 }
1064 }
1065 }
1066 return AccumulativeByteOffset;
1067}
1068
1069void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
1070 GetElementPtrInst *Variadic, const APInt &AccumulativeByteOffset) {
1071 IRBuilder<> Builder(Variadic);
1072 Type *PtrIndexTy = DL->getIndexType(PtrTy: Variadic->getType());
1073
1074 Value *ResultPtr = Variadic->getOperand(i_nocapture: 0);
1075 Loop *L = LI->getLoopFor(BB: Variadic->getParent());
1076 // Check if the base is not loop invariant or used more than once.
1077 bool isSwapCandidate =
1078 L && L->isLoopInvariant(V: ResultPtr) &&
1079 !hasMoreThanOneUseInLoop(v: ResultPtr, L);
1080 Value *FirstResult = nullptr;
1081
1082 gep_type_iterator GTI = gep_type_begin(GEP: *Variadic);
1083 // Create an ugly GEP for each sequential index. We don't create GEPs for
1084 // structure indices, as they are accumulated in the constant offset index.
1085 for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
1086 if (GTI.isSequential()) {
1087 Value *Idx = Variadic->getOperand(i_nocapture: I);
1088 // Skip zero indices.
1089 if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: Idx))
1090 if (CI->isZero())
1091 continue;
1092
1093 APInt ElementSize = APInt(PtrIndexTy->getIntegerBitWidth(),
1094 GTI.getSequentialElementStride(DL: *DL));
1095 // Scale the index by element size.
1096 if (ElementSize != 1) {
1097 if (ElementSize.isPowerOf2()) {
1098 Idx = Builder.CreateShl(
1099 LHS: Idx, RHS: ConstantInt::get(Ty: PtrIndexTy, V: ElementSize.logBase2()));
1100 } else {
1101 Idx =
1102 Builder.CreateMul(LHS: Idx, RHS: ConstantInt::get(Ty: PtrIndexTy, V: ElementSize));
1103 }
1104 }
1105 // Create an ugly GEP with a single index for each index.
1106 ResultPtr = Builder.CreatePtrAdd(Ptr: ResultPtr, Offset: Idx, Name: "uglygep");
1107 if (FirstResult == nullptr)
1108 FirstResult = ResultPtr;
1109 }
1110 }
1111
1112 // Create a GEP with the constant offset index.
1113 if (AccumulativeByteOffset != 0) {
1114 Value *Offset = ConstantInt::get(Ty: PtrIndexTy, V: AccumulativeByteOffset);
1115 ResultPtr = Builder.CreatePtrAdd(Ptr: ResultPtr, Offset, Name: "uglygep");
1116 } else
1117 isSwapCandidate = false;
1118
1119 // If we created a GEP with constant index, and the base is loop invariant,
1120 // then we swap the first one with it, so LICM can move constant GEP out
1121 // later.
1122 auto *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(Val: FirstResult);
1123 auto *SecondGEP = dyn_cast<GetElementPtrInst>(Val: ResultPtr);
1124 if (isSwapCandidate && isLegalToSwapOperand(First: FirstGEP, Second: SecondGEP, CurLoop: L))
1125 swapGEPOperand(First: FirstGEP, Second: SecondGEP);
1126
1127 Variadic->replaceAllUsesWith(V: ResultPtr);
1128 Variadic->eraseFromParent();
1129}
1130
1131bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
1132 TargetTransformInfo &TTI) {
1133 auto PtrGEP = dyn_cast<GetElementPtrInst>(Val: GEP->getPointerOperand());
1134 if (!PtrGEP)
1135 return false;
1136
1137 bool NestedNeedsExtraction, OffsetOverflow;
1138 APInt NestedByteOffset =
1139 accumulateByteOffset(GEP: PtrGEP, NeedsExtraction&: NestedNeedsExtraction, SignedOverflow&: OffsetOverflow);
1140 if (!NestedNeedsExtraction)
1141 return false;
1142
1143 unsigned AddrSpace = PtrGEP->getPointerAddressSpace();
1144 if (!TTI.isLegalAddressingMode(Ty: GEP->getResultElementType(),
1145 /*BaseGV=*/nullptr,
1146 BaseOffset: NestedByteOffset.getSExtValue(),
1147 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
1148 return false;
1149
1150 bool GEPInBounds = GEP->isInBounds();
1151 bool PtrGEPInBounds = PtrGEP->isInBounds();
1152 bool IsChainInBounds = GEPInBounds && PtrGEPInBounds;
1153 if (IsChainInBounds) {
1154 auto IsKnownNonNegative = [this](Value *V) {
1155 return isKnownNonNegative(V, SQ: *DL);
1156 };
1157 IsChainInBounds &= all_of(Range: GEP->indices(), P: IsKnownNonNegative);
1158 if (IsChainInBounds)
1159 IsChainInBounds &= all_of(Range: PtrGEP->indices(), P: IsKnownNonNegative);
1160 }
1161
1162 IRBuilder<> Builder(GEP);
1163 // For trivial GEP chains, we can swap the indices.
1164 Value *NewSrc = Builder.CreateGEP(
1165 Ty: GEP->getSourceElementType(), Ptr: PtrGEP->getPointerOperand(),
1166 IdxList: SmallVector<Value *, 4>(GEP->indices()), Name: "", NW: IsChainInBounds);
1167 Value *NewGEP = Builder.CreateGEP(Ty: PtrGEP->getSourceElementType(), Ptr: NewSrc,
1168 IdxList: SmallVector<Value *, 4>(PtrGEP->indices()),
1169 Name: "", NW: IsChainInBounds);
1170 GEP->replaceAllUsesWith(V: NewGEP);
1171 RecursivelyDeleteTriviallyDeadInstructions(V: GEP);
1172 return true;
1173}
1174
1175bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
1176 // Skip vector GEPs.
1177 if (GEP->getType()->isVectorTy())
1178 return false;
1179
1180 // If the base of this GEP is a ptradd of a constant, lets pass the constant
1181 // along. This ensures that when we have a chain of GEPs the constant
1182 // offset from each is accumulated.
1183 Value *NewBase;
1184 const APInt *BaseOffset;
1185 bool ExtractBase = match(V: GEP->getPointerOperand(),
1186 P: m_PtrAdd(PointerOp: m_Value(V&: NewBase), OffsetOp: m_APInt(Res&: BaseOffset)));
1187
1188 unsigned IdxWidth = DL->getIndexTypeSizeInBits(Ty: GEP->getType());
1189 APInt BaseByteOffset =
1190 ExtractBase ? BaseOffset->sextOrTrunc(width: IdxWidth) : APInt(IdxWidth, 0);
1191
1192 // The backend can already nicely handle the case where all indices are
1193 // constant.
1194 if (GEP->hasAllConstantIndices() && !ExtractBase)
1195 return false;
1196
1197 bool Changed = canonicalizeArrayIndicesToIndexSize(GEP);
1198
1199 bool NeedsExtraction, OffsetOverflow;
1200 APInt NonBaseByteOffset =
1201 accumulateByteOffset(GEP, NeedsExtraction, SignedOverflow&: OffsetOverflow);
1202 bool AddOverflow;
1203 APInt AccumulativeByteOffset =
1204 BaseByteOffset.sadd_ov(RHS: NonBaseByteOffset, Overflow&: AddOverflow);
1205 OffsetOverflow |= AddOverflow;
1206
1207 TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
1208
1209 if (!NeedsExtraction && !ExtractBase) {
1210 Changed |= reorderGEP(GEP, TTI);
1211 return Changed;
1212 }
1213
1214 // If LowerGEP is disabled, before really splitting the GEP, check whether the
1215 // backend supports the addressing mode we are about to produce. If no, this
1216 // splitting probably won't be beneficial.
1217 // If LowerGEP is enabled, even the extracted constant offset can not match
1218 // the addressing mode, we can still do optimizations to other lowered parts
1219 // of variable indices. Therefore, we don't check for addressing modes in that
1220 // case.
1221 if (!LowerGEP) {
1222 unsigned AddrSpace = GEP->getPointerAddressSpace();
1223 if (!TTI.isLegalAddressingMode(
1224 Ty: GEP->getResultElementType(),
1225 /*BaseGV=*/nullptr, BaseOffset: AccumulativeByteOffset.getSExtValue(),
1226 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace)) {
1227 // If the addressing mode was not legal and the base byte offset was not
1228 // 0, it could be a case where the total offset became too large for
1229 // the addressing mode. Try again without extracting the base offset.
1230 if (!ExtractBase)
1231 return Changed;
1232 ExtractBase = false;
1233 BaseByteOffset = APInt(IdxWidth, 0);
1234 AccumulativeByteOffset = NonBaseByteOffset;
1235 if (!TTI.isLegalAddressingMode(
1236 Ty: GEP->getResultElementType(),
1237 /*BaseGV=*/nullptr, BaseOffset: AccumulativeByteOffset.getSExtValue(),
1238 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
1239 return Changed;
1240 // We can proceed with just extracting the other (non-base) offsets.
1241 NeedsExtraction = true;
1242 }
1243 }
1244
1245 // Track information for preserving GEP flags.
1246 bool AllOffsetsNonNegative =
1247 AccumulativeByteOffset.isNonNegative() && !OffsetOverflow;
1248 bool AllNUWPreserved = GEP->hasNoUnsignedWrap();
1249 bool NewGEPInBounds = GEP->isInBounds();
1250 bool NewGEPNUSW = GEP->hasNoUnsignedSignedWrap();
1251
1252 // Remove the constant offset in each sequential index. The resultant GEP
1253 // computes the variadic base.
1254 // Notice that we don't remove struct field indices here. If LowerGEP is
1255 // disabled, a structure index is not accumulated and we still use the old
1256 // one. If LowerGEP is enabled, a structure index is accumulated in the
1257 // constant offset. LowerToSingleIndexGEPs will later handle the constant
1258 // offset and won't need a new structure index.
1259 gep_type_iterator GTI = gep_type_begin(GEP: *GEP);
1260 for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
1261 if (GTI.isSequential()) {
1262 // Constant offsets of scalable types are not really constant.
1263 if (GTI.getIndexedType()->isScalableTy())
1264 continue;
1265
1266 // Splits this GEP index into a variadic part and a constant offset, and
1267 // uses the variadic part as the new index.
1268 Value *Idx = GEP->getOperand(i_nocapture: I);
1269 User *UserChainTail;
1270 bool PreservesNUW;
1271 Value *NewIdx = ConstantOffsetExtractor::Extract(Idx, GEP, UserChainTail,
1272 PreservesNUW);
1273 if (NewIdx != nullptr) {
1274 // Switches to the index with the constant offset removed.
1275 GEP->setOperand(i_nocapture: I, Val_nocapture: NewIdx);
1276 // After switching to the new index, we can garbage-collect UserChain
1277 // and the old index if they are not used.
1278 RecursivelyDeleteTriviallyDeadInstructions(V: UserChainTail);
1279 RecursivelyDeleteTriviallyDeadInstructions(V: Idx);
1280 Idx = NewIdx;
1281 AllNUWPreserved &= PreservesNUW;
1282 }
1283 AllOffsetsNonNegative =
1284 AllOffsetsNonNegative && isKnownNonNegative(V: Idx, SQ: *DL);
1285 }
1286 }
1287 if (ExtractBase) {
1288 GEPOperator *Base = cast<GEPOperator>(Val: GEP->getPointerOperand());
1289 AllNUWPreserved &= Base->hasNoUnsignedWrap();
1290 NewGEPInBounds &= Base->isInBounds();
1291 NewGEPNUSW &= Base->hasNoUnsignedSignedWrap();
1292 AllOffsetsNonNegative &= BaseByteOffset.isNonNegative();
1293
1294 GEP->setOperand(i_nocapture: 0, Val_nocapture: NewBase);
1295 RecursivelyDeleteTriviallyDeadInstructions(V: Base);
1296 }
1297
1298 // Clear the inbounds attribute because the new index may be off-bound.
1299 // e.g.,
1300 //
1301 // b = add i64 a, 5
1302 // addr = gep inbounds float, float* p, i64 b
1303 //
1304 // is transformed to:
1305 //
1306 // addr2 = gep float, float* p, i64 a ; inbounds removed
1307 // addr = gep float, float* addr2, i64 5 ; inbounds removed
1308 //
1309 // If a is -4, although the old index b is in bounds, the new index a is
1310 // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
1311 // inbounds keyword is not present, the offsets are added to the base
1312 // address with silently-wrapping two's complement arithmetic".
1313 // Therefore, the final code will be a semantically equivalent.
1314 GEPNoWrapFlags NewGEPFlags = GEPNoWrapFlags::none();
1315
1316 // If the initial GEP was inbounds/nusw and all variable indices and the
1317 // accumulated offsets are non-negative, they can be added in any order and
1318 // the intermediate results are in bounds and don't overflow in a nusw sense.
1319 // So, we can preserve the inbounds/nusw flag for both GEPs.
1320 bool CanPreserveInBoundsNUSW = AllOffsetsNonNegative;
1321
1322 // If the initial GEP was NUW and all operations that we reassociate were NUW
1323 // additions, the resulting GEPs are also NUW.
1324 if (AllNUWPreserved) {
1325 NewGEPFlags |= GEPNoWrapFlags::noUnsignedWrap();
1326 // If the initial GEP additionally had NUSW (or inbounds, which implies
1327 // NUSW), we know that the indices in the initial GEP must all have their
1328 // signbit not set. For indices that are the result of NUW adds, the
1329 // add-operands therefore also don't have their signbit set. Therefore, all
1330 // indices of the resulting GEPs are non-negative -> we can preserve
1331 // the inbounds/nusw flag.
1332 CanPreserveInBoundsNUSW |= NewGEPNUSW;
1333 }
1334
1335 if (CanPreserveInBoundsNUSW) {
1336 if (NewGEPInBounds)
1337 NewGEPFlags |= GEPNoWrapFlags::inBounds();
1338 else if (NewGEPNUSW)
1339 NewGEPFlags |= GEPNoWrapFlags::noUnsignedSignedWrap();
1340 }
1341
1342 GEP->setNoWrapFlags(NewGEPFlags);
1343
1344 // Lowers a GEP to GEPs with a single index.
1345 if (LowerGEP) {
1346 lowerToSingleIndexGEPs(Variadic: GEP, AccumulativeByteOffset);
1347 return true;
1348 }
1349
1350 // No need to create another GEP if the accumulative byte offset is 0.
1351 if (AccumulativeByteOffset == 0)
1352 return true;
1353
1354 // Offsets the base with the accumulative byte offset.
1355 //
1356 // %gep ; the base
1357 // ... %gep ...
1358 //
1359 // => add the offset
1360 //
1361 // %gep2 ; clone of %gep
1362 // %new.gep = gep i8, %gep2, %offset
1363 // %gep ; will be removed
1364 // ... %gep ...
1365 //
1366 // => replace all uses of %gep with %new.gep and remove %gep
1367 //
1368 // %gep2 ; clone of %gep
1369 // %new.gep = gep i8, %gep2, %offset
1370 // ... %new.gep ...
1371 Instruction *NewGEP = GEP->clone();
1372 NewGEP->insertBefore(InsertPos: GEP->getIterator());
1373
1374 Type *PtrIdxTy = DL->getIndexType(PtrTy: GEP->getType());
1375 IRBuilder<> Builder(GEP);
1376 NewGEP = cast<Instruction>(Val: Builder.CreatePtrAdd(
1377 Ptr: NewGEP, Offset: ConstantInt::get(Ty: PtrIdxTy, V: AccumulativeByteOffset),
1378 Name: GEP->getName(), NW: NewGEPFlags));
1379 NewGEP->copyMetadata(SrcInst: *GEP);
1380
1381 GEP->replaceAllUsesWith(V: NewGEP);
1382 GEP->eraseFromParent();
1383
1384 return true;
1385}
1386
1387bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
1388 if (skipFunction(F))
1389 return false;
1390 auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1391 auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1392 auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
1393 auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
1394 return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1395 };
1396 SeparateConstOffsetFromGEP Impl(DT, LI, TLI, GetTTI, LowerGEP);
1397 return Impl.run(F);
1398}
1399
1400bool SeparateConstOffsetFromGEP::run(Function &F) {
1401 if (DisableSeparateConstOffsetFromGEP)
1402 return false;
1403
1404 DL = &F.getDataLayout();
1405 bool Changed = false;
1406
1407 ReversePostOrderTraversal<Function *> RPOT(&F);
1408 for (BasicBlock *B : RPOT) {
1409 if (!DT->isReachableFromEntry(A: B))
1410 continue;
1411
1412 for (Instruction &I : llvm::make_early_inc_range(Range&: *B))
1413 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &I))
1414 Changed |= splitGEP(GEP);
1415 // No need to split GEP ConstantExprs because all its indices are constant
1416 // already.
1417 }
1418
1419 Changed |= reuniteExts(F);
1420
1421 if (VerifyNoDeadCode)
1422 verifyNoDeadCode(F);
1423
1424 return Changed;
1425}
1426
1427Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
1428 ExprKey Key, Instruction *Dominatee,
1429 DenseMap<ExprKey, SmallVector<Instruction *, 2>> &DominatingExprs) {
1430 auto Pos = DominatingExprs.find(Val: Key);
1431 if (Pos == DominatingExprs.end())
1432 return nullptr;
1433
1434 auto &Candidates = Pos->second;
1435 // Because we process the basic blocks in pre-order of the dominator tree, a
1436 // candidate that doesn't dominate the current instruction won't dominate any
1437 // future instruction either. Therefore, we pop it out of the stack. This
1438 // optimization makes the algorithm O(n).
1439 while (!Candidates.empty()) {
1440 Instruction *Candidate = Candidates.back();
1441 if (DT->dominates(Def: Candidate, User: Dominatee))
1442 return Candidate;
1443 Candidates.pop_back();
1444 }
1445 return nullptr;
1446}
1447
1448bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
1449 if (!I->getType()->isIntOrIntVectorTy())
1450 return false;
1451
1452 // Dom: LHS+RHS
1453 // I: sext(LHS)+sext(RHS)
1454 // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
1455 // TODO: handle zext
1456 Value *LHS = nullptr, *RHS = nullptr;
1457 if (match(V: I, P: m_Add(L: m_SExt(Op: m_Value(V&: LHS)), R: m_SExt(Op: m_Value(V&: RHS))))) {
1458 if (LHS->getType() == RHS->getType()) {
1459 ExprKey Key = createNormalizedCommutablePair(A: LHS, B: RHS);
1460 if (auto *Dom = findClosestMatchingDominator(Key, Dominatee: I, DominatingExprs&: DominatingAdds)) {
1461 Instruction *NewSExt =
1462 new SExtInst(Dom, I->getType(), "", I->getIterator());
1463 NewSExt->takeName(V: I);
1464 I->replaceAllUsesWith(V: NewSExt);
1465 NewSExt->setDebugLoc(I->getDebugLoc());
1466 RecursivelyDeleteTriviallyDeadInstructions(V: I);
1467 return true;
1468 }
1469 }
1470 } else if (match(V: I, P: m_Sub(L: m_SExt(Op: m_Value(V&: LHS)), R: m_SExt(Op: m_Value(V&: RHS))))) {
1471 if (LHS->getType() == RHS->getType()) {
1472 if (auto *Dom =
1473 findClosestMatchingDominator(Key: {LHS, RHS}, Dominatee: I, DominatingExprs&: DominatingSubs)) {
1474 Instruction *NewSExt =
1475 new SExtInst(Dom, I->getType(), "", I->getIterator());
1476 NewSExt->takeName(V: I);
1477 I->replaceAllUsesWith(V: NewSExt);
1478 NewSExt->setDebugLoc(I->getDebugLoc());
1479 RecursivelyDeleteTriviallyDeadInstructions(V: I);
1480 return true;
1481 }
1482 }
1483 }
1484
1485 // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
1486 if (match(V: I, P: m_NSWAdd(L: m_Value(V&: LHS), R: m_Value(V&: RHS)))) {
1487 if (programUndefinedIfPoison(Inst: I)) {
1488 ExprKey Key = createNormalizedCommutablePair(A: LHS, B: RHS);
1489 DominatingAdds[Key].push_back(Elt: I);
1490 }
1491 } else if (match(V: I, P: m_NSWSub(L: m_Value(V&: LHS), R: m_Value(V&: RHS)))) {
1492 if (programUndefinedIfPoison(Inst: I))
1493 DominatingSubs[{LHS, RHS}].push_back(Elt: I);
1494 }
1495 return false;
1496}
1497
1498bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
1499 bool Changed = false;
1500 DominatingAdds.clear();
1501 DominatingSubs.clear();
1502 for (const auto Node : depth_first(G: DT)) {
1503 BasicBlock *BB = Node->getBlock();
1504 for (Instruction &I : llvm::make_early_inc_range(Range&: *BB))
1505 Changed |= reuniteExts(I: &I);
1506 }
1507 return Changed;
1508}
1509
1510void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
1511 for (BasicBlock &B : F) {
1512 for (Instruction &I : B) {
1513 if (isInstructionTriviallyDead(I: &I)) {
1514 std::string ErrMessage;
1515 raw_string_ostream RSO(ErrMessage);
1516 RSO << "Dead instruction detected!\n" << I << "\n";
1517 llvm_unreachable(RSO.str().c_str());
1518 }
1519 }
1520 }
1521}
1522
1523bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
1524 GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
1525 if (!FirstGEP || !FirstGEP->hasOneUse())
1526 return false;
1527
1528 if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
1529 return false;
1530
1531 if (FirstGEP == SecondGEP)
1532 return false;
1533
1534 unsigned FirstNum = FirstGEP->getNumOperands();
1535 unsigned SecondNum = SecondGEP->getNumOperands();
1536 // Give up if the number of operands are not 2.
1537 if (FirstNum != SecondNum || FirstNum != 2)
1538 return false;
1539
1540 Value *FirstBase = FirstGEP->getOperand(i_nocapture: 0);
1541 Value *SecondBase = SecondGEP->getOperand(i_nocapture: 0);
1542 Value *FirstOffset = FirstGEP->getOperand(i_nocapture: 1);
1543 // Give up if the index of the first GEP is loop invariant.
1544 if (CurLoop->isLoopInvariant(V: FirstOffset))
1545 return false;
1546
1547 // Give up if base doesn't have same type.
1548 if (FirstBase->getType() != SecondBase->getType())
1549 return false;
1550
1551 Instruction *FirstOffsetDef = dyn_cast<Instruction>(Val: FirstOffset);
1552
1553 // Check if the second operand of first GEP has constant coefficient.
1554 // For an example, for the following code, we won't gain anything by
1555 // hoisting the second GEP out because the second GEP can be folded away.
1556 // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
1557 // %67 = shl i64 %scevgep.sum.ur159, 2
1558 // %uglygep160 = getelementptr i8* %65, i64 %67
1559 // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
1560
1561 // Skip constant shift instruction which may be generated by Splitting GEPs.
1562 if (FirstOffsetDef && FirstOffsetDef->isShift() &&
1563 isa<ConstantInt>(Val: FirstOffsetDef->getOperand(i: 1)))
1564 FirstOffsetDef = dyn_cast<Instruction>(Val: FirstOffsetDef->getOperand(i: 0));
1565
1566 // Give up if FirstOffsetDef is an Add or Sub with constant.
1567 // Because it may not profitable at all due to constant folding.
1568 if (FirstOffsetDef)
1569 if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Val: FirstOffsetDef)) {
1570 unsigned opc = BO->getOpcode();
1571 if ((opc == Instruction::Add || opc == Instruction::Sub) &&
1572 (isa<ConstantInt>(Val: BO->getOperand(i_nocapture: 0)) ||
1573 isa<ConstantInt>(Val: BO->getOperand(i_nocapture: 1))))
1574 return false;
1575 }
1576 return true;
1577}
1578
1579bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
1580 // TODO: Could look at uses of globals, but we need to make sure we are
1581 // looking at the correct function.
1582 if (isa<Constant>(Val: V))
1583 return false;
1584
1585 int UsesInLoop = 0;
1586 for (User *U : V->users()) {
1587 if (Instruction *User = dyn_cast<Instruction>(Val: U))
1588 if (L->contains(Inst: User))
1589 if (++UsesInLoop > 1)
1590 return true;
1591 }
1592 return false;
1593}
1594
1595void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
1596 GetElementPtrInst *Second) {
1597 Value *Offset1 = First->getOperand(i_nocapture: 1);
1598 Value *Offset2 = Second->getOperand(i_nocapture: 1);
1599 First->setOperand(i_nocapture: 1, Val_nocapture: Offset2);
1600 Second->setOperand(i_nocapture: 1, Val_nocapture: Offset1);
1601
1602 // After changing (p+o)+c to (p+c)+o, the inner GEP may not be inbounds
1603 // anymore.
1604 const DataLayout &DAL = First->getDataLayout();
1605 unsigned IdxBits = DAL.getIndexSizeInBits(
1606 AS: cast<PointerType>(Val: First->getType())->getAddressSpace());
1607
1608 auto ClearNoWrapFlags = [&] {
1609 // TODO(gep_nowrap): Make flag preservation more precise.
1610 First->setNoWrapFlags(GEPNoWrapFlags::none());
1611 Second->setNoWrapFlags(GEPNoWrapFlags::none());
1612 };
1613
1614 APInt FirstOffset(IdxBits, 0);
1615 if (!First->accumulateConstantOffset(DL: DAL, Offset&: FirstOffset)) {
1616 ClearNoWrapFlags();
1617 return;
1618 }
1619
1620 APInt BaseOffset(IdxBits, 0);
1621 Value *NewBase =
1622 First->getOperand(i_nocapture: 0)->stripAndAccumulateInBoundsConstantOffsets(
1623 DL: DAL, Offset&: BaseOffset);
1624
1625 bool Overflow = false;
1626 APInt TotalOffset = BaseOffset.uadd_ov(RHS: FirstOffset, Overflow);
1627 uint64_t ObjectSize;
1628 if (Overflow || !getObjectSize(Ptr: NewBase, Size&: ObjectSize, DL: DAL, TLI) ||
1629 TotalOffset.ugt(RHS: ObjectSize)) {
1630 ClearNoWrapFlags();
1631 return;
1632 }
1633
1634 First->setIsInBounds(true);
1635}
1636
1637void SeparateConstOffsetFromGEPPass::printPipeline(
1638 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
1639 static_cast<PassInfoMixin<SeparateConstOffsetFromGEPPass> *>(this)
1640 ->printPipeline(OS, MapClassName2PassName);
1641 OS << '<';
1642 if (LowerGEP)
1643 OS << "lower-gep";
1644 OS << '>';
1645}
1646
1647PreservedAnalyses
1648SeparateConstOffsetFromGEPPass::run(Function &F, FunctionAnalysisManager &AM) {
1649 auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
1650 auto *LI = &AM.getResult<LoopAnalysis>(IR&: F);
1651 auto *TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F);
1652 auto GetTTI = [&AM](Function &F) -> TargetTransformInfo & {
1653 return AM.getResult<TargetIRAnalysis>(IR&: F);
1654 };
1655 SeparateConstOffsetFromGEP Impl(DT, LI, TLI, GetTTI, LowerGEP);
1656 if (!Impl.run(F))
1657 return PreservedAnalyses::all();
1658 PreservedAnalyses PA;
1659 PA.preserveSet<CFGAnalyses>();
1660 return PA;
1661}
1662