//===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains an optimization for div and rem on architectures that
// execute short instructions significantly faster than longer instructions.
// For example, on Intel Atom, 32-bit divides are slow enough that at run time
// it is profitable to check the values of the operands and, if both are
// positive and less than 256, to use an unsigned 8-bit divide instead.
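//
// As an illustrative sketch (the IR value names below are invented, and the
// bypass width depends on the target's configuration), a 32-bit
//
//   %res = udiv i32 %a, %b
//
// is guarded by a runtime check on the high bits of %a and %b and either
// falls through to the original 32-bit division or executes a narrowed
// sequence roughly like
//
//   %a8 = trunc i32 %a to i8
//   %b8 = trunc i32 %b to i8
//   %q8 = udiv i8 %a8, %b8
//   %res.fast = zext i8 %q8 to i32
//
// with PHI nodes merging the two results.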
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>

using namespace llvm;

#define DEBUG_TYPE "bypass-slow-division"

namespace {

  struct QuotRemPair {
    Value *Quotient;
    Value *Remainder;

    QuotRemPair(Value *InQuotient, Value *InRemainder)
        : Quotient(InQuotient), Remainder(InRemainder) {}
  };

  /// A quotient and remainder, plus a BB from which they logically "originate".
  /// If you use Quotient or Remainder in a Phi node, you should use BB as its
  /// corresponding predecessor.
  struct QuotRemWithBB {
    BasicBlock *BB = nullptr;
    Value *Quotient = nullptr;
    Value *Remainder = nullptr;
  };

using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
using BypassWidthsTy = DenseMap<unsigned, unsigned>;
using VisitedSetTy = SmallPtrSet<Instruction *, 4>;

enum ValueRange {
  /// Operand definitely fits into BypassType. No runtime checks are needed.
  VALRNG_KNOWN_SHORT,
  /// A runtime check is required, as the value range is unknown.
  VALRNG_UNKNOWN,
  /// Operand is unlikely to fit into BypassType. Bypassing should be
  /// disabled.
  VALRNG_LIKELY_LONG
};

class FastDivInsertionTask {
  bool IsValidTask = false;
  Instruction *SlowDivOrRem = nullptr;
  IntegerType *BypassType = nullptr;
  BasicBlock *MainBB = nullptr;

  bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
  ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
  QuotRemWithBB createSlowBB(BasicBlock *Successor);
  QuotRemWithBB createFastBB(BasicBlock *Successor);
  QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
                                   BasicBlock *PhiBB);
  Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
  std::optional<QuotRemPair> insertFastDivAndRem();

  bool isSignedOp() {
    return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
           SlowDivOrRem->getOpcode() == Instruction::SRem;
  }

  bool isDivisionOp() {
    return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
           SlowDivOrRem->getOpcode() == Instruction::UDiv;
  }

  Type *getSlowType() { return SlowDivOrRem->getType(); }

public:
  FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);

  Value *getReplacement(DivCacheTy &Cache);
};

} // end anonymous namespace

FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
                                           const BypassWidthsTy &BypassWidths) {
  switch (I->getOpcode()) {
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    SlowDivOrRem = I;
    break;
  default:
    // I is not a div/rem operation.
    return;
  }

  // Skip division on vector types. Only optimize integer instructions.
  IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
  if (!SlowType)
    return;

  // Skip if this bitwidth is not bypassed.
  auto BI = BypassWidths.find(SlowType->getBitWidth());
  if (BI == BypassWidths.end())
    return;

  // Get the type for a div/rem instruction with the bypass bitwidth.
  IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
  BypassType = BT;

  // The original basic block.
  MainBB = I->getParent();

  // The instruction is indeed a slow div or rem operation.
  IsValidTask = true;
}

/// Reuses a previously-computed quotient or remainder from the current BB if
/// the operands and operation are identical. Otherwise calls
/// insertFastDivAndRem to perform the optimization and caches the resulting
/// quotient and remainder. If no replacement can be generated, nullptr is
/// returned.
Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
  // First, make sure that the task is valid.
  if (!IsValidTask)
    return nullptr;

  // Then, look for a value in Cache.
  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);
  DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
  auto CacheI = Cache.find(Key);

  if (CacheI == Cache.end()) {
    // If a previous instance does not exist, try to insert a fast div.
    std::optional<QuotRemPair> OptResult = insertFastDivAndRem();
    // Bail out if insertFastDivAndRem has failed.
    if (!OptResult)
      return nullptr;
    CacheI = Cache.insert({Key, *OptResult}).first;
  }

  QuotRemPair &Value = CacheI->second;
  return isDivisionOp() ? Value.Quotient : Value.Remainder;
}

/// Check if a value looks like a hash.
///
/// The routine is expected to detect values computed using the most common
/// hash algorithms. Typically, hash computations end with one of the following
/// instructions:
///
/// 1) MUL with a constant wider than BypassType
/// 2) XOR instruction
///
/// And even if we are wrong and the value is not a hash, it is still quite
/// unlikely that such values will fit into BypassType.
///
/// To detect string hash algorithms like FNV, we have to look through PHI
/// nodes. It is implemented as a depth-first search for values that look
/// neither long nor hash-like.
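///
/// As a rough, hypothetical illustration of the FNV-style pattern this aims
/// to catch (value names invented for the example), a division operand such
/// as the dividend of a `hash % size` might be defined by a loop like:
///
///   loop:
///     %hash = phi i64 [ 14695981039346656037, %entry ], [ %hash.next, %loop ]
///     %mixed = xor i64 %hash, %byte
///     %hash.next = mul i64 %mixed, 1099511628211
///     ...
///
/// Walking the PHI's incoming values reaches the MUL by a constant wider than
/// BypassType, so the value is classified as hash-like.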
bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
  Instruction *I = dyn_cast<Instruction>(V);
  if (!I)
    return false;

  switch (I->getOpcode()) {
  case Instruction::Xor:
    return true;
  case Instruction::Mul: {
    // After the Constant Hoisting pass, long constants may be represented as
    // bitcast instructions. As a result, some constants may look like an
    // instruction at first, and an additional check is necessary to find out
    // whether an operand is actually a constant.
    Value *Op1 = I->getOperand(1);
    ConstantInt *C = dyn_cast<ConstantInt>(Op1);
    if (!C && isa<BitCastInst>(Op1))
      C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
    return C && C->getValue().getSignificantBits() > BypassType->getBitWidth();
  }
  case Instruction::PHI:
    // Stop IR traversal in case of pathological input code. This limits the
    // recursion depth.
    if (Visited.size() >= 16)
      return false;
    // Do not visit nodes that have been visited already. We return true
    // because it means that we couldn't find any value that doesn't look
    // hash-like.
    if (!Visited.insert(I).second)
      return true;
    return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
      // Ignore undef values as they probably don't affect the division
      // operands.
      return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
             isa<UndefValue>(V);
    });
  default:
    return false;
  }
}

/// Check if an integer value fits into our bypass type.
ValueRange FastDivInsertionTask::getValueRange(Value *V,
                                               VisitedSetTy &Visited) {
  unsigned ShortLen = BypassType->getBitWidth();
  unsigned LongLen = V->getType()->getIntegerBitWidth();

  assert(LongLen > ShortLen && "Value type must be wider than BypassType");
  unsigned HiBits = LongLen - ShortLen;

  const DataLayout &DL = SlowDivOrRem->getDataLayout();
  KnownBits Known(LongLen);

  computeKnownBits(V, Known, DL);

  if (Known.countMinLeadingZeros() >= HiBits)
    return VALRNG_KNOWN_SHORT;

  if (Known.countMaxLeadingZeros() < HiBits)
    return VALRNG_LIKELY_LONG;

  // Long integer divisions are often used in hashtable implementations. It's
  // not worth bypassing such divisions because hash values are extremely
  // unlikely to have enough leading zeros. The call below tries to detect
  // values that are unlikely to fit BypassType (including hashes).
  if (isHashLikeValue(V, Visited))
    return VALRNG_LIKELY_LONG;

  return VALRNG_UNKNOWN;
}

/// Add a new basic block for slow div and rem operations and put it before
/// SuccessorBB.
QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
  QuotRemWithBB DivRemPair;
  DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
                                     MainBB->getParent(), SuccessorBB);
  IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);

  if (isSignedOp()) {
    DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
    DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
  } else {
    DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
    DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
  }

  Builder.CreateBr(SuccessorBB);
  return DivRemPair;
}

/// Add a new basic block for fast div and rem operations and put it before
/// SuccessorBB.
QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
  QuotRemWithBB DivRemPair;
  DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
                                     MainBB->getParent(), SuccessorBB);
  IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);
  Value *ShortDivisorV =
      Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
  Value *ShortDividendV =
      Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);

  // Use udiv/urem because this optimization only handles positive numbers.
  Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
  Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
  DivRemPair.Quotient =
      Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
  DivRemPair.Remainder =
      Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
  Builder.CreateBr(SuccessorBB);

  return DivRemPair;
}

/// Creates PHI nodes for the div and rem results.
QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
                                                        QuotRemWithBB &RHS,
                                                        BasicBlock *PhiBB) {
  IRBuilder<> Builder(PhiBB, PhiBB->begin());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
  PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
  QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
  QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
  PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
  RemPhi->addIncoming(LHS.Remainder, LHS.BB);
  RemPhi->addIncoming(RHS.Remainder, RHS.BB);
  return QuotRemPair(QuoPhi, RemPhi);
}

/// Creates a runtime check to test whether both the divisor and dividend fit
/// into BypassType. The check is inserted at the end of MainBB. A true return
/// value means that the operands fit. Either of the operands may be null if it
/// doesn't need a runtime check.
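///
/// As a rough sketch of the emitted check, assuming a 32-bit division bypassed
/// to 8 bits with both operands needing a check (value names are invented):
///
///   %or   = or i32 %dividend, %divisor
///   %hi   = and i32 %or, -256   ; the inverted BypassType bit mask, as i32
///   %fits = icmp eq i32 %hi, 0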
Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
  assert((Op1 || Op2) && "Nothing to check");
  IRBuilder<> Builder(MainBB, MainBB->end());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  Value *OrV;
  if (Op1 && Op2)
    OrV = Builder.CreateOr(Op1, Op2);
  else
    OrV = Op1 ? Op1 : Op2;

  // BitMask is inverted to check whether the operands are
  // larger than the bypass type.
  uint64_t BitMask = ~BypassType->getBitMask();
  Value *AndV = Builder.CreateAnd(OrV, BitMask);

  // Compare operand values.
  Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
  return Builder.CreateICmpEQ(AndV, ZeroV);
}

/// Substitutes the div/rem instruction with code that checks the value of the
/// operands and uses a shorter, faster div/rem instruction when possible.
std::optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);

  VisitedSetTy SetL;
  ValueRange DividendRange = getValueRange(Dividend, SetL);
  if (DividendRange == VALRNG_LIKELY_LONG)
    return std::nullopt;

  VisitedSetTy SetR;
  ValueRange DivisorRange = getValueRange(Divisor, SetR);
  if (DivisorRange == VALRNG_LIKELY_LONG)
    return std::nullopt;

  bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
  bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);

  if (DividendShort && DivisorShort) {
    // If both operands are known to be short, then just replace the long
    // division with a short one in-place. Since we're not introducing control
    // flow in this case, narrowing the division is always a win, even if the
    // divisor is a constant (and will later get replaced by a multiplication).
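    //
    // For illustration (value names invented), a 32-bit udiv whose operands
    // are both known to fit into 8 bits becomes, in place:
    //
    //   %d8 = trunc i32 %dividend to i8
    //   %s8 = trunc i32 %divisor to i8
    //   %q8 = udiv i8 %d8, %s8
    //   %q  = zext i8 %q8 to i32
    //
    // with an analogous urem/zext pair created for the remainder.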

    IRBuilder<> Builder(SlowDivOrRem);
    Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
    Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
    Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
    Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
    Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
    Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
    return QuotRemPair(ExtDiv, ExtRem);
  }

  if (isa<ConstantInt>(Divisor)) {
    // If the divisor is a constant, DAGCombiner will convert the division into
    // a multiplication by a magic constant. It isn't clear if it is worth
    // introducing control flow to get a narrower multiply.
    return std::nullopt;
  }

  // After the Constant Hoisting pass, long constants may be represented as
  // bitcast instructions. As a result, some constants may look like an
  // instruction at first, and an additional check is necessary to find out
  // whether an operand is actually a constant.
  if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
    if (BCI->getParent() == SlowDivOrRem->getParent() &&
        isa<ConstantInt>(BCI->getOperand(0)))
      return std::nullopt;

  IRBuilder<> Builder(MainBB, MainBB->end());
  Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());

  if (DividendShort && !isSignedOp()) {
    // If the division is unsigned and Dividend is known to be short, then
    // either
    // 1) Divisor is less than or equal to Dividend, and the result can be
    //    computed with a short division.
    // 2) Divisor is greater than Dividend. In this case, no division is needed
    //    at all: the quotient is 0 and the remainder is equal to Dividend.
    //
    // So instead of checking at runtime whether Divisor fits into BypassType,
    // we emit a runtime check to differentiate between these two cases. This
    // lets us entirely avoid a long div.
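    //
    // For illustration (block and value names invented), the resulting
    // control flow is roughly:
    //
    //   mainBB:
    //     %use.fast = icmp uge i32 %dividend, %divisor
    //     br i1 %use.fast, label %fastBB, label %succBB
    //   fastBB:                       ; short udiv/urem on truncated operands
    //     ...
    //     br label %succBB
    //   succBB:
    //     %q = phi i32 [ %fast.q, %fastBB ], [ 0, %mainBB ]
    //     %r = phi i32 [ %fast.r, %fastBB ], [ %dividend, %mainBB ]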

    // Split the basic block before the div/rem.
    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    // Remove the unconditional branch from MainBB to SuccessorBB.
    MainBB->back().eraseFromParent();
    QuotRemWithBB Long;
    Long.BB = MainBB;
    Long.Quotient = ConstantInt::get(getSlowType(), 0);
    Long.Remainder = Dividend;
    QuotRemWithBB Fast = createFastBB(SuccessorBB);
    QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
    Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
    Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
    return Result;
  } else {
    // General case. Create both slow and fast div/rem pairs and choose one of
    // them at runtime.

    // Split the basic block before the div/rem.
    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    // Remove the unconditional branch from MainBB to SuccessorBB.
    MainBB->back().eraseFromParent();
    QuotRemWithBB Fast = createFastBB(SuccessorBB);
    QuotRemWithBB Slow = createSlowBB(SuccessorBB);
    QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
    Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
                                            DivisorShort ? nullptr : Divisor);
    Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
    return Result;
  }
}

/// This optimization identifies DIV/REM instructions in a BB that can be
/// profitably bypassed and carried out with a shorter, faster divide.
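///
/// A minimal, hypothetical caller sketch; in-tree this is driven by
/// CodeGenPrepare using the div/rem bypass widths the target reports:
///
///   DenseMap<unsigned, unsigned> BypassWidths;
///   BypassWidths[32] = 8; // e.g. bypass 32-bit div/rem with 8-bit udiv/urem
///   bool Changed = false;
///   for (BasicBlock &BB : F)
///     Changed |= bypassSlowDivision(&BB, BypassWidths);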
bool llvm::bypassSlowDivision(BasicBlock *BB,
                              const BypassWidthsTy &BypassWidths) {
  DivCacheTy PerBBDivCache;

  bool MadeChange = false;
  Instruction *Next = &*BB->begin();
  while (Next != nullptr) {
    // We may add instructions immediately after I, but we want to skip over
    // them.
    Instruction *I = Next;
    Next = Next->getNextNode();

    // Ignore dead code to save time and avoid bugs.
    if (I->use_empty())
      continue;

    FastDivInsertionTask Task(I, BypassWidths);
    if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
      I->replaceAllUsesWith(Replacement);
      I->eraseFromParent();
      MadeChange = true;
    }
  }

  // Above we eagerly create divs and rems, as pairs, so that we can efficiently
  // create divrem machine instructions. Now erase any unused divs / rems so we
  // don't leave extra instructions sitting around.
  for (auto &KV : PerBBDivCache)
    for (Value *V : {KV.second.Quotient, KV.second.Remainder})
      RecursivelyDeleteTriviallyDeadInstructions(V);

  return MadeChange;
}