1//===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains an optimization for div and rem on architectures that
10// execute short instructions significantly faster than longer instructions.
11// For example, on Intel Atom 32-bit divides are slow enough that during
12// runtime it is profitable to check the value of the operands, and if they are
13// positive and less than 256 use an unsigned 8-bit divide.
14//
15//===----------------------------------------------------------------------===//
16
17#include "llvm/Transforms/Utils/BypassSlowDivision.h"
18#include "llvm/ADT/DenseMap.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SmallPtrSet.h"
21#include "llvm/Analysis/DomTreeUpdater.h"
22#include "llvm/Analysis/LoopInfo.h"
23#include "llvm/Analysis/ValueTracking.h"
24#include "llvm/IR/BasicBlock.h"
25#include "llvm/IR/Constants.h"
26#include "llvm/IR/DerivedTypes.h"
27#include "llvm/IR/Function.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
30#include "llvm/IR/Instructions.h"
31#include "llvm/IR/Type.h"
32#include "llvm/IR/Value.h"
33#include "llvm/Support/Casting.h"
34#include "llvm/Support/KnownBits.h"
35#include "llvm/Transforms/Utils/BasicBlockUtils.h"
36#include "llvm/Transforms/Utils/Local.h"
37#include <cassert>
38#include <cstdint>
39
40using namespace llvm;
41
42#define DEBUG_TYPE "bypass-slow-division"
43
44namespace {
45
46struct QuotRemPair {
47 Value *Quotient;
48 Value *Remainder;
49
50 QuotRemPair(Value *InQuotient, Value *InRemainder)
51 : Quotient(InQuotient), Remainder(InRemainder) {}
52};
53
54/// A quotient and remainder, plus a BB from which they logically "originate".
55/// If you use Quotient or Remainder in a Phi node, you should use BB as its
56/// corresponding predecessor.
57struct QuotRemWithBB {
58 BasicBlock *BB = nullptr;
59 Value *Quotient = nullptr;
60 Value *Remainder = nullptr;
61};
62
63using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
64using BypassWidthsTy = DenseMap<unsigned, unsigned>;
65using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
66
67enum ValueRange {
68 /// Operand definitely fits into BypassType. No runtime checks are needed.
69 VALRNG_KNOWN_SHORT,
70 /// A runtime check is required, as value range is unknown.
71 VALRNG_UNKNOWN,
72 /// Operand is unlikely to fit into BypassType. The bypassing should be
73 /// disabled.
74 VALRNG_LIKELY_LONG
75};
76
77class FastDivInsertionTask {
78 bool IsValidTask = false;
79 Instruction *SlowDivOrRem = nullptr;
80 IntegerType *BypassType = nullptr;
81 BasicBlock *MainBB = nullptr;
82 DomTreeUpdater *DTU = nullptr;
83 LoopInfo *LI = nullptr;
84
85 bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
86 ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
87 QuotRemWithBB createSlowBB(BasicBlock *Successor);
88 QuotRemWithBB createFastBB(BasicBlock *Successor);
89 QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
90 BasicBlock *PhiBB);
91 Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
92 std::optional<QuotRemPair> insertFastDivAndRem();
93
94 bool isSignedOp() {
95 return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
96 SlowDivOrRem->getOpcode() == Instruction::SRem;
97 }
98
99 bool isDivisionOp() {
100 return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
101 SlowDivOrRem->getOpcode() == Instruction::UDiv;
102 }
103
104 Type *getSlowType() { return SlowDivOrRem->getType(); }
105
106public:
107 FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths,
108 DomTreeUpdater *DTU, LoopInfo *LI);
109
110 Value *getReplacement(DivCacheTy &Cache);
111};
112
113} // end anonymous namespace
114
115FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
116 const BypassWidthsTy &BypassWidths,
117 DomTreeUpdater *DTU, LoopInfo *LI)
118 : DTU(DTU), LI(LI) {
119 switch (I->getOpcode()) {
120 case Instruction::UDiv:
121 case Instruction::SDiv:
122 case Instruction::URem:
123 case Instruction::SRem:
124 SlowDivOrRem = I;
125 break;
126 default:
127 // I is not a div/rem operation.
128 return;
129 }
130
131 // Skip division on vector types. Only optimize integer instructions.
132 IntegerType *SlowType = dyn_cast<IntegerType>(Val: SlowDivOrRem->getType());
133 if (!SlowType)
134 return;
135
136 // Skip if this bitwidth is not bypassed.
137 auto BI = BypassWidths.find(Val: SlowType->getBitWidth());
138 if (BI == BypassWidths.end())
139 return;
140
141 // Get type for div/rem instruction with bypass bitwidth.
142 IntegerType *BT = IntegerType::get(C&: I->getContext(), NumBits: BI->second);
143 BypassType = BT;
144
145 // The original basic block.
146 MainBB = I->getParent();
147
148 // The instruction is indeed a slow div or rem operation.
149 IsValidTask = true;
150}
151
152/// Reuses previously-computed dividend or remainder from the current BB if
153/// operands and operation are identical. Otherwise calls insertFastDivAndRem to
154/// perform the optimization and caches the resulting dividend and remainder.
155/// If no replacement can be generated, nullptr is returned.
156Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
157 // First, make sure that the task is valid.
158 if (!IsValidTask)
159 return nullptr;
160
161 // Then, look for a value in Cache.
162 Value *Dividend = SlowDivOrRem->getOperand(i: 0);
163 Value *Divisor = SlowDivOrRem->getOperand(i: 1);
164 DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
165 auto CacheI = Cache.find(Val: Key);
166
167 if (CacheI == Cache.end()) {
168 // If previous instance does not exist, try to insert fast div.
169 std::optional<QuotRemPair> OptResult = insertFastDivAndRem();
170 // Bail out if insertFastDivAndRem has failed.
171 if (!OptResult)
172 return nullptr;
173 CacheI = Cache.insert(KV: {Key, *OptResult}).first;
174 }
175
176 QuotRemPair &Value = CacheI->second;
177 return isDivisionOp() ? Value.Quotient : Value.Remainder;
178}
179
180/// Check if a value looks like a hash.
181///
182/// The routine is expected to detect values computed using the most common hash
183/// algorithms. Typically, hash computations end with one of the following
184/// instructions:
185///
186/// 1) MUL with a constant wider than BypassType
187/// 2) XOR instruction
188///
189/// And even if we are wrong and the value is not a hash, it is still quite
190/// unlikely that such values will fit into BypassType.
191///
192/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
193/// It is implemented as a depth-first search for values that look neither long
194/// nor hash-like.
195bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
196 Instruction *I = dyn_cast<Instruction>(Val: V);
197 if (!I)
198 return false;
199
200 switch (I->getOpcode()) {
201 case Instruction::Xor:
202 return true;
203 case Instruction::Mul: {
204 // After Constant Hoisting pass, long constants may be represented as
205 // bitcast instructions. As a result, some constants may look like an
206 // instruction at first, and an additional check is necessary to find out if
207 // an operand is actually a constant.
208 Value *Op1 = I->getOperand(i: 1);
209 ConstantInt *C = dyn_cast<ConstantInt>(Val: Op1);
210 if (!C && isa<BitCastInst>(Val: Op1))
211 C = dyn_cast<ConstantInt>(Val: cast<BitCastInst>(Val: Op1)->getOperand(i_nocapture: 0));
212 return C && C->getValue().getSignificantBits() > BypassType->getBitWidth();
213 }
214 case Instruction::PHI:
215 // Stop IR traversal in case of a crazy input code. This limits recursion
216 // depth.
217 if (Visited.size() >= 16)
218 return false;
219 // Do not visit nodes that have been visited already. We return true because
220 // it means that we couldn't find any value that doesn't look hash-like.
221 if (!Visited.insert(Ptr: I).second)
222 return true;
223 return llvm::all_of(Range: cast<PHINode>(Val: I)->incoming_values(), P: [&](Value *V) {
224 // Ignore undef values as they probably don't affect the division
225 // operands.
226 return getValueRange(Op: V, Visited) == VALRNG_LIKELY_LONG ||
227 isa<UndefValue>(Val: V);
228 });
229 default:
230 return false;
231 }
232}
233
234/// Check if an integer value fits into our bypass type.
235ValueRange FastDivInsertionTask::getValueRange(Value *V,
236 VisitedSetTy &Visited) {
237 unsigned ShortLen = BypassType->getBitWidth();
238 unsigned LongLen = V->getType()->getIntegerBitWidth();
239
240 assert(LongLen > ShortLen && "Value type must be wider than BypassType");
241 unsigned HiBits = LongLen - ShortLen;
242
243 const DataLayout &DL = SlowDivOrRem->getDataLayout();
244 KnownBits Known(LongLen);
245
246 computeKnownBits(V, Known, DL);
247
248 if (Known.countMinLeadingZeros() >= HiBits)
249 return VALRNG_KNOWN_SHORT;
250
251 if (Known.countMaxLeadingZeros() < HiBits)
252 return VALRNG_LIKELY_LONG;
253
254 // Long integer divisions are often used in hashtable implementations. It's
255 // not worth bypassing such divisions because hash values are extremely
256 // unlikely to have enough leading zeros. The call below tries to detect
257 // values that are unlikely to fit BypassType (including hashes).
258 if (isHashLikeValue(V, Visited))
259 return VALRNG_LIKELY_LONG;
260
261 return VALRNG_UNKNOWN;
262}
263
264/// Add new basic block for slow div and rem operations and put it before
265/// SuccessorBB.
266QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
267 QuotRemWithBB DivRemPair;
268 DivRemPair.BB = BasicBlock::Create(Context&: MainBB->getParent()->getContext(), Name: "",
269 Parent: MainBB->getParent(), InsertBefore: SuccessorBB);
270 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
271 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
272
273 Value *Dividend = SlowDivOrRem->getOperand(i: 0);
274 Value *Divisor = SlowDivOrRem->getOperand(i: 1);
275
276 if (isSignedOp()) {
277 DivRemPair.Quotient = Builder.CreateSDiv(LHS: Dividend, RHS: Divisor);
278 DivRemPair.Remainder = Builder.CreateSRem(LHS: Dividend, RHS: Divisor);
279 } else {
280 DivRemPair.Quotient = Builder.CreateUDiv(LHS: Dividend, RHS: Divisor);
281 DivRemPair.Remainder = Builder.CreateURem(LHS: Dividend, RHS: Divisor);
282 }
283
284 Builder.CreateBr(Dest: SuccessorBB);
285 return DivRemPair;
286}
287
288/// Add new basic block for fast div and rem operations and put it before
289/// SuccessorBB.
290QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
291 QuotRemWithBB DivRemPair;
292 DivRemPair.BB = BasicBlock::Create(Context&: MainBB->getParent()->getContext(), Name: "",
293 Parent: MainBB->getParent(), InsertBefore: SuccessorBB);
294 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
295 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
296
297 Value *Dividend = SlowDivOrRem->getOperand(i: 0);
298 Value *Divisor = SlowDivOrRem->getOperand(i: 1);
299 Value *ShortDivisorV =
300 Builder.CreateCast(Op: Instruction::Trunc, V: Divisor, DestTy: BypassType);
301 Value *ShortDividendV =
302 Builder.CreateCast(Op: Instruction::Trunc, V: Dividend, DestTy: BypassType);
303
304 // udiv/urem because this optimization only handles positive numbers.
305 Value *ShortQV = Builder.CreateUDiv(LHS: ShortDividendV, RHS: ShortDivisorV);
306 Value *ShortRV = Builder.CreateURem(LHS: ShortDividendV, RHS: ShortDivisorV);
307 DivRemPair.Quotient =
308 Builder.CreateCast(Op: Instruction::ZExt, V: ShortQV, DestTy: getSlowType());
309 DivRemPair.Remainder =
310 Builder.CreateCast(Op: Instruction::ZExt, V: ShortRV, DestTy: getSlowType());
311 Builder.CreateBr(Dest: SuccessorBB);
312
313 return DivRemPair;
314}
315
316/// Creates Phi nodes for result of Div and Rem.
317QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
318 QuotRemWithBB &RHS,
319 BasicBlock *PhiBB) {
320 IRBuilder<> Builder(PhiBB, PhiBB->begin());
321 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
322 PHINode *QuoPhi = Builder.CreatePHI(Ty: getSlowType(), NumReservedValues: 2);
323 QuoPhi->addIncoming(V: LHS.Quotient, BB: LHS.BB);
324 QuoPhi->addIncoming(V: RHS.Quotient, BB: RHS.BB);
325 PHINode *RemPhi = Builder.CreatePHI(Ty: getSlowType(), NumReservedValues: 2);
326 RemPhi->addIncoming(V: LHS.Remainder, BB: LHS.BB);
327 RemPhi->addIncoming(V: RHS.Remainder, BB: RHS.BB);
328 return QuotRemPair(QuoPhi, RemPhi);
329}
330
331/// Creates a runtime check to test whether both the divisor and dividend fit
332/// into BypassType. The check is inserted at the end of MainBB. True return
333/// value means that the operands fit. Either of the operands may be NULL if it
334/// doesn't need a runtime check.
335Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
336 assert((Op1 || Op2) && "Nothing to check");
337 IRBuilder<> Builder(MainBB, MainBB->end());
338 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
339
340 Value *OrV;
341 if (Op1 && Op2)
342 OrV = Builder.CreateOr(LHS: Op1, RHS: Op2);
343 else
344 OrV = Op1 ? Op1 : Op2;
345
346 // Check whether the operands are larger than the bypass type.
347 Value *AndV = Builder.CreateAnd(
348 LHS: OrV, RHS: APInt::getBitsSetFrom(numBits: OrV->getType()->getIntegerBitWidth(),
349 loBit: BypassType->getBitWidth()));
350
351 // Compare operand values
352 Value *ZeroV = ConstantInt::getSigned(Ty: getSlowType(), V: 0);
353 return Builder.CreateICmpEQ(LHS: AndV, RHS: ZeroV);
354}
355
356/// Substitutes the div/rem instruction with code that checks the value of the
357/// operands and uses a shorter-faster div/rem instruction when possible.
358std::optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
359 Value *Dividend = SlowDivOrRem->getOperand(i: 0);
360 Value *Divisor = SlowDivOrRem->getOperand(i: 1);
361
362 VisitedSetTy SetL;
363 ValueRange DividendRange = getValueRange(V: Dividend, Visited&: SetL);
364 if (DividendRange == VALRNG_LIKELY_LONG)
365 return std::nullopt;
366
367 VisitedSetTy SetR;
368 ValueRange DivisorRange = getValueRange(V: Divisor, Visited&: SetR);
369 if (DivisorRange == VALRNG_LIKELY_LONG)
370 return std::nullopt;
371
372 bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
373 bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
374
375 if (DividendShort && DivisorShort) {
376 // If both operands are known to be short then just replace the long
377 // division with a short one in-place. Since we're not introducing control
378 // flow in this case, narrowing the division is always a win, even if the
379 // divisor is a constant (and will later get replaced by a multiplication).
380
381 IRBuilder<> Builder(SlowDivOrRem);
382 Value *TruncDividend = Builder.CreateTrunc(V: Dividend, DestTy: BypassType);
383 Value *TruncDivisor = Builder.CreateTrunc(V: Divisor, DestTy: BypassType);
384 Value *TruncDiv = Builder.CreateUDiv(LHS: TruncDividend, RHS: TruncDivisor);
385 Value *TruncRem = Builder.CreateURem(LHS: TruncDividend, RHS: TruncDivisor);
386 Value *ExtDiv = Builder.CreateZExt(V: TruncDiv, DestTy: getSlowType());
387 Value *ExtRem = Builder.CreateZExt(V: TruncRem, DestTy: getSlowType());
388 return QuotRemPair(ExtDiv, ExtRem);
389 }
390
391 if (isa<ConstantInt>(Val: Divisor)) {
392 // If the divisor is not a constant, DAGCombiner will convert it to a
393 // multiplication by a magic constant. It isn't clear if it is worth
394 // introducing control flow to get a narrower multiply.
395 return std::nullopt;
396 }
397
398 // After Constant Hoisting pass, long constants may be represented as
399 // bitcast instructions. As a result, some constants may look like an
400 // instruction at first, and an additional check is necessary to find out if
401 // an operand is actually a constant.
402 if (auto *BCI = dyn_cast<BitCastInst>(Val: Divisor))
403 if (BCI->getParent() == SlowDivOrRem->getParent() &&
404 isa<ConstantInt>(Val: BCI->getOperand(i_nocapture: 0)))
405 return std::nullopt;
406
407 IRBuilder<> Builder(MainBB, MainBB->end());
408 Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
409
410 if (DividendShort && !isSignedOp()) {
411 // If the division is unsigned and Dividend is known to be short, then
412 // either
413 // 1) Divisor is less or equal to Dividend, and the result can be computed
414 // with a short division.
415 // 2) Divisor is greater than Dividend. In this case, no division is needed
416 // at all: The quotient is 0 and the remainder is equal to Dividend.
417 //
418 // So instead of checking at runtime whether Divisor fits into BypassType,
419 // we emit a runtime check to differentiate between these two cases. This
420 // lets us entirely avoid a long div.
421
422 // Split the basic block before the div/rem.
423 BasicBlock *SuccessorBB = SplitBlock(Old: MainBB, SplitPt: SlowDivOrRem, DTU, LI);
424 // Remove the unconditional branch from MainBB to SuccessorBB.
425 MainBB->back().eraseFromParent();
426 QuotRemWithBB Long;
427 Long.BB = MainBB;
428 Long.Quotient = ConstantInt::get(Ty: getSlowType(), V: 0);
429 Long.Remainder = Dividend;
430 QuotRemWithBB Fast = createFastBB(SuccessorBB);
431 QuotRemPair Result = createDivRemPhiNodes(LHS&: Fast, RHS&: Long, PhiBB: SuccessorBB);
432 Value *CmpV = Builder.CreateICmpUGE(LHS: Dividend, RHS: Divisor);
433 Builder.CreateCondBr(Cond: CmpV, True: Fast.BB, False: SuccessorBB);
434
435 if (DTU)
436 DTU->applyUpdates(Updates: {{DominatorTree::Insert, MainBB, Fast.BB},
437 {DominatorTree::Insert, Fast.BB, SuccessorBB}});
438 if (LI) {
439 if (Loop *L = LI->getLoopFor(BB: MainBB))
440 L->addBasicBlockToLoop(NewBB: Fast.BB, LI&: *LI);
441 }
442
443 return Result;
444 }
445
446 // General case. Create both slow and fast div/rem pairs and choose one of
447 // them at runtime.
448
449 // Split the basic block before the div/rem.
450 BasicBlock *SuccessorBB = SplitBlock(Old: MainBB, SplitPt: SlowDivOrRem, DTU, LI);
451 // Remove the unconditional branch from MainBB to SuccessorBB.
452 MainBB->back().eraseFromParent();
453 QuotRemWithBB Fast = createFastBB(SuccessorBB);
454 QuotRemWithBB Slow = createSlowBB(SuccessorBB);
455 QuotRemPair Result = createDivRemPhiNodes(LHS&: Fast, RHS&: Slow, PhiBB: SuccessorBB);
456 Value *CmpV = insertOperandRuntimeCheck(Op1: DividendShort ? nullptr : Dividend,
457 Op2: DivisorShort ? nullptr : Divisor);
458 Builder.CreateCondBr(Cond: CmpV, True: Fast.BB, False: Slow.BB);
459 if (DTU)
460 DTU->applyUpdates(Updates: {{DominatorTree::Insert, MainBB, Fast.BB},
461 {DominatorTree::Insert, MainBB, Slow.BB},
462 {DominatorTree::Insert, Fast.BB, SuccessorBB},
463 {DominatorTree::Insert, Slow.BB, SuccessorBB},
464 {DominatorTree::Delete, MainBB, SuccessorBB}});
465 if (LI) {
466 if (Loop *L = LI->getLoopFor(BB: MainBB)) {
467 L->addBasicBlockToLoop(NewBB: Fast.BB, LI&: *LI);
468 L->addBasicBlockToLoop(NewBB: Slow.BB, LI&: *LI);
469 }
470 }
471 return Result;
472}
473
474/// This optimization identifies DIV/REM instructions in a BB that can be
475/// profitably bypassed and carried out with a shorter, faster divide.
476bool llvm::bypassSlowDivision(BasicBlock *BB,
477 const BypassWidthsTy &BypassWidths,
478 DomTreeUpdater *DTU, LoopInfo *LI) {
479 DivCacheTy PerBBDivCache;
480
481 bool MadeChange = false;
482 Instruction *Next = &*BB->begin();
483 while (Next != nullptr) {
484 // We may add instructions immediately after I, but we want to skip over
485 // them.
486 Instruction *I = Next;
487 Next = Next->getNextNode();
488
489 // Ignore dead code to save time and avoid bugs.
490 if (I->use_empty())
491 continue;
492
493 FastDivInsertionTask Task(I, BypassWidths, DTU, LI);
494 if (Value *Replacement = Task.getReplacement(Cache&: PerBBDivCache)) {
495 I->replaceAllUsesWith(V: Replacement);
496 I->eraseFromParent();
497 MadeChange = true;
498 }
499 }
500
501 // Above we eagerly create divs and rems, as pairs, so that we can efficiently
502 // create divrem machine instructions. Now erase any unused divs / rems so we
503 // don't leave extra instructions sitting around.
504 for (auto &KV : PerBBDivCache)
505 for (Value *V : {KV.second.Quotient, KV.second.Remainder})
506 RecursivelyDeleteTriviallyDeadInstructions(V);
507
508 return MadeChange;
509}
510