1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
17#include "SIModeRegisterDefaults.h"
18#include "llvm/Analysis/AssumptionCache.h"
19#include "llvm/Analysis/ConstantFolding.h"
20#include "llvm/Analysis/TargetLibraryInfo.h"
21#include "llvm/Analysis/UniformityAnalysis.h"
22#include "llvm/Analysis/ValueTracking.h"
23#include "llvm/CodeGen/TargetPassConfig.h"
24#include "llvm/IR/Dominators.h"
25#include "llvm/IR/IRBuilder.h"
26#include "llvm/IR/InstVisitor.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
28#include "llvm/IR/PatternMatch.h"
29#include "llvm/Pass.h"
30#include "llvm/Support/KnownBits.h"
31#include "llvm/Support/KnownFPClass.h"
32#include "llvm/Transforms/Utils/IntegerDivision.h"
33#include "llvm/Transforms/Utils/Local.h"
34
35#define DEBUG_TYPE "amdgpu-codegenprepare"
36
37using namespace llvm;
38using namespace llvm::PatternMatch;
39
40namespace {
41
42static cl::opt<bool> WidenLoads(
43 "amdgpu-codegenprepare-widen-constant-loads",
44 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
45 cl::ReallyHidden,
46 cl::init(Val: false));
47
48static cl::opt<bool> Widen16BitOps(
49 "amdgpu-codegenprepare-widen-16-bit-ops",
50 cl::desc(
51 "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
52 cl::ReallyHidden, cl::init(Val: false));
53
54static cl::opt<bool>
55 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
56 cl::desc("Break large PHI nodes for DAGISel"),
57 cl::ReallyHidden, cl::init(Val: true));
58
59static cl::opt<bool>
60 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc("For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
63 cl::ReallyHidden, cl::init(Val: false));
64
65static cl::opt<unsigned> BreakLargePHIsThreshold(
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
68 cl::ReallyHidden, cl::init(Val: 32));
69
70static cl::opt<bool> UseMul24Intrin(
71 "amdgpu-codegenprepare-mul24",
72 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
73 cl::ReallyHidden,
74 cl::init(Val: true));
75
76// Legalize 64-bit division by using the generic IR expansion.
77static cl::opt<bool> ExpandDiv64InIR(
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80 cl::ReallyHidden,
81 cl::init(Val: false));
82
83// Leave all division operations as they are. This supersedes ExpandDiv64InIR
84// and is used for testing the legalizer.
85static cl::opt<bool> DisableIDivExpand(
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88 cl::ReallyHidden,
89 cl::init(Val: false));
90
91// Disable processing of fdiv so we can better test the backend implementations.
92static cl::opt<bool> DisableFDivExpand(
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
95 cl::ReallyHidden,
96 cl::init(Val: false));
97
98static bool hasUnsafeFPMath(const Function &F) {
99 return F.getFnAttribute(Kind: "unsafe-fp-math").getValueAsBool();
100}
101
102class AMDGPUCodeGenPrepareImpl
103 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
104public:
105 Function &F;
106 const GCNSubtarget &ST;
107 const AMDGPUTargetMachine &TM;
108 const TargetLibraryInfo *TLI;
109 AssumptionCache *AC;
110 const DominatorTree *DT;
111 const UniformityInfo &UA;
112 const DataLayout &DL;
113 const bool HasUnsafeFPMath;
114 const bool HasFP32DenormalFlush;
115 bool FlowChanged = false;
116 mutable Function *SqrtF32 = nullptr;
117 mutable Function *LdexpF32 = nullptr;
118
119 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
120
121 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
122 const TargetLibraryInfo *TLI, AssumptionCache *AC,
123 const DominatorTree *DT, const UniformityInfo &UA)
124 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
125 DT(DT), UA(UA), DL(F.getDataLayout()),
126 HasUnsafeFPMath(hasUnsafeFPMath(F)),
127 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
128 DenormalMode::getPreserveSign()) {}
129
130 Function *getSqrtF32() const {
131 if (SqrtF32)
132 return SqrtF32;
133
134 LLVMContext &Ctx = F.getContext();
135 SqrtF32 = Intrinsic::getOrInsertDeclaration(
136 M: F.getParent(), id: Intrinsic::amdgcn_sqrt, Tys: {Type::getFloatTy(C&: Ctx)});
137 return SqrtF32;
138 }
139
140 Function *getLdexpF32() const {
141 if (LdexpF32)
142 return LdexpF32;
143
144 LLVMContext &Ctx = F.getContext();
145 LdexpF32 = Intrinsic::getOrInsertDeclaration(
146 M: F.getParent(), id: Intrinsic::ldexp,
147 Tys: {Type::getFloatTy(C&: Ctx), Type::getInt32Ty(C&: Ctx)});
148 return LdexpF32;
149 }
150
151 bool canBreakPHINode(const PHINode &I);
152
153 /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
154 /// binary operation \p V.
155 ///
156 /// \returns Binary operation \p V.
157 /// \returns \p T's base element bit width.
158 unsigned getBaseElementBitWidth(const Type *T) const;
159
160 /// \returns Equivalent 32 bit integer type for given type \p T. For example,
161 /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
162 /// is returned.
163 Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
164
165 /// \returns True if binary operation \p I is a signed binary operation, false
166 /// otherwise.
167 bool isSigned(const BinaryOperator &I) const;
168
169 /// \returns True if the condition of 'select' operation \p I comes from a
170 /// signed 'icmp' operation, false otherwise.
171 bool isSigned(const SelectInst &I) const;
172
173 /// \returns True if type \p T needs to be promoted to 32 bit integer type,
174 /// false otherwise.
175 bool needsPromotionToI32(const Type *T) const;
176
177 /// Return true if \p T is a legal scalar floating point type.
178 bool isLegalFloatingTy(const Type *T) const;
179
180 /// Wrapper to pass all the arguments to computeKnownFPClass
181 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
182 const Instruction *CtxI) const {
183 return llvm::computeKnownFPClass(V, DL, InterestedClasses: Interested, TLI, AC, CxtI: CtxI, DT);
184 }
185
186 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
187 return HasFP32DenormalFlush ||
188 computeKnownFPClass(V, Interested: fcSubnormal, CtxI).isKnownNeverSubnormal();
189 }
190
191 /// Promotes uniform binary operation \p I to equivalent 32 bit binary
192 /// operation.
193 ///
194 /// \details \p I's base element bit width must be greater than 1 and less
195 /// than or equal 16. Promotion is done by sign or zero extending operands to
196 /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
197 /// truncating the result of 32 bit binary operation back to \p I's original
198 /// type. Division operation is not promoted.
199 ///
200 /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
201 /// false otherwise.
202 bool promoteUniformOpToI32(BinaryOperator &I) const;
203
204 /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
205 ///
206 /// \details \p I's base element bit width must be greater than 1 and less
207 /// than or equal 16. Promotion is done by sign or zero extending operands to
208 /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
209 ///
210 /// \returns True.
211 bool promoteUniformOpToI32(ICmpInst &I) const;
212
213 /// Promotes uniform 'select' operation \p I to 32 bit 'select'
214 /// operation.
215 ///
216 /// \details \p I's base element bit width must be greater than 1 and less
217 /// than or equal 16. Promotion is done by sign or zero extending operands to
218 /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
219 /// result of 32 bit 'select' operation back to \p I's original type.
220 ///
221 /// \returns True.
222 bool promoteUniformOpToI32(SelectInst &I) const;
223
224 /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
225 /// intrinsic.
226 ///
227 /// \details \p I's base element bit width must be greater than 1 and less
228 /// than or equal 16. Promotion is done by zero extending the operand to 32
229 /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
230 /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
231 /// shift amount is 32 minus \p I's base element bit width), and truncating
232 /// the result of the shift operation back to \p I's original type.
233 ///
234 /// \returns True.
235 bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
236
237 /// \returns The minimum number of bits needed to store the value of \Op as an
238 /// unsigned integer. Truncating to this size and then zero-extending to
239 /// the original will not change the value.
240 unsigned numBitsUnsigned(Value *Op) const;
241
242 /// \returns The minimum number of bits needed to store the value of \Op as a
243 /// signed integer. Truncating to this size and then sign-extending to
244 /// the original size will not change the value.
245 unsigned numBitsSigned(Value *Op) const;
246
247 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
248 /// SelectionDAG has an issue where an and asserting the bits are known
249 bool replaceMulWithMul24(BinaryOperator &I) const;
250
251 /// Perform same function as equivalently named function in DAGCombiner. Since
252 /// we expand some divisions here, we need to perform this before obscuring.
253 bool foldBinOpIntoSelect(BinaryOperator &I) const;
254
255 bool divHasSpecialOptimization(BinaryOperator &I,
256 Value *Num, Value *Den) const;
257 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
258 unsigned MaxDivBits, bool Signed) const;
259
260 /// Expands 24 bit div or rem.
261 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
262 Value *Num, Value *Den,
263 bool IsDiv, bool IsSigned) const;
264
265 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
266 Value *Num, Value *Den, unsigned NumBits,
267 bool IsDiv, bool IsSigned) const;
268
269 /// Expands 32 bit div or rem.
270 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
271 Value *Num, Value *Den) const;
272
273 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
274 Value *Num, Value *Den) const;
275 void expandDivRem64(BinaryOperator &I) const;
276
277 /// Widen a scalar load.
278 ///
279 /// \details \p Widen scalar load for uniform, small type loads from constant
280 // memory / to a full 32-bits and then truncate the input to allow a scalar
281 // load instead of a vector load.
282 //
283 /// \returns True.
284
285 bool canWidenScalarExtLoad(LoadInst &I) const;
286
287 Value *matchFractPat(IntrinsicInst &I);
288 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
289
290 bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
291 FastMathFlags SqrtFMF) const;
292
293 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
294 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
295 const Instruction *CtxI) const;
296
297 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
298 FastMathFlags FMF, const Instruction *CtxI) const;
299 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
300 float ReqdAccuracy) const;
301
302 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
303 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
304 Value *RsqOp, const Instruction *FDiv,
305 float ReqdAccuracy) const;
306
307 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
308 Value *Src) const;
309
310 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
311 bool IsNegative) const;
312 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
313 FastMathFlags FMF) const;
314 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
315 FastMathFlags FMF) const;
316
317public:
318 bool visitFDiv(BinaryOperator &I);
319
320 bool visitInstruction(Instruction &I) { return false; }
321 bool visitBinaryOperator(BinaryOperator &I);
322 bool visitLoadInst(LoadInst &I);
323 bool visitICmpInst(ICmpInst &I);
324 bool visitSelectInst(SelectInst &I);
325 bool visitPHINode(PHINode &I);
326 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
327
328 bool visitIntrinsicInst(IntrinsicInst &I);
329 bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
330 bool visitFMinLike(IntrinsicInst &I);
331 bool visitSqrt(IntrinsicInst &I);
332 bool run();
333};
334
335class AMDGPUCodeGenPrepare : public FunctionPass {
336public:
337 static char ID;
338 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
339 void getAnalysisUsage(AnalysisUsage &AU) const override {
340 AU.addRequired<AssumptionCacheTracker>();
341 AU.addRequired<UniformityInfoWrapperPass>();
342 AU.addRequired<TargetLibraryInfoWrapperPass>();
343
344 // FIXME: Division expansion needs to preserve the dominator tree.
345 if (!ExpandDiv64InIR)
346 AU.setPreservesAll();
347 }
348 bool runOnFunction(Function &F) override;
349 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
350};
351
352} // end anonymous namespace
353
354bool AMDGPUCodeGenPrepareImpl::run() {
355 BreakPhiNodesCache.clear();
356 bool MadeChange = false;
357
358 Function::iterator NextBB;
359 for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
360 BasicBlock *BB = &*FI;
361 NextBB = std::next(x: FI);
362
363 BasicBlock::iterator Next;
364 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
365 I = Next) {
366 Next = std::next(x: I);
367
368 MadeChange |= visit(I&: *I);
369
370 if (Next != E) { // Control flow changed
371 BasicBlock *NextInstBB = Next->getParent();
372 if (NextInstBB != BB) {
373 BB = NextInstBB;
374 E = BB->end();
375 FE = F.end();
376 }
377 }
378 }
379 }
380 return MadeChange;
381}
382
383unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
384 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
385
386 if (T->isIntegerTy())
387 return T->getIntegerBitWidth();
388 return cast<VectorType>(Val: T)->getElementType()->getIntegerBitWidth();
389}
390
391Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
392 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
393
394 if (T->isIntegerTy())
395 return B.getInt32Ty();
396 return FixedVectorType::get(ElementType: B.getInt32Ty(), FVTy: cast<FixedVectorType>(Val: T));
397}
398
399bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
400 return I.getOpcode() == Instruction::AShr ||
401 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
402}
403
404bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
405 return isa<ICmpInst>(Val: I.getOperand(i_nocapture: 0)) &&
406 cast<ICmpInst>(Val: I.getOperand(i_nocapture: 0))->isSigned();
407}
408
409bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
410 if (!Widen16BitOps)
411 return false;
412
413 const IntegerType *IntTy = dyn_cast<IntegerType>(Val: T);
414 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
415 return true;
416
417 if (const VectorType *VT = dyn_cast<VectorType>(Val: T)) {
418 // TODO: The set of packed operations is more limited, so may want to
419 // promote some anyway.
420 if (ST.hasVOP3PInsts())
421 return false;
422
423 return needsPromotionToI32(T: VT->getElementType());
424 }
425
426 return false;
427}
428
429bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
430 return Ty->isFloatTy() || Ty->isDoubleTy() ||
431 (Ty->isHalfTy() && ST.has16BitInsts());
432}
433
434// Return true if the op promoted to i32 should have nsw set.
435static bool promotedOpIsNSW(const Instruction &I) {
436 switch (I.getOpcode()) {
437 case Instruction::Shl:
438 case Instruction::Add:
439 case Instruction::Sub:
440 return true;
441 case Instruction::Mul:
442 return I.hasNoUnsignedWrap();
443 default:
444 return false;
445 }
446}
447
448// Return true if the op promoted to i32 should have nuw set.
449static bool promotedOpIsNUW(const Instruction &I) {
450 switch (I.getOpcode()) {
451 case Instruction::Shl:
452 case Instruction::Add:
453 case Instruction::Mul:
454 return true;
455 case Instruction::Sub:
456 return I.hasNoUnsignedWrap();
457 default:
458 return false;
459 }
460}
461
462bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
463 Type *Ty = I.getType();
464 int TySize = DL.getTypeSizeInBits(Ty);
465 Align Alignment = DL.getValueOrABITypeAlignment(Alignment: I.getAlign(), Ty);
466
467 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(I: &I);
468}
469
470bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
471 assert(needsPromotionToI32(I.getType()) &&
472 "I does not need promotion to i32");
473
474 if (I.getOpcode() == Instruction::SDiv ||
475 I.getOpcode() == Instruction::UDiv ||
476 I.getOpcode() == Instruction::SRem ||
477 I.getOpcode() == Instruction::URem)
478 return false;
479
480 IRBuilder<> Builder(&I);
481 Builder.SetCurrentDebugLocation(I.getDebugLoc());
482
483 Type *I32Ty = getI32Ty(B&: Builder, T: I.getType());
484 Value *ExtOp0 = nullptr;
485 Value *ExtOp1 = nullptr;
486 Value *ExtRes = nullptr;
487 Value *TruncRes = nullptr;
488
489 if (isSigned(I)) {
490 ExtOp0 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
491 ExtOp1 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
492 } else {
493 ExtOp0 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
494 ExtOp1 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
495 }
496
497 ExtRes = Builder.CreateBinOp(Opc: I.getOpcode(), LHS: ExtOp0, RHS: ExtOp1);
498 if (Instruction *Inst = dyn_cast<Instruction>(Val: ExtRes)) {
499 if (promotedOpIsNSW(I: cast<Instruction>(Val&: I)))
500 Inst->setHasNoSignedWrap();
501
502 if (promotedOpIsNUW(I: cast<Instruction>(Val&: I)))
503 Inst->setHasNoUnsignedWrap();
504
505 if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(Val: &I))
506 Inst->setIsExact(ExactOp->isExact());
507 }
508
509 TruncRes = Builder.CreateTrunc(V: ExtRes, DestTy: I.getType());
510
511 I.replaceAllUsesWith(V: TruncRes);
512 I.eraseFromParent();
513
514 return true;
515}
516
517bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
518 assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
519 "I does not need promotion to i32");
520
521 IRBuilder<> Builder(&I);
522 Builder.SetCurrentDebugLocation(I.getDebugLoc());
523
524 Type *I32Ty = getI32Ty(B&: Builder, T: I.getOperand(i_nocapture: 0)->getType());
525 Value *ExtOp0 = nullptr;
526 Value *ExtOp1 = nullptr;
527 Value *NewICmp = nullptr;
528
529 if (I.isSigned()) {
530 ExtOp0 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
531 ExtOp1 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
532 } else {
533 ExtOp0 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
534 ExtOp1 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
535 }
536 NewICmp = Builder.CreateICmp(P: I.getPredicate(), LHS: ExtOp0, RHS: ExtOp1);
537
538 I.replaceAllUsesWith(V: NewICmp);
539 I.eraseFromParent();
540
541 return true;
542}
543
544bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
545 assert(needsPromotionToI32(I.getType()) &&
546 "I does not need promotion to i32");
547
548 IRBuilder<> Builder(&I);
549 Builder.SetCurrentDebugLocation(I.getDebugLoc());
550
551 Type *I32Ty = getI32Ty(B&: Builder, T: I.getType());
552 Value *ExtOp1 = nullptr;
553 Value *ExtOp2 = nullptr;
554 Value *ExtRes = nullptr;
555 Value *TruncRes = nullptr;
556
557 if (isSigned(I)) {
558 ExtOp1 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
559 ExtOp2 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 2), DestTy: I32Ty);
560 } else {
561 ExtOp1 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
562 ExtOp2 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 2), DestTy: I32Ty);
563 }
564 ExtRes = Builder.CreateSelect(C: I.getOperand(i_nocapture: 0), True: ExtOp1, False: ExtOp2);
565 TruncRes = Builder.CreateTrunc(V: ExtRes, DestTy: I.getType());
566
567 I.replaceAllUsesWith(V: TruncRes);
568 I.eraseFromParent();
569
570 return true;
571}
572
573bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
574 IntrinsicInst &I) const {
575 assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
576 "I must be bitreverse intrinsic");
577 assert(needsPromotionToI32(I.getType()) &&
578 "I does not need promotion to i32");
579
580 IRBuilder<> Builder(&I);
581 Builder.SetCurrentDebugLocation(I.getDebugLoc());
582
583 Type *I32Ty = getI32Ty(B&: Builder, T: I.getType());
584 Value *ExtOp = Builder.CreateZExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
585 Value *ExtRes =
586 Builder.CreateIntrinsic(ID: Intrinsic::bitreverse, Types: {I32Ty}, Args: {ExtOp});
587 Value *LShrOp =
588 Builder.CreateLShr(LHS: ExtRes, RHS: 32 - getBaseElementBitWidth(T: I.getType()));
589 Value *TruncRes =
590 Builder.CreateTrunc(V: LShrOp, DestTy: I.getType());
591
592 I.replaceAllUsesWith(V: TruncRes);
593 I.eraseFromParent();
594
595 return true;
596}
597
598unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
599 return computeKnownBits(V: Op, DL, AC).countMaxActiveBits();
600}
601
602unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
603 return ComputeMaxSignificantBits(Op, DL, AC);
604}
605
606static void extractValues(IRBuilder<> &Builder,
607 SmallVectorImpl<Value *> &Values, Value *V) {
608 auto *VT = dyn_cast<FixedVectorType>(Val: V->getType());
609 if (!VT) {
610 Values.push_back(Elt: V);
611 return;
612 }
613
614 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
615 Values.push_back(Elt: Builder.CreateExtractElement(Vec: V, Idx: I));
616}
617
618static Value *insertValues(IRBuilder<> &Builder,
619 Type *Ty,
620 SmallVectorImpl<Value *> &Values) {
621 if (!Ty->isVectorTy()) {
622 assert(Values.size() == 1);
623 return Values[0];
624 }
625
626 Value *NewVal = PoisonValue::get(T: Ty);
627 for (int I = 0, E = Values.size(); I != E; ++I)
628 NewVal = Builder.CreateInsertElement(Vec: NewVal, NewElt: Values[I], Idx: I);
629
630 return NewVal;
631}
632
633bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
634 if (I.getOpcode() != Instruction::Mul)
635 return false;
636
637 Type *Ty = I.getType();
638 unsigned Size = Ty->getScalarSizeInBits();
639 if (Size <= 16 && ST.has16BitInsts())
640 return false;
641
642 // Prefer scalar if this could be s_mul_i32
643 if (UA.isUniform(I: &I))
644 return false;
645
646 Value *LHS = I.getOperand(i_nocapture: 0);
647 Value *RHS = I.getOperand(i_nocapture: 1);
648 IRBuilder<> Builder(&I);
649 Builder.SetCurrentDebugLocation(I.getDebugLoc());
650
651 unsigned LHSBits = 0, RHSBits = 0;
652 bool IsSigned = false;
653
654 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(Op: LHS)) <= 24 &&
655 (RHSBits = numBitsUnsigned(Op: RHS)) <= 24) {
656 IsSigned = false;
657
658 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(Op: LHS)) <= 24 &&
659 (RHSBits = numBitsSigned(Op: RHS)) <= 24) {
660 IsSigned = true;
661
662 } else
663 return false;
664
665 SmallVector<Value *, 4> LHSVals;
666 SmallVector<Value *, 4> RHSVals;
667 SmallVector<Value *, 4> ResultVals;
668 extractValues(Builder, Values&: LHSVals, V: LHS);
669 extractValues(Builder, Values&: RHSVals, V: RHS);
670
671 IntegerType *I32Ty = Builder.getInt32Ty();
672 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
673 Type *DstTy = LHSVals[0]->getType();
674
675 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
676 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(V: LHSVals[I], DestTy: I32Ty)
677 : Builder.CreateZExtOrTrunc(V: LHSVals[I], DestTy: I32Ty);
678 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(V: RHSVals[I], DestTy: I32Ty)
679 : Builder.CreateZExtOrTrunc(V: RHSVals[I], DestTy: I32Ty);
680 Intrinsic::ID ID =
681 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
682 Value *Result = Builder.CreateIntrinsic(ID, Types: {IntrinTy}, Args: {LHS, RHS});
683 Result = IsSigned ? Builder.CreateSExtOrTrunc(V: Result, DestTy: DstTy)
684 : Builder.CreateZExtOrTrunc(V: Result, DestTy: DstTy);
685 ResultVals.push_back(Elt: Result);
686 }
687
688 Value *NewVal = insertValues(Builder, Ty, Values&: ResultVals);
689 NewVal->takeName(V: &I);
690 I.replaceAllUsesWith(V: NewVal);
691 I.eraseFromParent();
692
693 return true;
694}
695
696// Find a select instruction, which may have been casted. This is mostly to deal
697// with cases where i16 selects were promoted here to i32.
698static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
699 Cast = nullptr;
700 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: V))
701 return Sel;
702
703 if ((Cast = dyn_cast<CastInst>(Val: V))) {
704 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: Cast->getOperand(i_nocapture: 0)))
705 return Sel;
706 }
707
708 return nullptr;
709}
710
711bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
712 // Don't do this unless the old select is going away. We want to eliminate the
713 // binary operator, not replace a binop with a select.
714 int SelOpNo = 0;
715
716 CastInst *CastOp;
717
718 // TODO: Should probably try to handle some cases with multiple
719 // users. Duplicating the select may be profitable for division.
720 SelectInst *Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 0), Cast&: CastOp);
721 if (!Sel || !Sel->hasOneUse()) {
722 SelOpNo = 1;
723 Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 1), Cast&: CastOp);
724 }
725
726 if (!Sel || !Sel->hasOneUse())
727 return false;
728
729 Constant *CT = dyn_cast<Constant>(Val: Sel->getTrueValue());
730 Constant *CF = dyn_cast<Constant>(Val: Sel->getFalseValue());
731 Constant *CBO = dyn_cast<Constant>(Val: BO.getOperand(i_nocapture: SelOpNo ^ 1));
732 if (!CBO || !CT || !CF)
733 return false;
734
735 if (CastOp) {
736 if (!CastOp->hasOneUse())
737 return false;
738 CT = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CT, DestTy: BO.getType(), DL);
739 CF = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CF, DestTy: BO.getType(), DL);
740 }
741
742 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
743 // need to handle divisions here.
744 Constant *FoldedT =
745 SelOpNo ? ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CT, DL)
746 : ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CT, RHS: CBO, DL);
747 if (!FoldedT || isa<ConstantExpr>(Val: FoldedT))
748 return false;
749
750 Constant *FoldedF =
751 SelOpNo ? ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CF, DL)
752 : ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CF, RHS: CBO, DL);
753 if (!FoldedF || isa<ConstantExpr>(Val: FoldedF))
754 return false;
755
756 IRBuilder<> Builder(&BO);
757 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
758 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Val: &BO))
759 Builder.setFastMathFlags(FPOp->getFastMathFlags());
760
761 Value *NewSelect = Builder.CreateSelect(C: Sel->getCondition(),
762 True: FoldedT, False: FoldedF);
763 NewSelect->takeName(V: &BO);
764 BO.replaceAllUsesWith(V: NewSelect);
765 BO.eraseFromParent();
766 if (CastOp)
767 CastOp->eraseFromParent();
768 Sel->eraseFromParent();
769 return true;
770}
771
772std::pair<Value *, Value *>
773AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
774 Value *Src) const {
775 Type *Ty = Src->getType();
776 Value *Frexp = Builder.CreateIntrinsic(ID: Intrinsic::frexp,
777 Types: {Ty, Builder.getInt32Ty()}, Args: Src);
778 Value *FrexpMant = Builder.CreateExtractValue(Agg: Frexp, Idxs: {0});
779
780 // Bypass the bug workaround for the exponent result since it doesn't matter.
781 // TODO: Does the bug workaround even really need to consider the exponent
782 // result? It's unspecified by the spec.
783
784 Value *FrexpExp =
785 ST.hasFractBug()
786 ? Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_frexp_exp,
787 Types: {Builder.getInt32Ty(), Ty}, Args: Src)
788 : Builder.CreateExtractValue(Agg: Frexp, Idxs: {1});
789 return {FrexpMant, FrexpExp};
790}
791
792/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
793Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
794 Value *Src,
795 bool IsNegative) const {
796 // Same as for 1.0, but expand the sign out of the constant.
797 // -1.0 / x -> rcp (fneg x)
798 if (IsNegative)
799 Src = Builder.CreateFNeg(V: Src);
800
801 // The rcp instruction doesn't support denormals, so scale the input
802 // out of the denormal range and convert at the end.
803 //
804 // Expand as 2^-n * (1.0 / (x * 2^n))
805
806 // TODO: Skip scaling if input is known never denormal and the input
807 // range won't underflow to denormal. The hard part is knowing the
808 // result. We need a range check, the result could be denormal for
809 // 0x1p+126 < den <= 0x1p+127.
810 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
811 Value *ScaleFactor = Builder.CreateNeg(V: FrexpExp);
812 Value *Rcp = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: FrexpMant);
813 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Rcp, ScaleFactor});
814}
815
816/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
817Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
818 Value *RHS,
819 FastMathFlags FMF) const {
820 // If we have have to work around the fract/frexp bug, we're worse off than
821 // using the fdiv.fast expansion. The full safe expansion is faster if we have
822 // fast FMA.
823 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
824 (!FMF.noNaNs() || !FMF.noInfs()))
825 return nullptr;
826
827 // We're scaling the LHS to avoid a denormal input, and scale the denominator
828 // to avoid large values underflowing the result.
829 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, Src: RHS);
830
831 Value *Rcp =
832 Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: FrexpMantRHS);
833
834 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, Src: LHS);
835 Value *Mul = Builder.CreateFMul(L: FrexpMantLHS, R: Rcp);
836
837 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
838 // result.
839 Value *ExpDiff = Builder.CreateSub(LHS: FrexpExpLHS, RHS: FrexpExpRHS);
840 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Mul, ExpDiff});
841}
842
843/// Emit a sqrt that handles denormals and is accurate to 2ulp.
844Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
845 Value *Src,
846 FastMathFlags FMF) const {
847 Type *Ty = Src->getType();
848 APFloat SmallestNormal =
849 APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
850 Value *NeedScale =
851 Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
852
853 ConstantInt *Zero = Builder.getInt32(C: 0);
854 Value *InputScaleFactor =
855 Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: 32), False: Zero);
856
857 Value *Scaled = Builder.CreateCall(Callee: getLdexpF32(), Args: {Src, InputScaleFactor});
858
859 Value *Sqrt = Builder.CreateCall(Callee: getSqrtF32(), Args: Scaled);
860
861 Value *OutputScaleFactor =
862 Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: -16), False: Zero);
863 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Sqrt, OutputScaleFactor});
864}
865
866/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
867static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
868 bool IsNegative) {
869 // bool need_scale = x < 0x1p-126f;
870 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
871 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
872 // rsq(x * input_scale) * output_scale;
873
874 Type *Ty = Src->getType();
875 APFloat SmallestNormal =
876 APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
877 Value *NeedScale =
878 Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
879 Constant *One = ConstantFP::get(Ty, V: 1.0);
880 Constant *InputScale = ConstantFP::get(Ty, V: 0x1.0p+24);
881 Constant *OutputScale =
882 ConstantFP::get(Ty, V: IsNegative ? -0x1.0p+12 : 0x1.0p+12);
883
884 Value *InputScaleFactor = Builder.CreateSelect(C: NeedScale, True: InputScale, False: One);
885
886 Value *ScaledInput = Builder.CreateFMul(L: Src, R: InputScaleFactor);
887 Value *Rsq = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: ScaledInput);
888 Value *OutputScaleFactor = Builder.CreateSelect(
889 C: NeedScale, True: OutputScale, False: IsNegative ? ConstantFP::get(Ty, V: -1.0) : One);
890
891 return Builder.CreateFMul(L: Rsq, R: OutputScaleFactor);
892}
893
894bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
895 FastMathFlags DivFMF,
896 FastMathFlags SqrtFMF) const {
897 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
898 if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
899 return false;
900
901 // v_rsq_f32 gives 1ulp
902 return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
903 SqrtOp->getFPAccuracy() >= 1.0f;
904}
905
906Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
907 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
908 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
909 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
910 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
911
912 // rsq_f16 is accurate to 0.51 ulp.
913 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
914 // rsq_f64 is never accurate.
915 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num);
916 if (!CLHS)
917 return nullptr;
918
919 assert(Den->getType()->isFloatTy());
920
921 bool IsNegative = false;
922
923 // TODO: Handle other numerator values with arcp.
924 if (CLHS->isExactlyValue(V: 1.0) || (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
925 // Add in the sqrt flags.
926 IRBuilder<>::FastMathFlagGuard Guard(Builder);
927 Builder.setFastMathFlags(DivFMF | SqrtFMF);
928
929 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
930 canIgnoreDenormalInput(V: Den, CtxI)) {
931 Value *Result = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: Den);
932 // -1.0 / sqrt(x) -> fneg(rsq(x))
933 return IsNegative ? Builder.CreateFNeg(V: Result) : Result;
934 }
935
936 return emitRsqIEEE1ULP(Builder, Src: Den, IsNegative);
937 }
938
939 return nullptr;
940}
941
942// Optimize fdiv with rcp:
943//
944// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
945// allowed with unsafe-fp-math or afn.
946//
947// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
948Value *
949AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
950 Value *Den, FastMathFlags FMF,
951 const Instruction *CtxI) const {
952 // rcp_f16 is accurate to 0.51 ulp.
953 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
954 // rcp_f64 is never accurate.
955 assert(Den->getType()->isFloatTy());
956
957 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num)) {
958 bool IsNegative = false;
959 if (CLHS->isExactlyValue(V: 1.0) ||
960 (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
961 Value *Src = Den;
962
963 if (HasFP32DenormalFlush || FMF.approxFunc()) {
964 // -1.0 / x -> 1.0 / fneg(x)
965 if (IsNegative)
966 Src = Builder.CreateFNeg(V: Src);
967
968 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
969 // the CI documentation has a worst case error of 1 ulp.
970 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
971 // to use it as long as we aren't trying to use denormals.
972 //
973 // v_rcp_f16 and v_rsq_f16 DO support denormals.
974
975 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
976 // insert rsq intrinsic here.
977
978 // 1.0 / x -> rcp(x)
979 return Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: Src);
980 }
981
982 // TODO: If the input isn't denormal, and we know the input exponent isn't
983 // big enough to introduce a denormal we can avoid the scaling.
984 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
985 }
986 }
987
988 if (FMF.allowReciprocal()) {
989 // x / y -> x * (1.0 / y)
990
991 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
992 // will never underflow.
993 if (HasFP32DenormalFlush || FMF.approxFunc()) {
994 Value *Recip = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: Den);
995 return Builder.CreateFMul(L: Num, R: Recip);
996 }
997
998 Value *Recip = emitRcpIEEE1ULP(Builder, Src: Den, IsNegative: false);
999 return Builder.CreateFMul(L: Num, R: Recip);
1000 }
1001
1002 return nullptr;
1003}
1004
1005// optimize with fdiv.fast:
1006//
1007// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1008//
1009// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1010//
1011// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
1012Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
1013 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
1014 // fdiv.fast can achieve 2.5 ULP accuracy.
1015 if (ReqdAccuracy < 2.5f)
1016 return nullptr;
1017
1018 // Only have fdiv.fast for f32.
1019 assert(Den->getType()->isFloatTy());
1020
1021 bool NumIsOne = false;
1022 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Val: Num)) {
1023 if (CNum->isExactlyValue(V: +1.0) || CNum->isExactlyValue(V: -1.0))
1024 NumIsOne = true;
1025 }
1026
1027 // fdiv does not support denormals. But 1.0/x is always fine to use it.
1028 //
1029 // TODO: This works for any value with a specific known exponent range, don't
1030 // just limit to constant 1.
1031 if (!HasFP32DenormalFlush && !NumIsOne)
1032 return nullptr;
1033
1034 return Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_fdiv_fast, Args: {Num, Den});
1035}
1036
1037Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
1038 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
1039 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
1040 float ReqdDivAccuracy) const {
1041 if (RsqOp) {
1042 Value *Rsq =
1043 optimizeWithRsq(Builder, Num, Den: RsqOp, DivFMF, SqrtFMF, CtxI: FDivInst);
1044 if (Rsq)
1045 return Rsq;
1046 }
1047
1048 Value *Rcp = optimizeWithRcp(Builder, Num, Den, FMF: DivFMF, CtxI: FDivInst);
1049 if (Rcp)
1050 return Rcp;
1051
1052 // In the basic case fdiv_fast has the same instruction count as the frexp div
1053 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
1054 // potentially be fused into a user. Also, materialization of the constants
1055 // can be reused for multiple instances.
1056 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdAccuracy: ReqdDivAccuracy);
1057 if (FDivFast)
1058 return FDivFast;
1059
1060 return emitFrexpDiv(Builder, LHS: Num, RHS: Den, FMF: DivFMF);
1061}
1062
1063// Optimizations is performed based on fpmath, fast math flags as well as
1064// denormals to optimize fdiv with either rcp or fdiv.fast.
1065//
1066// With rcp:
1067// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
1068// allowed with unsafe-fp-math or afn.
1069//
1070// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
1071//
1072// With fdiv.fast:
1073// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1074//
1075// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1076//
1077// NOTE: rcp is the preference in cases that both are legal.
1078bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
1079 if (DisableFDivExpand)
1080 return false;
1081
1082 Type *Ty = FDiv.getType()->getScalarType();
1083 if (!Ty->isFloatTy())
1084 return false;
1085
1086 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
1087 // expansion around them in codegen. f16 is good enough to always use.
1088
1089 const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &FDiv);
1090 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
1091 const float ReqdAccuracy = FPOp->getFPAccuracy();
1092
1093 FastMathFlags SqrtFMF;
1094
1095 Value *Num = FDiv.getOperand(i_nocapture: 0);
1096 Value *Den = FDiv.getOperand(i_nocapture: 1);
1097
1098 Value *RsqOp = nullptr;
1099 auto *DenII = dyn_cast<IntrinsicInst>(Val: Den);
1100 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
1101 DenII->hasOneUse()) {
1102 const auto *SqrtOp = cast<FPMathOperator>(Val: DenII);
1103 SqrtFMF = SqrtOp->getFastMathFlags();
1104 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
1105 RsqOp = SqrtOp->getOperand(i: 0);
1106 }
1107
1108 // Inaccurate rcp is allowed with unsafe-fp-math or afn.
1109 //
1110 // Defer to codegen to handle this.
1111 //
1112 // TODO: Decide on an interpretation for interactions between afn + arcp +
1113 // !fpmath, and make it consistent between here and codegen. For now, defer
1114 // expansion of afn to codegen. The current interpretation is so aggressive we
1115 // don't need any pre-consideration here when we have better information. A
1116 // more conservative interpretation could use handling here.
1117 const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
1118 if (!RsqOp && AllowInaccurateRcp)
1119 return false;
1120
1121 // Defer the correct implementations to codegen.
1122 if (ReqdAccuracy < 1.0f)
1123 return false;
1124
1125 IRBuilder<> Builder(FDiv.getParent(), std::next(x: FDiv.getIterator()));
1126 Builder.setFastMathFlags(DivFMF);
1127 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
1128
1129 SmallVector<Value *, 4> NumVals;
1130 SmallVector<Value *, 4> DenVals;
1131 SmallVector<Value *, 4> RsqDenVals;
1132 extractValues(Builder, Values&: NumVals, V: Num);
1133 extractValues(Builder, Values&: DenVals, V: Den);
1134
1135 if (RsqOp)
1136 extractValues(Builder, Values&: RsqDenVals, V: RsqOp);
1137
1138 SmallVector<Value *, 4> ResultVals(NumVals.size());
1139 for (int I = 0, E = NumVals.size(); I != E; ++I) {
1140 Value *NumElt = NumVals[I];
1141 Value *DenElt = DenVals[I];
1142 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
1143
1144 Value *NewElt =
1145 visitFDivElement(Builder, Num: NumElt, Den: DenElt, DivFMF, SqrtFMF, RsqOp: RsqDenElt,
1146 FDivInst: cast<Instruction>(Val: FPOp), ReqdDivAccuracy: ReqdAccuracy);
1147 if (!NewElt) {
1148 // Keep the original, but scalarized.
1149
1150 // This has the unfortunate side effect of sometimes scalarizing when
1151 // we're not going to do anything.
1152 NewElt = Builder.CreateFDiv(L: NumElt, R: DenElt);
1153 if (auto *NewEltInst = dyn_cast<Instruction>(Val: NewElt))
1154 NewEltInst->copyMetadata(SrcInst: FDiv);
1155 }
1156
1157 ResultVals[I] = NewElt;
1158 }
1159
1160 Value *NewVal = insertValues(Builder, Ty: FDiv.getType(), Values&: ResultVals);
1161
1162 if (NewVal) {
1163 FDiv.replaceAllUsesWith(V: NewVal);
1164 NewVal->takeName(V: &FDiv);
1165 RecursivelyDeleteTriviallyDeadInstructions(V: &FDiv, TLI);
1166 }
1167
1168 return true;
1169}
1170
1171static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1172 Value *LHS, Value *RHS) {
1173 Type *I32Ty = Builder.getInt32Ty();
1174 Type *I64Ty = Builder.getInt64Ty();
1175
1176 Value *LHS_EXT64 = Builder.CreateZExt(V: LHS, DestTy: I64Ty);
1177 Value *RHS_EXT64 = Builder.CreateZExt(V: RHS, DestTy: I64Ty);
1178 Value *MUL64 = Builder.CreateMul(LHS: LHS_EXT64, RHS: RHS_EXT64);
1179 Value *Lo = Builder.CreateTrunc(V: MUL64, DestTy: I32Ty);
1180 Value *Hi = Builder.CreateLShr(LHS: MUL64, RHS: Builder.getInt64(C: 32));
1181 Hi = Builder.CreateTrunc(V: Hi, DestTy: I32Ty);
1182 return std::pair(Lo, Hi);
1183}
1184
1185static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1186 return getMul64(Builder, LHS, RHS).second;
1187}
1188
1189/// Figure out how many bits are really needed for this division.
1190/// \p MaxDivBits is an optimization hint to bypass the second
1191/// ComputeNumSignBits/computeKnownBits call if the first one is
1192/// insufficient.
1193unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1194 Value *Den,
1195 unsigned MaxDivBits,
1196 bool IsSigned) const {
1197 assert(Num->getType()->getScalarSizeInBits() ==
1198 Den->getType()->getScalarSizeInBits());
1199 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1200 if (IsSigned) {
1201 unsigned RHSSignBits = ComputeNumSignBits(Op: Den, DL, AC, CxtI: &I);
1202 // A sign bit needs to be reserved for shrinking.
1203 unsigned DivBits = SSBits - RHSSignBits + 1;
1204 if (DivBits > MaxDivBits)
1205 return SSBits;
1206
1207 unsigned LHSSignBits = ComputeNumSignBits(Op: Num, DL, AC, CxtI: &I);
1208
1209 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
1210 DivBits = SSBits - SignBits + 1;
1211 return DivBits;
1212 }
1213
1214 // All bits are used for unsigned division for Num or Den in range
1215 // (SignedMax, UnsignedMax].
1216 KnownBits Known = computeKnownBits(V: Den, DL, AC, CxtI: &I);
1217 if (Known.isNegative() || !Known.isNonNegative())
1218 return SSBits;
1219 unsigned RHSSignBits = Known.countMinLeadingZeros();
1220 unsigned DivBits = SSBits - RHSSignBits;
1221 if (DivBits > MaxDivBits)
1222 return SSBits;
1223
1224 Known = computeKnownBits(V: Num, DL, AC, CxtI: &I);
1225 if (Known.isNegative() || !Known.isNonNegative())
1226 return SSBits;
1227 unsigned LHSSignBits = Known.countMinLeadingZeros();
1228
1229 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
1230 DivBits = SSBits - SignBits;
1231 return DivBits;
1232}
1233
1234// The fractional part of a float is enough to accurately represent up to
1235// a 24-bit signed integer.
1236Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1237 BinaryOperator &I, Value *Num,
1238 Value *Den, bool IsDiv,
1239 bool IsSigned) const {
1240 unsigned DivBits = getDivNumBits(I, Num, Den, MaxDivBits: 24, IsSigned);
1241 if (DivBits > 24)
1242 return nullptr;
1243 return expandDivRem24Impl(Builder, I, Num, Den, NumBits: DivBits, IsDiv, IsSigned);
1244}
1245
1246Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1247 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1248 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1249 Type *I32Ty = Builder.getInt32Ty();
1250 Num = Builder.CreateTrunc(V: Num, DestTy: I32Ty);
1251 Den = Builder.CreateTrunc(V: Den, DestTy: I32Ty);
1252
1253 Type *F32Ty = Builder.getFloatTy();
1254 ConstantInt *One = Builder.getInt32(C: 1);
1255 Value *JQ = One;
1256
1257 if (IsSigned) {
1258 // char|short jq = ia ^ ib;
1259 JQ = Builder.CreateXor(LHS: Num, RHS: Den);
1260
1261 // jq = jq >> (bitsize - 2)
1262 JQ = Builder.CreateAShr(LHS: JQ, RHS: Builder.getInt32(C: 30));
1263
1264 // jq = jq | 0x1
1265 JQ = Builder.CreateOr(LHS: JQ, RHS: One);
1266 }
1267
1268 // int ia = (int)LHS;
1269 Value *IA = Num;
1270
1271 // int ib, (int)RHS;
1272 Value *IB = Den;
1273
1274 // float fa = (float)ia;
1275 Value *FA = IsSigned ? Builder.CreateSIToFP(V: IA, DestTy: F32Ty)
1276 : Builder.CreateUIToFP(V: IA, DestTy: F32Ty);
1277
1278 // float fb = (float)ib;
1279 Value *FB = IsSigned ? Builder.CreateSIToFP(V: IB,DestTy: F32Ty)
1280 : Builder.CreateUIToFP(V: IB,DestTy: F32Ty);
1281
1282 Value *RCP = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_rcp,
1283 Types: Builder.getFloatTy(), Args: {FB});
1284 Value *FQM = Builder.CreateFMul(L: FA, R: RCP);
1285
1286 // fq = trunc(fqm);
1287 CallInst *FQ = Builder.CreateUnaryIntrinsic(ID: Intrinsic::trunc, V: FQM);
1288 FQ->copyFastMathFlags(FMF: Builder.getFastMathFlags());
1289
1290 // float fqneg = -fq;
1291 Value *FQNeg = Builder.CreateFNeg(V: FQ);
1292
1293 // float fr = mad(fqneg, fb, fa);
1294 auto FMAD = !ST.hasMadMacF32Insts()
1295 ? Intrinsic::fma
1296 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1297 Value *FR = Builder.CreateIntrinsic(ID: FMAD,
1298 Types: {FQNeg->getType()}, Args: {FQNeg, FB, FA}, FMFSource: FQ);
1299
1300 // int iq = (int)fq;
1301 Value *IQ = IsSigned ? Builder.CreateFPToSI(V: FQ, DestTy: I32Ty)
1302 : Builder.CreateFPToUI(V: FQ, DestTy: I32Ty);
1303
1304 // fr = fabs(fr);
1305 FR = Builder.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: FR, FMFSource: FQ);
1306
1307 // fb = fabs(fb);
1308 FB = Builder.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: FB, FMFSource: FQ);
1309
1310 // int cv = fr >= fb;
1311 Value *CV = Builder.CreateFCmpOGE(LHS: FR, RHS: FB);
1312
1313 // jq = (cv ? jq : 0);
1314 JQ = Builder.CreateSelect(C: CV, True: JQ, False: Builder.getInt32(C: 0));
1315
1316 // dst = iq + jq;
1317 Value *Div = Builder.CreateAdd(LHS: IQ, RHS: JQ);
1318
1319 Value *Res = Div;
1320 if (!IsDiv) {
1321 // Rem needs compensation, it's easier to recompute it
1322 Value *Rem = Builder.CreateMul(LHS: Div, RHS: Den);
1323 Res = Builder.CreateSub(LHS: Num, RHS: Rem);
1324 }
1325
1326 if (DivBits != 0 && DivBits < 32) {
1327 // Extend in register from the number of bits this divide really is.
1328 if (IsSigned) {
1329 int InRegBits = 32 - DivBits;
1330
1331 Res = Builder.CreateShl(LHS: Res, RHS: InRegBits);
1332 Res = Builder.CreateAShr(LHS: Res, RHS: InRegBits);
1333 } else {
1334 ConstantInt *TruncMask
1335 = Builder.getInt32(C: (UINT64_C(1) << DivBits) - 1);
1336 Res = Builder.CreateAnd(LHS: Res, RHS: TruncMask);
1337 }
1338 }
1339
1340 return Res;
1341}
1342
1343// Try to recognize special cases the DAG will emit special, better expansions
1344// than the general expansion we do here.
1345
1346// TODO: It would be better to just directly handle those optimizations here.
1347bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1348 Value *Num,
1349 Value *Den) const {
1350 if (Constant *C = dyn_cast<Constant>(Val: Den)) {
1351 // Arbitrary constants get a better expansion as long as a wider mulhi is
1352 // legal.
1353 if (C->getType()->getScalarSizeInBits() <= 32)
1354 return true;
1355
1356 // TODO: Sdiv check for not exact for some reason.
1357
1358 // If there's no wider mulhi, there's only a better expansion for powers of
1359 // two.
1360 // TODO: Should really know for each vector element.
1361 if (isKnownToBeAPowerOfTwo(V: C, DL, OrZero: true, AC, CxtI: &I, DT))
1362 return true;
1363
1364 return false;
1365 }
1366
1367 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Val: Den)) {
1368 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1369 if (BinOpDen->getOpcode() == Instruction::Shl &&
1370 isa<Constant>(Val: BinOpDen->getOperand(i_nocapture: 0)) &&
1371 isKnownToBeAPowerOfTwo(V: BinOpDen->getOperand(i_nocapture: 0), DL, OrZero: true, AC, CxtI: &I, DT)) {
1372 return true;
1373 }
1374 }
1375
1376 return false;
1377}
1378
1379static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1380 // Check whether the sign can be determined statically.
1381 KnownBits Known = computeKnownBits(V, DL);
1382 if (Known.isNegative())
1383 return Constant::getAllOnesValue(Ty: V->getType());
1384 if (Known.isNonNegative())
1385 return Constant::getNullValue(Ty: V->getType());
1386 return Builder.CreateAShr(LHS: V, RHS: Builder.getInt32(C: 31));
1387}
1388
1389Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1390 BinaryOperator &I, Value *X,
1391 Value *Y) const {
1392 Instruction::BinaryOps Opc = I.getOpcode();
1393 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1394 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1395
1396 FastMathFlags FMF;
1397 FMF.setFast();
1398 Builder.setFastMathFlags(FMF);
1399
1400 if (divHasSpecialOptimization(I, Num: X, Den: Y))
1401 return nullptr; // Keep it for later optimization.
1402
1403 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1404 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1405
1406 Type *Ty = X->getType();
1407 Type *I32Ty = Builder.getInt32Ty();
1408 Type *F32Ty = Builder.getFloatTy();
1409
1410 if (Ty->getScalarSizeInBits() != 32) {
1411 if (IsSigned) {
1412 X = Builder.CreateSExtOrTrunc(V: X, DestTy: I32Ty);
1413 Y = Builder.CreateSExtOrTrunc(V: Y, DestTy: I32Ty);
1414 } else {
1415 X = Builder.CreateZExtOrTrunc(V: X, DestTy: I32Ty);
1416 Y = Builder.CreateZExtOrTrunc(V: Y, DestTy: I32Ty);
1417 }
1418 }
1419
1420 if (Value *Res = expandDivRem24(Builder, I, Num: X, Den: Y, IsDiv, IsSigned)) {
1421 return IsSigned ? Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty) :
1422 Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
1423 }
1424
1425 ConstantInt *Zero = Builder.getInt32(C: 0);
1426 ConstantInt *One = Builder.getInt32(C: 1);
1427
1428 Value *Sign = nullptr;
1429 if (IsSigned) {
1430 Value *SignX = getSign32(V: X, Builder, DL);
1431 Value *SignY = getSign32(V: Y, Builder, DL);
1432 // Remainder sign is the same as LHS
1433 Sign = IsDiv ? Builder.CreateXor(LHS: SignX, RHS: SignY) : SignX;
1434
1435 X = Builder.CreateAdd(LHS: X, RHS: SignX);
1436 Y = Builder.CreateAdd(LHS: Y, RHS: SignY);
1437
1438 X = Builder.CreateXor(LHS: X, RHS: SignX);
1439 Y = Builder.CreateXor(LHS: Y, RHS: SignY);
1440 }
1441
1442 // The algorithm here is based on ideas from "Software Integer Division", Tom
1443 // Rodeheffer, August 2008.
1444 //
1445 // unsigned udiv(unsigned x, unsigned y) {
1446 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1447 // // that this is a lower bound on inv(y), even if some of the calculations
1448 // // round up.
1449 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1450 //
1451 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1452 // // Empirically this is guaranteed to give a "two-y" lower bound on
1453 // // inv(y).
1454 // z += umulh(z, -y * z);
1455 //
1456 // // Quotient/remainder estimate.
1457 // unsigned q = umulh(x, z);
1458 // unsigned r = x - q * y;
1459 //
1460 // // Two rounds of quotient/remainder refinement.
1461 // if (r >= y) {
1462 // ++q;
1463 // r -= y;
1464 // }
1465 // if (r >= y) {
1466 // ++q;
1467 // r -= y;
1468 // }
1469 //
1470 // return q;
1471 // }
1472
1473 // Initial estimate of inv(y).
1474 Value *FloatY = Builder.CreateUIToFP(V: Y, DestTy: F32Ty);
1475 Value *RcpY = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_rcp, Types: F32Ty, Args: {FloatY});
1476 Constant *Scale = ConstantFP::get(Ty: F32Ty, V: llvm::bit_cast<float>(from: 0x4F7FFFFE));
1477 Value *ScaledY = Builder.CreateFMul(L: RcpY, R: Scale);
1478 Value *Z = Builder.CreateFPToUI(V: ScaledY, DestTy: I32Ty);
1479
1480 // One round of UNR.
1481 Value *NegY = Builder.CreateSub(LHS: Zero, RHS: Y);
1482 Value *NegYZ = Builder.CreateMul(LHS: NegY, RHS: Z);
1483 Z = Builder.CreateAdd(LHS: Z, RHS: getMulHu(Builder, LHS: Z, RHS: NegYZ));
1484
1485 // Quotient/remainder estimate.
1486 Value *Q = getMulHu(Builder, LHS: X, RHS: Z);
1487 Value *R = Builder.CreateSub(LHS: X, RHS: Builder.CreateMul(LHS: Q, RHS: Y));
1488
1489 // First quotient/remainder refinement.
1490 Value *Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
1491 if (IsDiv)
1492 Q = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
1493 R = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);
1494
1495 // Second quotient/remainder refinement.
1496 Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
1497 Value *Res;
1498 if (IsDiv)
1499 Res = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
1500 else
1501 Res = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);
1502
1503 if (IsSigned) {
1504 Res = Builder.CreateXor(LHS: Res, RHS: Sign);
1505 Res = Builder.CreateSub(LHS: Res, RHS: Sign);
1506 Res = Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty);
1507 } else {
1508 Res = Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
1509 }
1510 return Res;
1511}
1512
1513Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1514 BinaryOperator &I, Value *Num,
1515 Value *Den) const {
1516 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1517 return nullptr; // Keep it for later optimization.
1518
1519 Instruction::BinaryOps Opc = I.getOpcode();
1520
1521 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1522 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1523
1524 unsigned NumDivBits = getDivNumBits(I, Num, Den, MaxDivBits: 32, IsSigned);
1525 if (NumDivBits > 32)
1526 return nullptr;
1527
1528 Value *Narrowed = nullptr;
1529 if (NumDivBits <= 24) {
1530 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, DivBits: NumDivBits,
1531 IsDiv, IsSigned);
1532 } else if (NumDivBits <= 32) {
1533 Narrowed = expandDivRem32(Builder, I, X: Num, Y: Den);
1534 }
1535
1536 if (Narrowed) {
1537 return IsSigned ? Builder.CreateSExt(V: Narrowed, DestTy: Num->getType()) :
1538 Builder.CreateZExt(V: Narrowed, DestTy: Num->getType());
1539 }
1540
1541 return nullptr;
1542}
1543
1544void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1545 Instruction::BinaryOps Opc = I.getOpcode();
1546 // Do the general expansion.
1547 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1548 expandDivisionUpTo64Bits(Div: &I);
1549 return;
1550 }
1551
1552 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1553 expandRemainderUpTo64Bits(Rem: &I);
1554 return;
1555 }
1556
1557 llvm_unreachable("not a division");
1558}
1559
1560/*
1561 This can cause an inconsistency for non-byte loads, for example:
1562```
1563 %load = load i1, ptr addrspace(4) %arg, align 4
1564 %zext = zext i1 %load to i64
1565 %add = add i64 %zext, ...
1566```
1567Instead of creating `s_and_b32 s0, s0, 1`,
1568it will create `s_and_b32 s0, s0, 0xff`.
1569We accept this change since the non-byte load assumes the upper bits
1570within the byte are all 0.
1571*/
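// A minimal sketch (value names invented, not taken from an actual test) of
// the rewrite performed below for an i64 add whose known bits fit in 32 bits:
//
//   %r = add i64 %a, %b
//     -->
//   %a.tr = trunc i64 %a to i32
//   %b.tr = trunc i64 %b to i32
//   %r.tr = add i32 %a.tr, %b.tr
//   %r = zext i32 %r.tr to i64
//
// The narrowing is only done when the cost model says the narrow op plus the
// trunc/zext casts are cheaper than the original wide op.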
1572static bool tryNarrowMathIfNoOverflow(Instruction *I,
1573 const SITargetLowering *TLI,
1574 const TargetTransformInfo &TTI,
1575 const DataLayout &DL) {
1576 unsigned Opc = I->getOpcode();
1577 Type *OldType = I->getType();
1578
1579 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1580 return false;
1581
1582 unsigned OrigBit = OldType->getScalarSizeInBits();
1583
1584 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1585 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1586 "Instruction::Mul.");
1587
1588 unsigned MaxBitsNeeded = computeKnownBits(V: I, DL).countMaxActiveBits();
1589
1590 MaxBitsNeeded = std::max<unsigned>(a: bit_ceil(Value: MaxBitsNeeded), b: 8);
1591 Type *NewType = DL.getSmallestLegalIntType(C&: I->getContext(), Width: MaxBitsNeeded);
1592 if (!NewType)
1593 return false;
1594 unsigned NewBit = NewType->getIntegerBitWidth();
1595 if (NewBit >= OrigBit)
1596 return false;
1597 NewType = I->getType()->getWithNewBitWidth(NewBitWidth: NewBit);
1598
1599 // Old cost
1600 InstructionCost OldCost =
1601 TTI.getArithmeticInstrCost(Opcode: Opc, Ty: OldType, CostKind: TTI::TCK_RecipThroughput);
1602 // New cost of new op
1603 InstructionCost NewCost =
1604 TTI.getArithmeticInstrCost(Opcode: Opc, Ty: NewType, CostKind: TTI::TCK_RecipThroughput);
1605 // New cost of narrowing 2 operands (use trunc)
1606 int NumOfNonConstOps = 2;
1607 if (isa<Constant>(Val: I->getOperand(i: 0)) || isa<Constant>(Val: I->getOperand(i: 1))) {
1608 // They cannot both be constant; that case would already have been folded.
1609 NumOfNonConstOps = 1;
1610 }
1611 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Opcode: Instruction::Trunc,
1612 Dst: NewType, Src: OldType,
1613 CCH: TTI.getCastContextHint(I),
1614 CostKind: TTI::TCK_RecipThroughput);
1615 // New cost of zext narrowed result to original type
1616 NewCost +=
1617 TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: OldType, Src: NewType,
1618 CCH: TTI.getCastContextHint(I), CostKind: TTI::TCK_RecipThroughput);
1619 if (NewCost >= OldCost)
1620 return false;
1621
1622 IRBuilder<> Builder(I);
1623 Value *Trunc0 = Builder.CreateTrunc(V: I->getOperand(i: 0), DestTy: NewType);
1624 Value *Trunc1 = Builder.CreateTrunc(V: I->getOperand(i: 1), DestTy: NewType);
1625 Value *Arith =
1626 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opc, LHS: Trunc0, RHS: Trunc1);
1627
1628 Value *Zext = Builder.CreateZExt(V: Arith, DestTy: OldType);
1629 I->replaceAllUsesWith(V: Zext);
1630 I->eraseFromParent();
1631 return true;
1632}
1633
1634bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1635 if (foldBinOpIntoSelect(BO&: I))
1636 return true;
1637
1638 if (ST.has16BitInsts() && needsPromotionToI32(T: I.getType()) &&
1639 UA.isUniform(I: &I) && promoteUniformOpToI32(I))
1640 return true;
1641
1642 if (UseMul24Intrin && replaceMulWithMul24(I))
1643 return true;
1644 if (tryNarrowMathIfNoOverflow(I: &I, TLI: ST.getTargetLowering(),
1645 TTI: TM.getTargetTransformInfo(F), DL))
1646 return true;
1647
1648 bool Changed = false;
1649 Instruction::BinaryOps Opc = I.getOpcode();
1650 Type *Ty = I.getType();
1651 Value *NewDiv = nullptr;
1652 unsigned ScalarSize = Ty->getScalarSizeInBits();
1653
1654 SmallVector<BinaryOperator *, 8> Div64ToExpand;
1655
1656 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1657 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1658 ScalarSize <= 64 &&
1659 !DisableIDivExpand) {
1660 Value *Num = I.getOperand(i_nocapture: 0);
1661 Value *Den = I.getOperand(i_nocapture: 1);
1662 IRBuilder<> Builder(&I);
1663 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1664
1665 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1666 NewDiv = PoisonValue::get(T: VT);
1667
1668 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1669 Value *NumEltN = Builder.CreateExtractElement(Vec: Num, Idx: N);
1670 Value *DenEltN = Builder.CreateExtractElement(Vec: Den, Idx: N);
1671
1672 Value *NewElt;
1673 if (ScalarSize <= 32) {
1674 NewElt = expandDivRem32(Builder, I, X: NumEltN, Y: DenEltN);
1675 if (!NewElt)
1676 NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
1677 } else {
1678 // See if this 64-bit division can be shrunk to 32/24-bits before
1679 // producing the general expansion.
1680 NewElt = shrinkDivRem64(Builder, I, Num: NumEltN, Den: DenEltN);
1681 if (!NewElt) {
1682 // The general 64-bit expansion introduces control flow and doesn't
1683 // return the new value. Just insert a scalar copy and defer
1684 // expanding it.
1685 NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
1686 // CreateBinOp does constant folding. If the operands are constant,
1687 // it will return a Constant instead of a BinaryOperator.
1688 if (auto *NewEltBO = dyn_cast<BinaryOperator>(Val: NewElt))
1689 Div64ToExpand.push_back(Elt: NewEltBO);
1690 }
1691 }
1692
1693 if (auto *NewEltI = dyn_cast<Instruction>(Val: NewElt))
1694 NewEltI->copyIRFlags(V: &I);
1695
1696 NewDiv = Builder.CreateInsertElement(Vec: NewDiv, NewElt, Idx: N);
1697 }
1698 } else {
1699 if (ScalarSize <= 32)
1700 NewDiv = expandDivRem32(Builder, I, X: Num, Y: Den);
1701 else {
1702 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1703 if (!NewDiv)
1704 Div64ToExpand.push_back(Elt: &I);
1705 }
1706 }
1707
1708 if (NewDiv) {
1709 I.replaceAllUsesWith(V: NewDiv);
1710 I.eraseFromParent();
1711 Changed = true;
1712 }
1713 }
1714
1715 if (ExpandDiv64InIR) {
1716 // TODO: We get much worse code in specially handled constant cases.
1717 for (BinaryOperator *Div : Div64ToExpand) {
1718 expandDivRem64(I&: *Div);
1719 FlowChanged = true;
1720 Changed = true;
1721 }
1722 }
1723
1724 return Changed;
1725}
1726
1727bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1728 if (!WidenLoads)
1729 return false;
1730
1731 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1732 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1733 canWidenScalarExtLoad(I)) {
1734 IRBuilder<> Builder(&I);
1735 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1736
1737 Type *I32Ty = Builder.getInt32Ty();
1738 LoadInst *WidenLoad = Builder.CreateLoad(Ty: I32Ty, Ptr: I.getPointerOperand());
1739 WidenLoad->copyMetadata(SrcInst: I);
1740
1741 // If we have range metadata, we need to convert the type, and not make
1742 // assumptions about the high bits.
1743 if (auto *Range = WidenLoad->getMetadata(KindID: LLVMContext::MD_range)) {
1744 ConstantInt *Lower =
1745 mdconst::extract<ConstantInt>(MD: Range->getOperand(I: 0));
1746
1747 if (Lower->isNullValue()) {
1748 WidenLoad->setMetadata(KindID: LLVMContext::MD_range, Node: nullptr);
1749 } else {
1750 Metadata *LowAndHigh[] = {
1751 ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: Lower->getValue().zext(width: 32))),
1752 // Don't make assumptions about the high bits.
1753 ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: 0))
1754 };
1755
1756 WidenLoad->setMetadata(KindID: LLVMContext::MD_range,
1757 Node: MDNode::get(Context&: F.getContext(), MDs: LowAndHigh));
1758 }
1759 }
1760
1761 int TySize = DL.getTypeSizeInBits(Ty: I.getType());
1762 Type *IntNTy = Builder.getIntNTy(N: TySize);
1763 Value *ValTrunc = Builder.CreateTrunc(V: WidenLoad, DestTy: IntNTy);
1764 Value *ValOrig = Builder.CreateBitCast(V: ValTrunc, DestTy: I.getType());
1765 I.replaceAllUsesWith(V: ValOrig);
1766 I.eraseFromParent();
1767 return true;
1768 }
1769
1770 return false;
1771}
1772
1773bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
1774 bool Changed = false;
1775
1776 if (ST.has16BitInsts() && needsPromotionToI32(T: I.getOperand(i_nocapture: 0)->getType()) &&
1777 UA.isUniform(I: &I))
1778 Changed |= promoteUniformOpToI32(I);
1779
1780 return Changed;
1781}
1782
1783bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1784 Value *Cond = I.getCondition();
1785 Value *TrueVal = I.getTrueValue();
1786 Value *FalseVal = I.getFalseValue();
1787 Value *CmpVal;
1788 CmpPredicate Pred;
1789
1790 if (ST.has16BitInsts() && needsPromotionToI32(T: I.getType())) {
1791 if (UA.isUniform(I: &I))
1792 return promoteUniformOpToI32(I);
1793 return false;
1794 }
1795
1796 // Match fract pattern with nan check.
1797 if (!match(V: Cond, P: m_FCmp(Pred, L: m_Value(V&: CmpVal), R: m_NonNaN())))
1798 return false;
1799
1800 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: &I);
1801 if (!FPOp)
1802 return false;
1803
1804 IRBuilder<> Builder(&I);
1805 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1806
1807 auto *IITrue = dyn_cast<IntrinsicInst>(Val: TrueVal);
1808 auto *IIFalse = dyn_cast<IntrinsicInst>(Val: FalseVal);
1809
1810 Value *Fract = nullptr;
1811 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1812 CmpVal == matchFractPat(I&: *IIFalse)) {
1813 // isnan(x) ? x : fract(x)
1814 Fract = applyFractPat(Builder, FractArg: CmpVal);
1815 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1816 CmpVal == matchFractPat(I&: *IITrue)) {
1817 // !isnan(x) ? fract(x) : x
1818 Fract = applyFractPat(Builder, FractArg: CmpVal);
1819 } else
1820 return false;
1821
1822 Fract->takeName(V: &I);
1823 I.replaceAllUsesWith(V: Fract);
1824 RecursivelyDeleteTriviallyDeadInstructions(V: &I, TLI);
1825 return true;
1826}
1827
1828static bool areInSameBB(const Value *A, const Value *B) {
1829 const auto *IA = dyn_cast<Instruction>(Val: A);
1830 const auto *IB = dyn_cast<Instruction>(Val: B);
1831 return IA && IB && IA->getParent() == IB->getParent();
1832}
1833
1834// Helper for breaking large PHIs that returns true when an extractelement on V
1835// is likely to be folded away by the DAG combiner.
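// For illustration (value names invented), an incoming value defined in the
// predecessor block as
//   %v0 = insertelement <4 x float> poison, float %a, i64 0
//   %v1 = insertelement <4 x float> %v0, float %b, i64 1
//   %v2 = insertelement <4 x float> %v1, float %c, i64 2
//   %v3 = insertelement <4 x float> %v2, float %d, i64 3
// is considered interesting: every lane is written by a constant-index
// insertelement, so the extractelements added when breaking the PHI are
// expected to fold away.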
1836static bool isInterestingPHIIncomingValue(const Value *V) {
1837 const auto *FVT = dyn_cast<FixedVectorType>(Val: V->getType());
1838 if (!FVT)
1839 return false;
1840
1841 const Value *CurVal = V;
1842
1843 // Check for insertelements, keeping track of the elements covered.
1844 BitVector EltsCovered(FVT->getNumElements());
1845 while (const auto *IE = dyn_cast<InsertElementInst>(Val: CurVal)) {
1846 const auto *Idx = dyn_cast<ConstantInt>(Val: IE->getOperand(i_nocapture: 2));
1847
1848 // Non-constant or out-of-bounds index -> folding is unlikely.
1849 // The latter is more of a sanity check because canonical IR should just
1850 // have replaced those with poison.
1851 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1852 return false;
1853
1854 const auto *VecSrc = IE->getOperand(i_nocapture: 0);
1855
1856 // If the vector source is another instruction, it must be in the same basic
1857 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1858 // unlikely to be able to do anything interesting here.
1859 if (isa<Instruction>(Val: VecSrc) && !areInSameBB(A: VecSrc, B: IE))
1860 return false;
1861
1862 CurVal = VecSrc;
1863 EltsCovered.set(Idx->getZExtValue());
1864
1865 // All elements covered.
1866 if (EltsCovered.all())
1867 return true;
1868 }
1869
1870 // We either didn't find a single insertelement, or the insertelement chain
1871 // ended before all elements were covered. Check for other interesting values.
1872
1873 // Constants are always interesting because we can just constant fold the
1874 // extractelements.
1875 if (isa<Constant>(Val: CurVal))
1876 return true;
1877
1878 // shufflevector is likely to be profitable if either operand is a constant,
1879 // or if either source is in the same block.
1880 // This is because shufflevector is most often lowered as a series of
1881 // insert/extract elements anyway.
1882 if (const auto *SV = dyn_cast<ShuffleVectorInst>(Val: CurVal)) {
1883 return isa<Constant>(Val: SV->getOperand(i_nocapture: 1)) ||
1884 areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 0)) ||
1885 areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 1));
1886 }
1887
1888 return false;
1889}
1890
1891static void collectPHINodes(const PHINode &I,
1892 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1893 const auto [It, Inserted] = SeenPHIs.insert(Ptr: &I);
1894 if (!Inserted)
1895 return;
1896
1897 for (const Value *Inc : I.incoming_values()) {
1898 if (const auto *PhiInc = dyn_cast<PHINode>(Val: Inc))
1899 collectPHINodes(I: *PhiInc, SeenPHIs);
1900 }
1901
1902 for (const User *U : I.users()) {
1903 if (const auto *PhiU = dyn_cast<PHINode>(Val: U))
1904 collectPHINodes(I: *PhiU, SeenPHIs);
1905 }
1906}
1907
1908bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1909 // Check in the cache first.
1910 if (const auto It = BreakPhiNodesCache.find(Val: &I);
1911 It != BreakPhiNodesCache.end())
1912 return It->second;
1913
1914 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1915 // recursively consider all its users and incoming values that are also PHI
1916 // nodes. We then make a decision about all of those PHIs at once. Either they
1917 // all get broken up, or none of them do. That way, we avoid cases where a
1918 // single PHI is/is not broken and we end up reforming/exploding a vector
1919 // multiple times, or even worse, doing it in a loop.
1920 SmallPtrSet<const PHINode *, 8> WorkList;
1921 collectPHINodes(I, SeenPHIs&: WorkList);
1922
1923#ifndef NDEBUG
1924 // Check that none of the PHI nodes in the worklist are in the map. If some of
1925 // them are, it means we're not good enough at collecting related PHIs.
1926 for (const PHINode *WLP : WorkList) {
1927 assert(BreakPhiNodesCache.count(WLP) == 0);
1928 }
1929#endif
1930
1931 // To consider a PHI profitable to break, we need to see some interesting
1932 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1933 // must have one to consider all PHIs breakable.
1934 //
1935 // This threshold has been determined through performance testing.
1936 //
1937 // Note that the computation below is equivalent to
1938 //
1939 // (unsigned)ceil((K / 3.0) * 2)
1940 //
1941 // It's simply written this way to avoid mixing integral/FP arithmetic.
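// For example, with a worklist of 5 PHIs: alignTo(5 * 2, 3) / 3 == 12 / 3 == 4,
// which matches ceil((5 / 3.0) * 2) == 4.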
1942 const auto Threshold = (alignTo(Value: WorkList.size() * 2, Align: 3) / 3);
1943 unsigned NumBreakablePHIs = 0;
1944 bool CanBreak = false;
1945 for (const PHINode *Cur : WorkList) {
1946 // Don't break PHIs that have no interesting incoming values. That is, where
1947 // there is no clear opportunity to fold the "extractelement" instructions
1948 // we would add.
1949 //
1950 // Note: InstCombine does not run after this pass, so we're only interested in the
1951 // foldings that the DAG combiner can do.
1952 if (any_of(Range: Cur->incoming_values(), P: isInterestingPHIIncomingValue)) {
1953 if (++NumBreakablePHIs >= Threshold) {
1954 CanBreak = true;
1955 break;
1956 }
1957 }
1958 }
1959
1960 for (const PHINode *Cur : WorkList)
1961 BreakPhiNodesCache[Cur] = CanBreak;
1962
1963 return CanBreak;
1964}
1965
1966/// Helper class for "break large PHIs" (visitPHINode).
1967///
1968/// This represents a slice of a PHI's incoming value, which is made up of:
1969/// - The type of the slice (Ty)
1970/// - The index in the incoming value's vector where the slice starts (Idx)
1971/// - The number of elements in the slice (NumElts).
1972/// It also keeps track of the NewPHI node inserted for this particular slice.
1973///
1974/// Slice examples:
1975/// <4 x i64> -> Split into four i64 slices.
1976/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1977/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1978/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1979class VectorSlice {
1980public:
1981 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1982 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1983
1984 Type *Ty = nullptr;
1985 unsigned Idx = 0;
1986 unsigned NumElts = 0;
1987 PHINode *NewPHI = nullptr;
1988
1989 /// Slice \p Inc according to the information contained within this slice.
1990 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1991 /// pair, it returns the same Sliced value as well.
1992 ///
1993 /// Note this *intentionally* does not return the same value for, say,
1994 /// [%bb.0, %0] & [%bb.1, %0] as:
1995 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1996 /// the value in bb.1 may not be reachable from bb.0 if it's its
1997 /// predecessor.)
1998 /// - We also want to make our extract instructions as local as possible so
1999 /// the DAG has better chances of folding them out. Duplicating them like
2000 /// that is beneficial in that regard.
2001 ///
2002 /// This is not only a minor optimization to avoid creating duplicate
2003 /// instructions, but also a requirement for correctness. It is not forbidden
2004 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
2005 /// returned a new value each time, those previously identical pairs would all
2006 /// have different incoming values (from the same block) and it'd cause a "PHI
2007 /// node has multiple entries for the same basic block with different incoming
2008 /// values!" verifier error.
2009 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
2010 Value *&Res = SlicedVals[{BB, Inc}];
2011 if (Res)
2012 return Res;
2013
2014 IRBuilder<> B(BB->getTerminator());
2015 if (Instruction *IncInst = dyn_cast<Instruction>(Val: Inc))
2016 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
2017
2018 if (NumElts > 1) {
2019 SmallVector<int, 4> Mask;
2020 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
2021 Mask.push_back(Elt: K);
2022 Res = B.CreateShuffleVector(V: Inc, Mask, Name: NewValName);
2023 } else
2024 Res = B.CreateExtractElement(Vec: Inc, Idx, Name: NewValName);
2025
2026 return Res;
2027 }
2028
2029private:
2030 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
2031};
2032
2033bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
2034 // Break-up fixed-vector PHIs into smaller pieces.
2035 // Default threshold is 32, so it breaks up any vector that's >32 bits into
2036 // its elements, or into 32-bit pieces (for 8/16 bit elts).
2037 //
2038 // This is only helpful for DAGISel because it doesn't handle large PHIs as
2039 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
2040 // With large, odd-sized PHIs we may end up needing many `build_vector`
2041 // operations with most elements being "undef". This inhibits a lot of
2042 // optimization opportunities and can result in unreasonably high register
2043 // pressure and the inevitable stack spilling.
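// As a sketch (block/value names invented), a PHI such as
//   %phi = phi <8 x i16> [ %a, %bb0 ], [ %b, %bb1 ]
// becomes four <2 x i16> PHIs like
//   %phi.s0 = phi <2 x i16> [ %a.s0, %bb0 ], [ %b.s0, %bb1 ]
// where %a.s0/%b.s0 are shufflevector slices emitted in the predecessors, and
// the slice PHIs are then reassembled into an <8 x i16> value that replaces
// the original PHI.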
2044 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
2045 return false;
2046
2047 FixedVectorType *FVT = dyn_cast<FixedVectorType>(Val: I.getType());
2048 if (!FVT || FVT->getNumElements() == 1 ||
2049 DL.getTypeSizeInBits(Ty: FVT) <= BreakLargePHIsThreshold)
2050 return false;
2051
2052 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
2053 return false;
2054
2055 std::vector<VectorSlice> Slices;
2056
2057 Type *EltTy = FVT->getElementType();
2058 {
2059 unsigned Idx = 0;
2060 // For 8/16-bit element types, don't scalarize fully but break it up into as many
2061 // 32-bit slices as we can, and scalarize the tail.
2062 const unsigned EltSize = DL.getTypeSizeInBits(Ty: EltTy);
2063 const unsigned NumElts = FVT->getNumElements();
2064 if (EltSize == 8 || EltSize == 16) {
2065 const unsigned SubVecSize = (32 / EltSize);
2066 Type *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: SubVecSize);
2067 for (unsigned End = alignDown(Value: NumElts, Align: SubVecSize); Idx < End;
2068 Idx += SubVecSize)
2069 Slices.emplace_back(args&: SubVecTy, args&: Idx, args: SubVecSize);
2070 }
2071
2072 // Scalarize all remaining elements.
2073 for (; Idx < NumElts; ++Idx)
2074 Slices.emplace_back(args&: EltTy, args&: Idx, args: 1);
2075 }
2076
2077 assert(Slices.size() > 1);
2078
2079 // Create one PHI per vector piece. The "VectorSlice" class takes care of
2080 // creating the necessary instruction to extract the relevant slices of each
2081 // incoming value.
2082 IRBuilder<> B(I.getParent());
2083 B.SetCurrentDebugLocation(I.getDebugLoc());
2084
2085 unsigned IncNameSuffix = 0;
2086 for (VectorSlice &S : Slices) {
2087 // We need to reset the builder's insertion point on each iteration, because
2088 // getSlicedVal may have inserted something into I's BB.
2089 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
2090 S.NewPHI = B.CreatePHI(Ty: S.Ty, NumReservedValues: I.getNumIncomingValues());
2091
2092 for (const auto &[Idx, BB] : enumerate(First: I.blocks())) {
2093 S.NewPHI->addIncoming(V: S.getSlicedVal(BB, Inc: I.getIncomingValue(i: Idx),
2094 NewValName: "largephi.extractslice" +
2095 std::to_string(val: IncNameSuffix++)),
2096 BB);
2097 }
2098 }
2099
2100 // And replace this PHI with a vector of all the previous PHI values.
2101 Value *Vec = PoisonValue::get(T: FVT);
2102 unsigned NameSuffix = 0;
2103 for (VectorSlice &S : Slices) {
2104 const auto ValName = "largephi.insertslice" + std::to_string(val: NameSuffix++);
2105 if (S.NumElts > 1)
2106 Vec = B.CreateInsertVector(DstType: FVT, SrcVec: Vec, SubVec: S.NewPHI, Idx: S.Idx, Name: ValName);
2107 else
2108 Vec = B.CreateInsertElement(Vec, NewElt: S.NewPHI, Idx: S.Idx, Name: ValName);
2109 }
2110
2111 I.replaceAllUsesWith(V: Vec);
2112 I.eraseFromParent();
2113 return true;
2114}
2115
2116/// \param V Value to check
2117/// \param DL DataLayout
2118/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
2119/// \param AS Target Address Space
2120/// \return true if \p V cannot be the null value of \p AS, false otherwise.
2121static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
2122 const AMDGPUTargetMachine &TM, unsigned AS) {
2123 // Pointer cannot be null if it's a block address, GV or alloca.
2124 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
2125 // it as the symbol could be null in such cases.
2126 if (isa<BlockAddress, GlobalValue, AllocaInst>(Val: V))
2127 return true;
2128
2129 // Check nonnull arguments.
2130 if (const auto *Arg = dyn_cast<Argument>(Val: V); Arg && Arg->hasNonNullAttr())
2131 return true;
2132
2133 // Check nonnull loads.
2134 if (const auto *Load = dyn_cast<LoadInst>(Val: V);
2135 Load && Load->hasMetadata(KindID: LLVMContext::MD_nonnull))
2136 return true;
2137
2138 // getUnderlyingObject may have looked through another addrspacecast, although
2139 // the optimizable situations have most likely been folded out by now.
2140 if (AS != cast<PointerType>(Val: V->getType())->getAddressSpace())
2141 return false;
2142
2143 // TODO: Calls that return nonnull?
2144
2145 // For all other things, use KnownBits.
2146 // We either use 0 or all bits set to indicate null, so check whether the
2147 // value can be zero or all ones.
2148 //
2149 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2150 // address spaces have non-zero null values.
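// For AMDGPU, the local and private address spaces use an all-ones (-1) null
// value, while flat and global use zero.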
2151 auto SrcPtrKB = computeKnownBits(V, DL);
2152 const auto NullVal = TM.getNullPointerValue(AddrSpace: AS);
2153
2154 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2155 assert((NullVal == 0 || NullVal == -1) &&
2156 "don't know how to check for this null value!");
2157 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2158}
2159
2160bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2161 // The intrinsic doesn't support vectors. It also seems difficult to prove that
2162 // a vector cannot contain any null pointers, so it's unclear whether this is
2163 // worth supporting.
2164 if (I.getType()->isVectorTy())
2165 return false;
2166
2167 // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
2168 // This is only worthwhile for casts from/to priv/local to flat.
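// For illustration (modulo the exact overloaded-type suffixes), a cast like
//   %flat = addrspacecast ptr addrspace(5) %p to ptr
// becomes
//   %flat = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %p)
// once every underlying object of %p is known to be non-null.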
2169 const unsigned SrcAS = I.getSrcAddressSpace();
2170 const unsigned DstAS = I.getDestAddressSpace();
2171
2172 bool CanLower = false;
2173 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2174 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2175 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2176 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2177 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2178 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2179 if (!CanLower)
2180 return false;
2181
2182 SmallVector<const Value *, 4> WorkList;
2183 getUnderlyingObjects(V: I.getOperand(i_nocapture: 0), Objects&: WorkList);
2184 if (!all_of(Range&: WorkList, P: [&](const Value *V) {
2185 return isPtrKnownNeverNull(V, DL, TM, AS: SrcAS);
2186 }))
2187 return false;
2188
2189 IRBuilder<> B(&I);
2190 auto *Intrin = B.CreateIntrinsic(
2191 RetTy: I.getType(), ID: Intrinsic::amdgcn_addrspacecast_nonnull, Args: {I.getOperand(i_nocapture: 0)});
2192 I.replaceAllUsesWith(V: Intrin);
2193 I.eraseFromParent();
2194 return true;
2195}
2196
2197bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2198 switch (I.getIntrinsicID()) {
2199 case Intrinsic::bitreverse:
2200 return visitBitreverseIntrinsicInst(I);
2201 case Intrinsic::minnum:
2202 case Intrinsic::minimumnum:
2203 case Intrinsic::minimum:
2204 return visitFMinLike(I);
2205 case Intrinsic::sqrt:
2206 return visitSqrt(I);
2207 default:
2208 return false;
2209 }
2210}
2211
2212bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
2213 bool Changed = false;
2214
2215 if (ST.has16BitInsts() && needsPromotionToI32(T: I.getType()) &&
2216 UA.isUniform(I: &I))
2217 Changed |= promoteUniformBitreverseToI32(I);
2218
2219 return Changed;
2220}
2221
2222/// Match non-nan fract pattern.
2223/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2224/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2225/// minimum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2226///
2227 /// Only matches if fract is a useful instruction for the subtarget. Does not
2228 /// account for nan handling; the instruction has a nan check on the input value.
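///
/// An illustrative match for f32 (value names invented):
///   %floor = call float @llvm.floor.f32(float %x)
///   %sub = fsub float %x, %floor
///   %min = call float @llvm.minnum.f32(float %sub, float 0x3FEFFFFFE0000000)
/// where 0x3FEFFFFFE0000000 is 0x1.fffffep-1, the largest float below 1.0.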
2229Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
2230 if (ST.hasFractBug())
2231 return nullptr;
2232
2233 Intrinsic::ID IID = I.getIntrinsicID();
2234
2235 // The value is only used in contexts where we know the input isn't a nan, so
2236 // any of the fmin variants are fine.
2237 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
2238 IID != Intrinsic::minimumnum)
2239 return nullptr;
2240
2241 Type *Ty = I.getType();
2242 if (!isLegalFloatingTy(Ty: Ty->getScalarType()))
2243 return nullptr;
2244
2245 Value *Arg0 = I.getArgOperand(i: 0);
2246 Value *Arg1 = I.getArgOperand(i: 1);
2247
2248 const APFloat *C;
2249 if (!match(V: Arg1, P: m_APFloat(Res&: C)))
2250 return nullptr;
2251
2252 APFloat One(1.0);
2253 bool LosesInfo;
2254 One.convert(ToSemantics: C->getSemantics(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
2255
2256 // Match nextafter(1.0, -1.0)
2257 One.next(nextDown: true);
2258 if (One != *C)
2259 return nullptr;
2260
2261 Value *FloorSrc;
2262 if (match(V: Arg0, P: m_FSub(L: m_Value(V&: FloorSrc),
2263 R: m_Intrinsic<Intrinsic::floor>(Op0: m_Deferred(V: FloorSrc)))))
2264 return FloorSrc;
2265 return nullptr;
2266}
2267
2268Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2269 Value *FractArg) {
2270 SmallVector<Value *, 4> FractVals;
2271 extractValues(Builder, Values&: FractVals, V: FractArg);
2272
2273 SmallVector<Value *, 4> ResultVals(FractVals.size());
2274
2275 Type *Ty = FractArg->getType()->getScalarType();
2276 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2277 ResultVals[I] =
2278 Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_fract, Types: {Ty}, Args: {FractVals[I]});
2279 }
2280
2281 return insertValues(Builder, Ty: FractArg->getType(), Values&: ResultVals);
2282}
2283
2284bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2285 Value *FractArg = matchFractPat(I);
2286 if (!FractArg)
2287 return false;
2288
2289 // Match pattern for fract intrinsic in contexts where the nan check has been
2290 // optimized out (and hope the knowledge that the source can't be nan wasn't lost).
2291 if (!I.hasNoNaNs() && !isKnownNeverNaN(V: FractArg, SQ: SimplifyQuery(DL, TLI)))
2292 return false;
2293
2294 IRBuilder<> Builder(&I);
2295 FastMathFlags FMF = I.getFastMathFlags();
2296 FMF.setNoNaNs();
2297 Builder.setFastMathFlags(FMF);
2298
2299 Value *Fract = applyFractPat(Builder, FractArg);
2300 Fract->takeName(V: &I);
2301 I.replaceAllUsesWith(V: Fract);
2302
2303 RecursivelyDeleteTriviallyDeadInstructions(V: &I, TLI);
2304 return true;
2305}
2306
2307static bool isOneOrNegOne(const Value *Val) {
2308 const APFloat *C;
2309 return match(V: Val, P: m_APFloat(Res&: C)) && C->getExactLog2Abs() == 0;
2310}
2311
2312// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
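// For example (metadata id invented), a call such as
//   %r = call float @llvm.sqrt.f32(float %x), !fpmath !0
//   !0 = !{float 2.500000e+00}
// requests 2.5 ulp of accuracy, which is loose enough for the expansion below
// rather than the correctly rounded lowering.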
2313bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2314 Type *Ty = Sqrt.getType()->getScalarType();
2315 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2316 return false;
2317
2318 const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &Sqrt);
2319 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2320
2321 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2322 // of fast llvm.sqrt will give the raw instruction anyway.
2323 if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
2324 return false;
2325
2326 const float ReqdAccuracy = FPOp->getFPAccuracy();
2327
2328 // Defer correctly rounded expansion to codegen.
2329 if (ReqdAccuracy < 1.0f)
2330 return false;
2331
2332 // FIXME: This is an ugly hack for this pass using forward iteration instead
2333 // of reverse. If it worked like a normal combiner, the rsq would form before
2334 // we saw a sqrt call.
2335 auto *FDiv =
2336 dyn_cast_or_null<FPMathOperator>(Val: Sqrt.getUniqueUndroppableUser());
2337 if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
2338 FDiv->getFPAccuracy() >= 1.0f &&
2339 canOptimizeWithRsq(SqrtOp: FPOp, DivFMF: FDiv->getFastMathFlags(), SqrtFMF) &&
2340 // TODO: We should also handle the arcp case for the fdiv with non-1 value
2341 isOneOrNegOne(Val: FDiv->getOperand(i: 0)))
2342 return false;
2343
2344 Value *SrcVal = Sqrt.getOperand(i_nocapture: 0);
2345 bool CanTreatAsDAZ = canIgnoreDenormalInput(V: SrcVal, CtxI: &Sqrt);
2346
2347 // The raw instruction is 1 ulp, but the correction for denormal handling
2348 // brings it to 2.
2349 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2350 return false;
2351
2352 IRBuilder<> Builder(&Sqrt);
2353 SmallVector<Value *, 4> SrcVals;
2354 extractValues(Builder, Values&: SrcVals, V: SrcVal);
2355
2356 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2357 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2358 if (CanTreatAsDAZ)
2359 ResultVals[I] = Builder.CreateCall(Callee: getSqrtF32(), Args: SrcVals[I]);
2360 else
2361 ResultVals[I] = emitSqrtIEEE2ULP(Builder, Src: SrcVals[I], FMF: SqrtFMF);
2362 }
2363
2364 Value *NewSqrt = insertValues(Builder, Ty: Sqrt.getType(), Values&: ResultVals);
2365 NewSqrt->takeName(V: &Sqrt);
2366 Sqrt.replaceAllUsesWith(V: NewSqrt);
2367 Sqrt.eraseFromParent();
2368 return true;
2369}
2370
2371bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2372 if (skipFunction(F))
2373 return false;
2374
2375 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2376 if (!TPC)
2377 return false;
2378
2379 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2380 const TargetLibraryInfo *TLI =
2381 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2382 AssumptionCache *AC =
2383 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2384 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2385 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2386 const UniformityInfo &UA =
2387 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2388 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2389}
2390
2391PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2392 FunctionAnalysisManager &FAM) {
2393 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2394 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(IR&: F);
2395 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(IR&: F);
2396 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F);
2397 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(IR&: F);
2398 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2399 if (!Impl.run())
2400 return PreservedAnalyses::all();
2401 PreservedAnalyses PA = PreservedAnalyses::none();
2402 if (!Impl.FlowChanged)
2403 PA.preserveSet<CFGAnalyses>();
2404 return PA;
2405}
2406
2407INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2408 "AMDGPU IR optimizations", false, false)
2409INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2410INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2411INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2412INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2413 false, false)
2414
2415char AMDGPUCodeGenPrepare::ID = 0;
2416
2417FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2418 return new AMDGPUCodeGenPrepare();
2419}
2420