1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
17#include "SIModeRegisterDefaults.h"
18#include "llvm/ADT/SetVector.h"
19#include "llvm/Analysis/AssumptionCache.h"
20#include "llvm/Analysis/ConstantFolding.h"
21#include "llvm/Analysis/TargetLibraryInfo.h"
22#include "llvm/Analysis/TargetTransformInfo.h"
23#include "llvm/Analysis/UniformityAnalysis.h"
24#include "llvm/Analysis/ValueTracking.h"
25#include "llvm/CodeGen/TargetPassConfig.h"
26#include "llvm/IR/Dominators.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/InstVisitor.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include "llvm/IR/PatternMatch.h"
31#include "llvm/IR/ValueHandle.h"
32#include "llvm/InitializePasses.h"
33#include "llvm/Pass.h"
34#include "llvm/Support/KnownBits.h"
35#include "llvm/Support/KnownFPClass.h"
36#include "llvm/Transforms/Utils/BasicBlockUtils.h"
37#include "llvm/Transforms/Utils/IntegerDivision.h"
38#include "llvm/Transforms/Utils/Local.h"
39
40#define DEBUG_TYPE "amdgpu-codegenprepare"
41
42using namespace llvm;
43using namespace llvm::PatternMatch;
44
45namespace {
46
// Widen sub-dword uniform loads from constant address space to 32 bits so a
// scalar load can be used (off by default; see canWidenScalarExtLoad).
static cl::opt<bool> WidenLoads(
    "amdgpu-codegenprepare-widen-constant-loads",
    cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden,
    cl::init(Val: false));

// Break PHIs of large types into smaller pieces to help DAGISel.
static cl::opt<bool>
    BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                   cl::desc("Break large PHI nodes for DAGISel"),
                   cl::ReallyHidden, cl::init(Val: true));

// Testing knob: bypass the profitability check and always break large PHIs.
static cl::opt<bool>
    ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
                        cl::desc("For testing purposes, always break large "
                                 "PHIs even if it isn't profitable."),
                        cl::ReallyHidden, cl::init(Val: false));

// A PHI is considered "large" when its type is at least this many bits.
static cl::opt<unsigned> BreakLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
    cl::ReallyHidden, cl::init(Val: 32));

// Enable replacing 24-bit-or-narrower multiplies with mul24 intrinsics
// (see replaceMulWithMul24).
static cl::opt<bool> UseMul24Intrin(
    "amdgpu-codegenprepare-mul24",
    cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden,
    cl::init(Val: true));

// Legalize 64-bit division by using the generic IR expansion.
static cl::opt<bool> ExpandDiv64InIR(
    "amdgpu-codegenprepare-expand-div64",
    cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden,
    cl::init(Val: false));

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
    "amdgpu-codegenprepare-disable-idiv-expansion",
    cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden,
    cl::init(Val: false));

// Disable processing of fdiv so we can better test the backend implementations.
static cl::opt<bool> DisableFDivExpand(
    "amdgpu-codegenprepare-disable-fdiv-expansion",
    cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden,
    cl::init(Val: false));
96
/// Implementation of the IR rewrites performed by this pass: fdiv/sqrt/rsq
/// expansions, mul24 formation, integer division expansion, load widening,
/// and large-PHI breaking. Dispatched via InstVisitor; each visit* returns
/// true if it changed the IR.
class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
  Function &F;
  const GCNSubtarget &ST;
  const AMDGPUTargetMachine &TM;
  const TargetLibraryInfo *TLI;
  const UniformityInfo &UA;
  const DataLayout &DL;
  SimplifyQuery SQ;
  // True if the function's FP32 denormal mode flushes denormals
  // (preserve-sign), which lets some denormal-scaling fixups be skipped.
  const bool HasFP32DenormalFlush;
  bool FlowChanged = false;
  // Lazily created declarations; see getSqrtF32() / getLdexpF32().
  mutable Function *SqrtF32 = nullptr;
  mutable Function *LdexpF32 = nullptr;
  // Values replaced during the traversal; cleaned up in run() once visiting
  // is done. WeakVH entries tolerate being deleted out from under us.
  mutable SmallVector<WeakVH> DeadVals;

  // Memoized answers for canBreakPHINode profitability queries.
  DenseMap<const PHINode *, bool> BreakPhiNodesCache;

  AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
                           const TargetLibraryInfo *TLI, AssumptionCache *AC,
                           const DominatorTree *DT, const UniformityInfo &UA)
      : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
        DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
        HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
                             DenormalMode::getPreserveSign()) {}

  /// Get (creating the declaration on first use) llvm.amdgcn.sqrt.f32.
  Function *getSqrtF32() const {
    if (SqrtF32)
      return SqrtF32;

    LLVMContext &Ctx = F.getContext();
    SqrtF32 = Intrinsic::getOrInsertDeclaration(
        M: F.getParent(), id: Intrinsic::amdgcn_sqrt, OverloadTys: {Type::getFloatTy(C&: Ctx)});
    return SqrtF32;
  }

  /// Get (creating the declaration on first use) llvm.ldexp.f32.i32.
  Function *getLdexpF32() const {
    if (LdexpF32)
      return LdexpF32;

    LLVMContext &Ctx = F.getContext();
    LdexpF32 = Intrinsic::getOrInsertDeclaration(
        M: F.getParent(), id: Intrinsic::ldexp,
        OverloadTys: {Type::getFloatTy(C&: Ctx), Type::getInt32Ty(C&: Ctx)});
    return LdexpF32;
  }

  /// Return true if breaking \p I into per-element PHIs is profitable
  /// (memoized in BreakPhiNodesCache).
  bool canBreakPHINode(const PHINode &I);

  /// Return true if \p T is a legal scalar floating point type.
  bool isLegalFloatingTy(const Type *T) const;

  /// Wrapper to pass all the arguments to computeKnownFPClass
  KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
                                   const Instruction *CtxI) const {
    return llvm::computeKnownFPClass(V, InterestedClasses: Interested,
                                     SQ: SQ.getWithInstruction(I: CtxI));
  }

  /// True if a denormal value of \p V can be treated as if denormals were
  /// flushed: either the mode already flushes FP32 denormals, or \p V is
  /// known to never be subnormal.
  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||
           computeKnownFPClass(V, Interested: fcSubnormal, CtxI).isKnownNeverSubnormal();
  }

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// an unsigned integer. Truncating to this size and then zero-extending to
  /// the original will not change the value.
  unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// a signed integer. Truncating to this size and then sign-extending to
  /// the original size will not change the value.
  unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
  /// SelectionDAG has an issue where an and asserting the bits are known
  // NOTE(review): the sentence above is truncated in the original source —
  // presumably it refers to SelectionDAG failing to exploit known bits from
  // an 'and'; confirm against upstream history.
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Perform same function as equivalently named function in DAGCombiner. Since
  /// we expand some divisions here, we need to perform this before obscuring.
  bool foldBinOpIntoSelect(BinaryOperator &I) const;

  /// True if the div/rem can be simplified elsewhere (so expansion here
  /// should be skipped).
  bool divHasSpecialOptimization(BinaryOperator &I,
                                 Value *Num, Value *Den) const;
  /// Number of significant bits the division actually needs, capped at
  /// \p MaxDivBits.
  unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
                         unsigned MaxDivBits, bool Signed) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned NumBits,
                            bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  /// Try to reduce a 64-bit div/rem to a narrower expansion; returns null if
  /// not possible.
  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  void expandDivRem64(BinaryOperator &I) const;

  /// Check whether a scalar load can be widened.
  ///
  /// \details Widening replaces a uniform, small-type load from constant
  /// memory with a full 32-bit load plus truncate, to allow a scalar
  /// load instead of a vector load.
  ///
  /// \returns True if \p I is eligible for widening.
  bool canWidenScalarExtLoad(LoadInst &I) const;

  /// If \p V matches the fract idiom, return the fract argument, else null.
  Value *matchFractPat(Value &V);
  Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);

  bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;

  Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                         const Instruction *CtxI) const;

  Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags FMF, const Instruction *CtxI) const;
  Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
                              float ReqdAccuracy) const;

  Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                          Value *RsqOp, const Instruction *FDiv,
                          float ReqdAccuracy) const;

  /// Return {mantissa, exponent} of frexp(\p Src); see implementation for the
  /// fract-bug workaround on the exponent.
  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
                                              Value *Src) const;

  Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                         bool IsNegative) const;
  Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                      FastMathFlags FMF) const;
  Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
                          FastMathFlags FMF) const;
  Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
                    FastMathFlags DivFMF, const Instruction *CtxI,
                    bool IsNegative) const;

  CallInst *createWorkitemIdX(IRBuilder<> &B) const;
  void replaceWithWorkitemIdX(Instruction &I) const;
  void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
  bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;

  bool tryNarrowMathIfNoOverflow(Instruction *I);

public:
  bool visitFDiv(BinaryOperator &I);

  // Default visitor: no change.
  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitSelectInst(SelectInst &I);
  bool visitPHINode(PHINode &I);
  bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitFMinLike(IntrinsicInst &I);
  bool visitSqrt(IntrinsicInst &I);
  bool visitLog(FPMathOperator &Log, Intrinsic::ID IID);
  bool visitMbcntLo(IntrinsicInst &I) const;
  bool visitMbcntHi(IntrinsicInst &I) const;
  /// Run the rewrites over the whole function; returns true on any change.
  bool run();
};
268
269class AMDGPUCodeGenPrepare : public FunctionPass {
270public:
271 static char ID;
272 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
273 void getAnalysisUsage(AnalysisUsage &AU) const override {
274 AU.addRequired<AssumptionCacheTracker>();
275 AU.addRequired<UniformityInfoWrapperPass>();
276 AU.addRequired<TargetLibraryInfoWrapperPass>();
277
278 // FIXME: Division expansion needs to preserve the dominator tree.
279 if (!ExpandDiv64InIR)
280 AU.setPreservesAll();
281 }
282 bool runOnFunction(Function &F) override;
283 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
284};
285
286} // end anonymous namespace
287
// Visit every instruction in the function and apply the rewrites; returns
// true if any change was made.
bool AMDGPUCodeGenPrepareImpl::run() {
  BreakPhiNodesCache.clear();
  bool MadeChange = false;

  // Need to use make_early_inc_range because integer division expansion is
  // handled by Transform/Utils, and it can delete instructions such as the
  // terminator of the BB.
  for (BasicBlock &BB : reverse(C&: F)) {
    for (Instruction &I : make_early_inc_range(Range: reverse(C&: BB))) {
      // Skip already-dead instructions; anything that becomes dead during
      // the visit is collected in DeadVals and deleted below.
      if (!isInstructionTriviallyDead(I: &I, TLI))
        MadeChange |= visit(I);
    }
  }

  // DeadVals holds WeakVHs, so entries may already have been deleted; only
  // delete what is still a live, trivially dead instruction.
  while (!DeadVals.empty()) {
    if (auto *I = dyn_cast_or_null<Instruction>(Val: DeadVals.pop_back_val()))
      RecursivelyDeleteTriviallyDeadInstructions(V: I, TLI);
  }

  return MadeChange;
}
309
310bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
311 return Ty->isFloatTy() || Ty->isDoubleTy() ||
312 (Ty->isHalfTy() && ST.has16BitInsts());
313}
314
// A load is eligible for widening to 32 bits when it is simple (non-atomic,
// non-volatile), narrower than a dword, at least dword-aligned, and uniform.
bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(Alignment: I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(I: &I);
}
322
// Minimum bit width that holds Op's value as an unsigned integer, derived
// from known-bits analysis at the context instruction.
unsigned
AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
                                          const Instruction *CtxI) const {
  return computeKnownBits(V: Op, Q: SQ.getWithInstruction(I: CtxI)).countMaxActiveBits();
}
328
// Minimum bit width that holds Op's value as a signed integer.
unsigned
AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
                                        const Instruction *CtxI) const {
  return ComputeMaxSignificantBits(Op, DL: SQ.DL, AC: SQ.AC, CxtI: CtxI, DT: SQ.DT);
}
334
335static void extractValues(IRBuilder<> &Builder,
336 SmallVectorImpl<Value *> &Values, Value *V) {
337 auto *VT = dyn_cast<FixedVectorType>(Val: V->getType());
338 if (!VT) {
339 Values.push_back(Elt: V);
340 return;
341 }
342
343 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
344 Values.push_back(Elt: Builder.CreateExtractElement(Vec: V, Idx: I));
345}
346
347static Value *insertValues(IRBuilder<> &Builder,
348 Type *Ty,
349 SmallVectorImpl<Value *> &Values) {
350 if (!Ty->isVectorTy()) {
351 assert(Values.size() == 1);
352 return Values[0];
353 }
354
355 Value *NewVal = PoisonValue::get(T: Ty);
356 for (int I = 0, E = Values.size(); I != E; ++I)
357 NewVal = Builder.CreateInsertElement(Vec: NewVal, NewElt: Values[I], Idx: I);
358
359 return NewVal;
360}
361
// Replace a divergent integer multiply whose operands provably fit in 24 bits
// with llvm.amdgcn.mul.u24 / llvm.amdgcn.mul.i24. Vectors are scalarized,
// the intrinsic applied per element, and the result reassembled.
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  // Leave 16-bit (and narrower) multiplies to the native 16-bit instructions.
  if (Size <= 16 && ST.has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (UA.isUniform(I: &I))
    return false;

  Value *LHS = I.getOperand(i_nocapture: 0);
  Value *RHS = I.getOperand(i_nocapture: 1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  // Both operands must be representable in 24 bits: zero-extendable for the
  // unsigned form, sign-extendable for the signed form. The (LHSBits = ...)
  // assignments inside the conditions deliberately cache the widths.
  if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(Op: LHS, CtxI: &I)) <= 24 &&
      (RHSBits = numBitsUnsigned(Op: RHS, CtxI: &I)) <= 24) {
    IsSigned = false;

  } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(Op: LHS, CtxI: &I)) <= 24 &&
             (RHSBits = numBitsSigned(Op: RHS, CtxI: &I)) <= 24) {
    IsSigned = true;

  } else
    return false;

  // Scalarize vector operands; the intrinsic is emitted per element.
  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, Values&: LHSVals, V: LHS);
  extractValues(Builder, Values&: RHSVals, V: RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  // Use the 64-bit intrinsic overload when the original scalar type is wider
  // than 32 bits.
  IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
  Type *DstTy = LHSVals[0]->getType();

  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    // Operands are passed to the intrinsic as i32, extended/truncated
    // according to signedness; the result is converted back to the
    // original element type the same way.
    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(V: LHSVals[I], DestTy: I32Ty)
                          : Builder.CreateZExtOrTrunc(V: LHSVals[I], DestTy: I32Ty);
    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(V: RHSVals[I], DestTy: I32Ty)
                          : Builder.CreateZExtOrTrunc(V: RHSVals[I], DestTy: I32Ty);
    Intrinsic::ID ID =
        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
    Value *Result = Builder.CreateIntrinsic(ID, Types: {IntrinTy}, Args: {LHS, RHS});
    Result = IsSigned ? Builder.CreateSExtOrTrunc(V: Result, DestTy: DstTy)
                      : Builder.CreateZExtOrTrunc(V: Result, DestTy: DstTy);
    ResultVals.push_back(Elt: Result);
  }

  Value *NewVal = insertValues(Builder, Ty, Values&: ResultVals);
  NewVal->takeName(V: &I);
  I.replaceAllUsesWith(V: NewVal);
  DeadVals.push_back(Elt: &I);

  return true;
}
424
425// Find a select instruction, which may have been casted. This is mostly to deal
426// with cases where i16 selects were promoted here to i32.
427static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
428 Cast = nullptr;
429 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: V))
430 return Sel;
431
432 if ((Cast = dyn_cast<CastInst>(Val: V))) {
433 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: Cast->getOperand(i_nocapture: 0)))
434 return Sel;
435 }
436
437 return nullptr;
438}
439
// Fold `binop (select c, K1, K2), K3` (possibly with a cast between the
// select and the binop) into `select c, fold(K1,K3), fold(K2,K3)`, removing
// the binary operator entirely.
bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;

  CastInst *CastOp;

  // TODO: Should probably try to handle some cases with multiple
  // users. Duplicating the select may be profitable for division.
  SelectInst *Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 0), Cast&: CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    SelOpNo = 1;
    Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 1), Cast&: CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
    return false;

  // Both select arms and the other binop operand must be constants so the
  // two binop results can be folded at compile time.
  Constant *CT = dyn_cast<Constant>(Val: Sel->getTrueValue());
  Constant *CF = dyn_cast<Constant>(Val: Sel->getFalseValue());
  Constant *CBO = dyn_cast<Constant>(Val: BO.getOperand(i_nocapture: SelOpNo ^ 1));
  if (!CBO || !CT || !CF)
    return false;

  if (CastOp) {
    if (!CastOp->hasOneUse())
      return false;
    // Push the cast into the select arms before folding the binop.
    CT = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CT, DestTy: BO.getType(), DL);
    CF = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CF, DestTy: BO.getType(), DL);
  }

  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  // SelOpNo encodes which side of the binop the select was on, so the
  // constants are folded in the correct operand order.
  Constant *FoldedT =
      SelOpNo ? ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CT, DL)
              : ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CT, RHS: CBO, DL);
  if (!FoldedT || isa<ConstantExpr>(Val: FoldedT))
    return false;

  Constant *FoldedF =
      SelOpNo ? ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CF, DL)
              : ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CF, RHS: CBO, DL);
  if (!FoldedF || isa<ConstantExpr>(Val: FoldedF))
    return false;

  IRBuilder<> Builder(&BO);
  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
  // Preserve the binop's FP flags on the new select.
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Val: &BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect = Builder.CreateSelect(C: Sel->getCondition(),
                                          True: FoldedT, False: FoldedF);
  NewSelect->takeName(V: &BO);
  BO.replaceAllUsesWith(V: NewSelect);
  DeadVals.push_back(Elt: &BO);
  if (CastOp)
    DeadVals.push_back(Elt: CastOp);
  DeadVals.push_back(Elt: Sel);
  return true;
}
500
// Emit llvm.frexp on \p Src and return {mantissa, exponent}. On subtargets
// with the fract bug, the exponent is recomputed with
// llvm.amdgcn.frexp.exp so it bypasses the workaround applied to frexp.
std::pair<Value *, Value *>
AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
                                          Value *Src) const {
  Type *Ty = Src->getType();
  Value *Frexp = Builder.CreateIntrinsic(ID: Intrinsic::frexp,
                                         Types: {Ty, Builder.getInt32Ty()}, Args: Src);
  Value *FrexpMant = Builder.CreateExtractValue(Agg: Frexp, Idxs: {0});

  // Bypass the bug workaround for the exponent result since it doesn't matter.
  // TODO: Does the bug workaround even really need to consider the exponent
  // result? It's unspecified by the spec.

  Value *FrexpExp =
      ST.hasFractBug()
          ? Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_frexp_exp,
                                    Types: {Builder.getInt32Ty(), Ty}, Args: Src)
          : Builder.CreateExtractValue(Agg: Frexp, Idxs: {1});
  return {FrexpMant, FrexpExp};
}
520
/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
                                                 Value *Src,
                                                 bool IsNegative) const {
  // Same as for 1.0, but expand the sign out of the constant.
  // -1.0 / x -> rcp (fneg x)
  if (IsNegative)
    Src = Builder.CreateFNeg(V: Src);

  // The rcp instruction doesn't support denormals, so scale the input
  // out of the denormal range and convert at the end.
  //
  // Expand as 2^-n * (1.0 / (x * 2^n))

  // TODO: Skip scaling if input is known never denormal and the input
  // range won't underflow to denormal. The hard part is knowing the
  // result. We need a range check, the result could be denormal for
  // 0x1p+126 < den <= 0x1p+127.
  auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
  // rcp of the mantissa, then undo the frexp scaling with ldexp(..., -exp).
  Value *ScaleFactor = Builder.CreateNeg(V: FrexpExp);
  Value *Rcp = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: FrexpMant);
  return Builder.CreateCall(Callee: getLdexpF32(), Args: {Rcp, ScaleFactor});
}
544
/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
                                              Value *RHS,
                                              FastMathFlags FMF) const {
  // If we have to work around the fract/frexp bug, we're worse off than
  // using the fdiv.fast expansion. The full safe expansion is faster if we have
  // fast FMA.
  if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
      (!FMF.noNaNs() || !FMF.noInfs()))
    return nullptr;

  // We're scaling the LHS to avoid a denormal input, and scale the denominator
  // to avoid large values underflowing the result.
  auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, Src: RHS);

  Value *Rcp =
      Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: FrexpMantRHS);

  auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, Src: LHS);
  Value *Mul = Builder.CreateFMul(L: FrexpMantLHS, R: Rcp);

  // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
  // result.
  Value *ExpDiff = Builder.CreateSub(LHS: FrexpExpLHS, RHS: FrexpExpRHS);
  return Builder.CreateCall(Callee: getLdexpF32(), Args: {Mul, ExpDiff});
}
571
/// Emit a sqrt that handles denormals and is accurate to 2ulp.
Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
                                                  Value *Src,
                                                  FastMathFlags FMF) const {
  Type *Ty = Src->getType();
  APFloat SmallestNormal =
      APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
  // Inputs below the smallest normal need to be scaled out of the denormal
  // range before the sqrt.
  Value *NeedScale =
      Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));

  ConstantInt *Zero = Builder.getInt32(C: 0);
  // Scale the input up by 2^32 when needed...
  Value *InputScaleFactor =
      Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: 32), False: Zero);

  Value *Scaled = Builder.CreateCall(Callee: getLdexpF32(), Args: {Src, InputScaleFactor});

  Value *Sqrt = Builder.CreateCall(Callee: getSqrtF32(), Args: Scaled);

  // ...and compensate with 2^-16 afterwards (sqrt halves the exponent).
  Value *OutputScaleFactor =
      Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: -16), False: Zero);
  return Builder.CreateCall(Callee: getLdexpF32(), Args: {Sqrt, OutputScaleFactor});
}
594
/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
/// \p IsNegative folds the -1.0 numerator's sign into the output scale.
static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                              bool IsNegative) {
  // bool need_scale = x < 0x1p-126f;
  // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
  // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
  // rsq(x * input_scale) * output_scale;

  Type *Ty = Src->getType();
  APFloat SmallestNormal =
      APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
  Value *NeedScale =
      Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
  Constant *One = ConstantFP::get(Ty, V: 1.0);
  Constant *InputScale = ConstantFP::get(Ty, V: 0x1.0p+24);
  // Output compensation is the inverse square root of the input scale
  // (2^12 = sqrt(2^24)), negated when the original numerator was -1.0.
  Constant *OutputScale =
      ConstantFP::get(Ty, V: IsNegative ? -0x1.0p+12 : 0x1.0p+12);

  Value *InputScaleFactor = Builder.CreateSelect(C: NeedScale, True: InputScale, False: One);

  Value *ScaledInput = Builder.CreateFMul(L: Src, R: InputScaleFactor);
  Value *Rsq = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: ScaledInput);
  Value *OutputScaleFactor = Builder.CreateSelect(
      C: NeedScale, True: OutputScale, False: IsNegative ? ConstantFP::get(Ty, V: -1.0) : One);

  return Builder.CreateFMul(L: Rsq, R: OutputScaleFactor);
}
622
/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
/// v_rsq_f64. This should give a 1ulp result.
Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
                                            FastMathFlags SqrtFMF,
                                            FastMathFlags DivFMF,
                                            const Instruction *CtxI,
                                            bool IsNegative) const {
  // rsq(x):
  //   double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
  //   double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
  //   return MATH_MAD(y0*e,  MATH_MAD(e, 0.375, 0.5), y0);
  //
  // -rsq(x):
  //   double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
  //   double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
  //   return MATH_MAD(-y0*e,  MATH_MAD(e, 0.375, 0.5), -y0);
  //
  // The rsq instruction handles the special cases correctly. We need to check
  // for the edge case conditions to ensure the special case propagates through
  // the later instructions.

  Value *Y0 = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: X);

  // Try to elide the edge case check.
  //
  // Fast math flags imply:
  // sqrt ninf => !isinf(x)
  // fdiv ninf => x != 0, !isinf(x)
  bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
  bool MaybeZero = !DivFMF.noInfs();

  DenormalMode DenormMode;
  FPClassTest Interested = fcNone;
  if (MaybePosInf)
    Interested = fcPosInf;
  if (MaybeZero)
    Interested |= fcZero;

  // Use value analysis to prove the edge cases impossible where the flags
  // alone could not.
  if (Interested != fcNone) {
    KnownFPClass KnownSrc = computeKnownFPClass(V: X, Interested, CtxI);
    if (KnownSrc.isKnownNeverPosInfinity())
      MaybePosInf = false;

    DenormMode = F.getDenormalMode(FPType: X->getType()->getFltSemantics());
    if (KnownSrc.isKnownNeverLogicalZero(Mode: DenormMode))
      MaybeZero = false;
  }

  // For the edge cases, feed y0 (which rsq already computed correctly) into
  // the correction sequence instead of x, so the special value survives.
  Value *SpecialOrRsq = X;
  if (MaybeZero || MaybePosInf) {
    Value *Cond;
    if (MaybePosInf && MaybeZero) {
      if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
        FPClassTest TestMask = fcPosInf | fcZero;
        // If inputs are flushed to zero, subnormals hit the zero edge case.
        if (DenormMode.inputsAreZero())
          TestMask |= fcSubnormal;

        Cond = Builder.createIsFPClass(FPNum: X, Test: TestMask);
      } else {
        // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
        // doesn't respect the floating-point environment.
        Value *IsZero =
            Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getZero(Ty: X->getType()));
        Value *IsInf =
            Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getInfinity(Ty: X->getType()));
        Cond = Builder.CreateOr(LHS: IsZero, RHS: IsInf);
      }
    } else if (MaybeZero) {
      Cond = Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getZero(Ty: X->getType()));
    } else {
      Cond = Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getInfinity(Ty: X->getType()));
    }

    SpecialOrRsq = Builder.CreateSelect(C: Cond, True: Y0, False: X);
  }

  // e = fma(-y0 * (special ? y0 : x), y0, 1.0)
  Value *NegY0 = Builder.CreateFNeg(V: Y0);
  Value *NegXY0 = Builder.CreateFMul(L: SpecialOrRsq, R: NegY0);

  // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
  Value *E = Builder.CreateFMA(Factor1: NegXY0, Factor2: Y0, Summand: ConstantFP::get(Ty: X->getType(), V: 1.0));

  // The -rsq variant folds its sign into the y0 uses.
  Value *Y0E = Builder.CreateFMul(L: E, R: IsNegative ? NegY0 : Y0);

  Value *EFMA = Builder.CreateFMA(Factor1: E, Factor2: ConstantFP::get(Ty: X->getType(), V: 0.375),
                                 Summand: ConstantFP::get(Ty: X->getType(), V: 0.5));

  return Builder.CreateFMA(Factor1: Y0E, Factor2: EFMA, Summand: IsNegative ? NegY0 : Y0);
}
712
713bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
714 FastMathFlags SqrtFMF) const {
715 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
716 // f64.
717 return DivFMF.allowContract() && SqrtFMF.allowContract();
718}
719
// Try to turn (+/-1.0) / sqrt(x) into an rsq-based expansion. Returns the
// replacement value, or nullptr if the pattern doesn't apply.
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
    const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  assert(DivFMF.allowContract() && SqrtFMF.allowContract());

  // rsq_f16 is accurate to 0.51 ulp.
  // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rsq_f64 is never accurate.
  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num);
  if (!CLHS)
    return nullptr;

  bool IsNegative = false;

  // TODO: Handle other numerator values with arcp.
  if (CLHS->isExactlyValue(V: 1.0) || (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
    // Add in the sqrt flags.
    IRBuilder<>::FastMathFlagGuard Guard(Builder);
    Builder.setFastMathFlags(DivFMF | SqrtFMF);

    if (Den->getType()->isFloatTy()) {
      // Raw rsq is fine when approximate math is allowed or denormal inputs
      // can be ignored; otherwise use the denormal-safe 1ulp expansion.
      if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
          canIgnoreDenormalInput(V: Den, CtxI)) {
        Value *Result =
            Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: Den);
        // -1.0 / sqrt(x) -> fneg(rsq(x))
        return IsNegative ? Builder.CreateFNeg(V: Result) : Result;
      }

      return emitRsqIEEE1ULP(Builder, Src: Den, IsNegative);
    }

    // f64 needs the Newton-style correction sequence on top of v_rsq_f64.
    if (Den->getType()->isDoubleTy())
      return emitRsqF64(Builder, X: Den, SqrtFMF, DivFMF, CtxI, IsNegative);
  }

  return nullptr;
}
759
// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//               allowed with afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
Value *
AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
                                          Value *Den, FastMathFlags FMF,
                                          const Instruction *CtxI) const {
  // rcp_f16 is accurate to 0.51 ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  assert(Den->getType()->isFloatTy());

  // Case 1: numerator is exactly +/-1.0.
  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num)) {
    bool IsNegative = false;
    if (CLHS->isExactlyValue(V: 1.0) ||
        (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
      Value *Src = Den;

      if (HasFP32DenormalFlush || FMF.approxFunc()) {
        // -1.0 / x -> 1.0 / fneg(x)
        if (IsNegative)
          Src = Builder.CreateFNeg(V: Src);

        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
        // to use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        //       insert rsq intrinsic here.

        // 1.0 / x -> rcp(x)
        return Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: Src);
      }

      // TODO: If the input isn't denormal, and we know the input exponent isn't
      // big enough to introduce a denormal we can avoid the scaling.
      return emitRcpIEEE1ULP(Builder, Src, IsNegative);
    }
  }

  // Case 2: general numerator, reciprocal math allowed.
  if (FMF.allowReciprocal()) {
    // x / y -> x * (1.0 / y)

    // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
    // will never underflow.
    if (HasFP32DenormalFlush || FMF.approxFunc()) {
      Value *Recip = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: Den);
      return Builder.CreateFMul(L: Num, R: Recip);
    }

    Value *Recip = emitRcpIEEE1ULP(Builder, Src: Den, IsNegative: false);
    return Builder.CreateFMul(L: Num, R: Recip);
  }

  return nullptr;
}
822
// optimize with fdiv.fast:
//
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
// 1/x -> fdiv.fast(1,x)  when !fpmath >= 2.5ulp.
//
// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
    IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
  // fdiv.fast can achieve 2.5 ULP accuracy.
  if (ReqdAccuracy < 2.5f)
    return nullptr;

  // Only have fdiv.fast for f32.
  assert(Den->getType()->isFloatTy());

  bool NumIsOne = false;
  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Val: Num)) {
    if (CNum->isExactlyValue(V: +1.0) || CNum->isExactlyValue(V: -1.0))
      NumIsOne = true;
  }

  // fdiv does not support denormals. But 1.0/x is always fine to use it.
  //
  // TODO: This works for any value with a specific known exponent range, don't
  // just limit to constant 1.
  if (!HasFP32DenormalFlush && !NumIsOne)
    return nullptr;

  return Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_fdiv_fast, Args: {Num, Den});
}
854
855Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
856 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
857 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
858 float ReqdDivAccuracy) const {
859 if (RsqOp) {
860 Value *Rsq =
861 optimizeWithRsq(Builder, Num, Den: RsqOp, DivFMF, SqrtFMF, CtxI: FDivInst);
862 if (Rsq)
863 return Rsq;
864 }
865
866 if (!Num->getType()->isFloatTy())
867 return nullptr;
868
869 Value *Rcp = optimizeWithRcp(Builder, Num, Den, FMF: DivFMF, CtxI: FDivInst);
870 if (Rcp)
871 return Rcp;
872
873 // In the basic case fdiv_fast has the same instruction count as the frexp div
874 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
875 // potentially be fused into a user. Also, materialization of the constants
876 // can be reused for multiple instances.
877 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdAccuracy: ReqdDivAccuracy);
878 if (FDivFast)
879 return FDivFast;
880
881 return emitFrexpDiv(Builder, LHS: Num, RHS: Den, FMF: DivFMF);
882}
883
884// Optimizations is performed based on fpmath, fast math flags as well as
885// denormals to optimize fdiv with either rcp or fdiv.fast.
886//
887// With rcp:
888// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
889// allowed with afn.
890//
891// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
892//
893// With fdiv.fast:
894// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
895//
896// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
897//
898// NOTE: rcp is the preference in cases that both are legal.
// Expand an fdiv into rcp/rsq/fdiv.fast/frexp-div sequences where profitable,
// scalarizing vector operands element by element. Returns true if the
// instruction was replaced; the dead original is queued on DeadVals.
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
  if (DisableFDivExpand)
    return false;

  // Only f32 and f64 scalar types (or vectors of them) are considered.
  Type *Ty = FDiv.getType()->getScalarType();
  const bool IsFloat = Ty->isFloatTy();
  if (!IsFloat && !Ty->isDoubleTy())
    return false;

  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
  // expansion around them in codegen. f16 is good enough to always use.

  const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &FDiv);
  const FastMathFlags DivFMF = FPOp->getFastMathFlags();
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  FastMathFlags SqrtFMF;

  Value *Num = FDiv.getOperand(i_nocapture: 0);
  Value *Den = FDiv.getOperand(i_nocapture: 1);

  // Recognize x / sqrt(y) where the sqrt has one use, so it can become rsq.
  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Val: Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(Val: DenII);
    SqrtFMF = SqrtOp->getFastMathFlags();
    if (canOptimizeWithRsq(DivFMF, SqrtFMF))
      RsqOp = SqrtOp->getOperand(i: 0);
  }

  // rcp path not yet implemented for f64.
  if (!IsFloat && !RsqOp)
    return false;

  // Inaccurate rcp is allowed with afn.
  //
  // Defer to codegen to handle this.
  //
  // TODO: Decide on an interpretation for interactions between afn + arcp +
  // !fpmath, and make it consistent between here and codegen. For now, defer
  // expansion of afn to codegen. The current interpretation is so aggressive we
  // don't need any pre-consideration here when we have better information. A
  // more conservative interpretation could use handling here.
  const bool AllowInaccurateRcp = DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)
    return false;

  // Defer the correct implementations to codegen.
  if (IsFloat && ReqdAccuracy < 1.0f)
    return false;

  // New code is inserted after the original fdiv, inheriting its fast-math
  // flags and debug location.
  IRBuilder<> Builder(FDiv.getParent(), std::next(x: FDiv.getIterator()));
  Builder.setFastMathFlags(DivFMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  // Scalarize: split vector operands into elements, optimize each
  // independently, then rebuild the vector.
  SmallVector<Value *, 4> NumVals;
  SmallVector<Value *, 4> DenVals;
  SmallVector<Value *, 4> RsqDenVals;
  extractValues(Builder, Values&: NumVals, V: Num);
  extractValues(Builder, Values&: DenVals, V: Den);

  if (RsqOp)
    extractValues(Builder, Values&: RsqDenVals, V: RsqOp);

  SmallVector<Value *, 4> ResultVals(NumVals.size());
  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

    Value *NewElt =
        visitFDivElement(Builder, Num: NumElt, Den: DenElt, DivFMF, SqrtFMF, RsqOp: RsqDenElt,
                         FDivInst: cast<Instruction>(Val: FPOp), ReqdDivAccuracy: ReqdAccuracy);
    if (!NewElt) {
      // Keep the original, but scalarized.

      // This has the unfortunate side effect of sometimes scalarizing when
      // we're not going to do anything.
      NewElt = Builder.CreateFDiv(L: NumElt, R: DenElt);
      if (auto *NewEltInst = dyn_cast<Instruction>(Val: NewElt))
        NewEltInst->copyMetadata(SrcInst: FDiv);
    }

    ResultVals[I] = NewElt;
  }

  Value *NewVal = insertValues(Builder, Ty: FDiv.getType(), Values&: ResultVals);

  if (NewVal) {
    FDiv.replaceAllUsesWith(V: NewVal);
    NewVal->takeName(V: &FDiv);
    DeadVals.push_back(Elt: &FDiv);
  }

  return true;
}
996
997static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
998 Value *LHS, Value *RHS) {
999 Type *I32Ty = Builder.getInt32Ty();
1000 Type *I64Ty = Builder.getInt64Ty();
1001
1002 Value *LHS_EXT64 = Builder.CreateZExt(V: LHS, DestTy: I64Ty);
1003 Value *RHS_EXT64 = Builder.CreateZExt(V: RHS, DestTy: I64Ty);
1004 Value *MUL64 = Builder.CreateMul(LHS: LHS_EXT64, RHS: RHS_EXT64);
1005 Value *Lo = Builder.CreateTrunc(V: MUL64, DestTy: I32Ty);
1006 Value *Hi = Builder.CreateLShr(LHS: MUL64, RHS: Builder.getInt64(C: 32));
1007 Hi = Builder.CreateTrunc(V: Hi, DestTy: I32Ty);
1008 return std::pair(Lo, Hi);
1009}
1010
1011static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1012 return getMul64(Builder, LHS, RHS).second;
1013}
1014
1015/// Figure out how many bits are really needed for this division.
1016/// \p MaxDivBits is an optimization hint to bypass the second
1017/// ComputeNumSignBits/computeKnownBits call if the first one is
1018/// insufficient.
// Returns the number of bits the division actually needs, or the full scalar
// width (SSBits) when it cannot be narrowed to MaxDivBits or fewer.
unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
                                                 Value *Den,
                                                 unsigned MaxDivBits,
                                                 bool IsSigned) const {
  assert(Num->getType()->getScalarSizeInBits() ==
         Den->getType()->getScalarSizeInBits());
  unsigned SSBits = Num->getType()->getScalarSizeInBits();
  if (IsSigned) {
    // Analyze the denominator first: if it alone needs more than MaxDivBits,
    // the numerator analysis can be skipped entirely (the optimization hint).
    unsigned RHSSignBits = ComputeNumSignBits(Op: Den, DL: SQ.DL, AC: SQ.AC, CxtI: &I, DT: SQ.DT);
    // A sign bit needs to be reserved for shrinking.
    unsigned DivBits = SSBits - RHSSignBits + 1;
    if (DivBits > MaxDivBits)
      return SSBits;

    unsigned LHSSignBits = ComputeNumSignBits(Op: Num, DL: SQ.DL, AC: SQ.AC, CxtI: &I);

    // The operand with fewer sign bits determines the final width.
    unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
    DivBits = SSBits - SignBits + 1;
    return DivBits;
  }

  // All bits are used for unsigned division for Num or Den in range
  // (SignedMax, UnsignedMax].
  KnownBits Known = computeKnownBits(V: Den, Q: SQ.getWithInstruction(I: &I));
  if (Known.isNegative() || !Known.isNonNegative())
    return SSBits;
  unsigned RHSSignBits = Known.countMinLeadingZeros();
  unsigned DivBits = SSBits - RHSSignBits;
  // Bail before the second computeKnownBits call if the denominator alone
  // already exceeds the budget.
  if (DivBits > MaxDivBits)
    return SSBits;

  Known = computeKnownBits(V: Num, Q: SQ.getWithInstruction(I: &I));
  if (Known.isNegative() || !Known.isNonNegative())
    return SSBits;
  unsigned LHSSignBits = Known.countMinLeadingZeros();

  unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
  DivBits = SSBits - SignBits;
  return DivBits;
}
1059
1060// The fractional part of a float is enough to accurately represent up to
1061// a 24-bit signed integer.
1062Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1063 BinaryOperator &I, Value *Num,
1064 Value *Den, bool IsDiv,
1065 bool IsSigned) const {
1066 unsigned DivBits = getDivNumBits(I, Num, Den, MaxDivBits: 24, IsSigned);
1067 if (DivBits > 24)
1068 return nullptr;
1069 return expandDivRem24Impl(Builder, I, Num, Den, NumBits: DivBits, IsDiv, IsSigned);
1070}
1071
// Emit the float-based expansion for a divide/remainder that fits in at most
// 24 bits. The quotient is estimated with rcp, corrected by one mad round and
// a conditional +1, then the result is sign/zero-extended in register from
// \p DivBits back to 32 bits. The C-like comments mirror the scalar algorithm
// being emitted.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
    unsigned DivBits, bool IsDiv, bool IsSigned) const {
  Type *I32Ty = Builder.getInt32Ty();
  // Operands narrower than 32 bits are first brought to i32.
  Num = Builder.CreateTrunc(V: Num, DestTy: I32Ty);
  Den = Builder.CreateTrunc(V: Den, DestTy: I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(C: 1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(LHS: Num, RHS: Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(LHS: JQ, RHS: Builder.getInt32(C: 30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(LHS: JQ, RHS: One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib, (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(V: IA, DestTy: F32Ty)
                       : Builder.CreateUIToFP(V: IA, DestTy: F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(V: IB,DestTy: F32Ty)
                       : Builder.CreateUIToFP(V: IB,DestTy: F32Ty);

  // Initial quotient estimate: fa * rcp(fb).
  Value *RCP = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_rcp,
                                       Types: Builder.getFloatTy(), Args: {FB});
  Value *FQM = Builder.CreateFMul(L: FA, R: RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(ID: Intrinsic::trunc, V: FQM);
  FQ->copyFastMathFlags(FMF: Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(V: FQ);

  // float fr = mad(fqneg, fb, fa);
  // Use fmad_ftz when the target has mad-mac f32 instructions, plain fma
  // otherwise.
  auto FMAD = !ST.hasMadMacF32Insts()
                  ? Intrinsic::fma
                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
  Value *FR = Builder.CreateIntrinsic(ID: FMAD,
                                      Types: {FQNeg->getType()}, Args: {FQNeg, FB, FA}, FMFSource: FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(V: FQ, DestTy: I32Ty)
                       : Builder.CreateFPToUI(V: FQ, DestTy: I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: FR, FMFSource: FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: FB, FMFSource: FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(LHS: FR, RHS: FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(C: CV, True: JQ, False: Builder.getInt32(C: 0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(LHS: IQ, RHS: JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(LHS: Div, RHS: Den);
    Res = Builder.CreateSub(LHS: Num, RHS: Rem);
  }

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;

      // shl + ashr pair sign-extends from DivBits.
      Res = Builder.CreateShl(LHS: Res, RHS: InRegBits);
      Res = Builder.CreateAShr(LHS: Res, RHS: InRegBits);
    } else {
      // Mask off the bits above DivBits to zero-extend.
      ConstantInt *TruncMask
        = Builder.getInt32(C: (UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(LHS: Res, RHS: TruncMask);
    }
  }

  return Res;
}
1168
1169// Try to recognize special cases the DAG will emit special, better expansions
1170// than the general expansion we do here.
1171
1172// TODO: It would be better to just directly handle those optimizations here.
1173bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1174 Value *Num,
1175 Value *Den) const {
1176 if (Constant *C = dyn_cast<Constant>(Val: Den)) {
1177 // Arbitrary constants get a better expansion as long as a wider mulhi is
1178 // legal.
1179 if (C->getType()->getScalarSizeInBits() <= 32)
1180 return true;
1181
1182 // TODO: Sdiv check for not exact for some reason.
1183
1184 // If there's no wider mulhi, there's only a better expansion for powers of
1185 // two.
1186 // TODO: Should really know for each vector element.
1187 if (isKnownToBeAPowerOfTwo(V: C, OrZero: true, Q: SQ.getWithInstruction(I: &I)))
1188 return true;
1189
1190 return false;
1191 }
1192
1193 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Val: Den)) {
1194 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1195 if (BinOpDen->getOpcode() == Instruction::Shl &&
1196 isa<Constant>(Val: BinOpDen->getOperand(i_nocapture: 0)) &&
1197 isKnownToBeAPowerOfTwo(V: BinOpDen->getOperand(i_nocapture: 0), OrZero: true,
1198 Q: SQ.getWithInstruction(I: &I))) {
1199 return true;
1200 }
1201 }
1202
1203 return false;
1204}
1205
1206static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1207 // Check whether the sign can be determined statically.
1208 KnownBits Known = computeKnownBits(V, DL);
1209 if (Known.isNegative())
1210 return Constant::getAllOnesValue(Ty: V->getType());
1211 if (Known.isNonNegative())
1212 return Constant::getNullValue(Ty: V->getType());
1213 return Builder.CreateAShr(LHS: V, RHS: Builder.getInt32(C: 31));
1214}
1215
// Expand a (up to) 32-bit integer divide/remainder. Narrow operands are
// widened to i32, a 24-bit expansion is attempted first, and otherwise the
// Newton-Raphson-based unsigned algorithm below is emitted. Signed cases are
// reduced to unsigned by sign-folding the operands and restoring the sign at
// the end. Returns nullptr when the DAG has a better special-case expansion.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *X,
                                                Value *Y) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  // The FP steps of the expansion are exact by construction; fast flags are
  // safe here.
  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (divHasSpecialOptimization(I, Num: X, Den: Y))
    return nullptr; // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  // Promote sub-32-bit operands to i32 for the expansion.
  if (Ty->getScalarSizeInBits() != 32) {
    if (IsSigned) {
      X = Builder.CreateSExtOrTrunc(V: X, DestTy: I32Ty);
      Y = Builder.CreateSExtOrTrunc(V: Y, DestTy: I32Ty);
    } else {
      X = Builder.CreateZExtOrTrunc(V: X, DestTy: I32Ty);
      Y = Builder.CreateZExtOrTrunc(V: Y, DestTy: I32Ty);
    }
  }

  // Prefer the cheaper 24-bit float-based expansion when it fits.
  if (Value *Res = expandDivRem24(Builder, I, Num: X, Den: Y, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty) :
                      Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
  }

  ConstantInt *Zero = Builder.getInt32(C: 0);
  ConstantInt *One = Builder.getInt32(C: 1);

  Value *Sign = nullptr;
  if (IsSigned) {
    // Reduce signed division to unsigned: take absolute values via the
    // (x + sign) ^ sign identity, and remember the result's sign.
    Value *SignX = getSign32(V: X, Builder, DL);
    Value *SignY = getSign32(V: Y, Builder, DL);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHS: SignX, RHS: SignY) : SignX;

    X = Builder.CreateAdd(LHS: X, RHS: SignX);
    Y = Builder.CreateAdd(LHS: Y, RHS: SignY);

    X = Builder.CreateXor(LHS: X, RHS: SignX);
    Y = Builder.CreateXor(LHS: Y, RHS: SignY);
  }

  // The algorithm here is based on ideas from "Software Integer Division", Tom
  // Rodeheffer, August 2008.
  //
  // unsigned udiv(unsigned x, unsigned y) {
  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
  //   // that this is a lower bound on inv(y), even if some of the calculations
  //   // round up.
  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
  //
  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
  //   // Empirically this is guaranteed to give a "two-y" lower bound on
  //   // inv(y).
  //   z += umulh(z, -y * z);
  //
  //   // Quotient/remainder estimate.
  //   unsigned q = umulh(x, z);
  //   unsigned r = x - q * y;
  //
  //   // Two rounds of quotient/remainder refinement.
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //
  //   return q;
  // }

  // Initial estimate of inv(y).
  Value *FloatY = Builder.CreateUIToFP(V: Y, DestTy: F32Ty);
  Value *RcpY = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_rcp, Types: F32Ty, Args: {FloatY});
  // 0x4F7FFFFE is (4294967296.0 - 512.0) as an f32 bit pattern.
  Constant *Scale = ConstantFP::get(Ty: F32Ty, V: llvm::bit_cast<float>(from: 0x4F7FFFFE));
  Value *ScaledY = Builder.CreateFMul(L: RcpY, R: Scale);
  Value *Z = Builder.CreateFPToUI(V: ScaledY, DestTy: I32Ty);

  // One round of UNR.
  Value *NegY = Builder.CreateSub(LHS: Zero, RHS: Y);
  Value *NegYZ = Builder.CreateMul(LHS: NegY, RHS: Z);
  Z = Builder.CreateAdd(LHS: Z, RHS: getMulHu(Builder, LHS: Z, RHS: NegYZ));

  // Quotient/remainder estimate.
  Value *Q = getMulHu(Builder, LHS: X, RHS: Z);
  Value *R = Builder.CreateSub(LHS: X, RHS: Builder.CreateMul(LHS: Q, RHS: Y));

  // First quotient/remainder refinement.
  Value *Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
  if (IsDiv)
    Q = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
  R = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);

  // Second quotient/remainder refinement.
  Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
  Value *Res;
  if (IsDiv)
    Res = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
  else
    Res = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);

  if (IsSigned) {
    // Restore the sign ((res ^ sign) - sign negates when sign is all-ones).
    Res = Builder.CreateXor(LHS: Res, RHS: Sign);
    Res = Builder.CreateSub(LHS: Res, RHS: Sign);
    Res = Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty);
  } else {
    Res = Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
  }
  return Res;
}
1339
1340Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1341 BinaryOperator &I, Value *Num,
1342 Value *Den) const {
1343 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1344 return nullptr; // Keep it for later optimization.
1345
1346 Instruction::BinaryOps Opc = I.getOpcode();
1347
1348 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1349 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1350
1351 unsigned NumDivBits = getDivNumBits(I, Num, Den, MaxDivBits: 32, IsSigned);
1352 if (NumDivBits > 32)
1353 return nullptr;
1354
1355 Value *Narrowed = nullptr;
1356 if (NumDivBits <= 24) {
1357 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, DivBits: NumDivBits,
1358 IsDiv, IsSigned);
1359 } else if (NumDivBits <= 32) {
1360 Narrowed = expandDivRem32(Builder, I, X: Num, Y: Den);
1361 }
1362
1363 if (Narrowed) {
1364 return IsSigned ? Builder.CreateSExt(V: Narrowed, DestTy: Num->getType()) :
1365 Builder.CreateZExt(V: Narrowed, DestTy: Num->getType());
1366 }
1367
1368 return nullptr;
1369}
1370
1371void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1372 Instruction::BinaryOps Opc = I.getOpcode();
1373 // Do the general expansion.
1374 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1375 expandDivisionUpTo64Bits(Div: &I);
1376 return;
1377 }
1378
1379 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1380 expandRemainderUpTo64Bits(Rem: &I);
1381 return;
1382 }
1383
1384 llvm_unreachable("not a division");
1385}
1386
1387/*
1388This will cause non-byte load in consistency, for example:
1389```
1390 %load = load i1, ptr addrspace(4) %arg, align 4
1391 %zext = zext i1 %load to
1392 i64 %add = add i64 %zext
1393```
1394Instead of creating `s_and_b32 s0, s0, 1`,
1395it will create `s_and_b32 s0, s0, 0xff`.
1396We accept this change since the non-byte load assumes the upper bits
1397within the byte are all 0.
1398*/
1399bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1400 unsigned Opc = I->getOpcode();
1401 Type *OldType = I->getType();
1402
1403 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1404 return false;
1405
1406 unsigned OrigBit = OldType->getScalarSizeInBits();
1407
1408 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1409 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1410 "Instruction::Mul.");
1411
1412 unsigned MaxBitsNeeded = computeKnownBits(V: I, DL).countMaxActiveBits();
1413
1414 MaxBitsNeeded = std::max<unsigned>(a: bit_ceil(Value: MaxBitsNeeded), b: 8);
1415 Type *NewType = DL.getSmallestLegalIntType(C&: I->getContext(), Width: MaxBitsNeeded);
1416 if (!NewType)
1417 return false;
1418 unsigned NewBit = NewType->getIntegerBitWidth();
1419 if (NewBit >= OrigBit)
1420 return false;
1421 NewType = I->getType()->getWithNewBitWidth(NewBitWidth: NewBit);
1422
1423 // Old cost
1424 const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
1425 InstructionCost OldCost =
1426 TTI.getArithmeticInstrCost(Opcode: Opc, Ty: OldType, CostKind: TTI::TCK_RecipThroughput);
1427 // New cost of new op
1428 InstructionCost NewCost =
1429 TTI.getArithmeticInstrCost(Opcode: Opc, Ty: NewType, CostKind: TTI::TCK_RecipThroughput);
1430 // New cost of narrowing 2 operands (use trunc)
1431 int NumOfNonConstOps = 2;
1432 if (isa<Constant>(Val: I->getOperand(i: 0)) || isa<Constant>(Val: I->getOperand(i: 1))) {
1433 // Cannot be both constant, should be propagated
1434 NumOfNonConstOps = 1;
1435 }
1436 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Opcode: Instruction::Trunc,
1437 Dst: NewType, Src: OldType,
1438 CCH: TTI.getCastContextHint(I),
1439 CostKind: TTI::TCK_RecipThroughput);
1440 // New cost of zext narrowed result to original type
1441 NewCost +=
1442 TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: OldType, Src: NewType,
1443 CCH: TTI.getCastContextHint(I), CostKind: TTI::TCK_RecipThroughput);
1444 if (NewCost >= OldCost)
1445 return false;
1446
1447 IRBuilder<> Builder(I);
1448 Value *Trunc0 = Builder.CreateTrunc(V: I->getOperand(i: 0), DestTy: NewType);
1449 Value *Trunc1 = Builder.CreateTrunc(V: I->getOperand(i: 1), DestTy: NewType);
1450 Value *Arith =
1451 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opc, LHS: Trunc0, RHS: Trunc1);
1452
1453 Value *Zext = Builder.CreateZExt(V: Arith, DestTy: OldType);
1454 I->replaceAllUsesWith(V: Zext);
1455 DeadVals.push_back(Elt: I);
1456 return true;
1457}
1458
// Visitor for binary operators: folds into selects, applies mul24 and
// narrowing rewrites, and expands integer divide/remainder up to 64 bits.
// Returns true if anything changed.
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(BO&: I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;
  if (tryNarrowMathIfNoOverflow(I: &I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  unsigned ScalarSize = Ty->getScalarSizeInBits();

  // 64-bit divisions that need the general (CFG-introducing) expansion are
  // collected here and expanded after the main rewrite.
  SmallVector<BinaryOperator *, 8> Div64ToExpand;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      ScalarSize <= 64 &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(i_nocapture: 0);
    Value *Den = I.getOperand(i_nocapture: 1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
      // Vector case: expand element by element and rebuild the result.
      NewDiv = PoisonValue::get(T: VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Vec: Num, Idx: N);
        Value *DenEltN = Builder.CreateExtractElement(Vec: Den, Idx: N);

        Value *NewElt;
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, X: NumEltN, Y: DenEltN);
          if (!NewElt)
            NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
        } else {
          // See if this 64-bit division can be shrunk to 32/24-bits before
          // producing the general expansion.
          NewElt = shrinkDivRem64(Builder, I, Num: NumEltN, Den: DenEltN);
          if (!NewElt) {
            // The general 64-bit expansion introduces control flow and doesn't
            // return the new value. Just insert a scalar copy and defer
            // expanding it.
            NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
            // CreateBinOp does constant folding. If the operands are constant,
            // it will return a Constant instead of a BinaryOperator.
            if (auto *NewEltBO = dyn_cast<BinaryOperator>(Val: NewElt))
              Div64ToExpand.push_back(Elt: NewEltBO);
          }
        }

        if (auto *NewEltI = dyn_cast<Instruction>(Val: NewElt))
          NewEltI->copyIRFlags(V: &I);

        NewDiv = Builder.CreateInsertElement(Vec: NewDiv, NewElt, Idx: N);
      }
    } else {
      // Scalar case.
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, X: Num, Y: Den);
      else {
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
        if (!NewDiv)
          Div64ToExpand.push_back(Elt: &I);
      }
    }

    if (NewDiv) {
      I.replaceAllUsesWith(V: NewDiv);
      DeadVals.push_back(Elt: &I);
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    // TODO: We get much worse code in specially handled constant cases.
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(I&: *Div);
      FlowChanged = true;
      Changed = true;
    }
  }

  return Changed;
}
1546
// Widen eligible sub-dword constant-address-space loads to i32, adjusting
// range metadata, and truncate/bitcast back to the original type. Gated on
// the WidenLoads flag. Returns true if the load was replaced.
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    LoadInst *WidenLoad = Builder.CreateLoad(Ty: I32Ty, Ptr: I.getPointerOperand());
    WidenLoad->copyMetadata(SrcInst: I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(KindID: LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(MD: Range->getOperand(I: 0));

      if (Lower->isNullValue()) {
        // A zero lower bound gives no information once widened; drop it.
        WidenLoad->setMetadata(KindID: LLVMContext::MD_range, Node: nullptr);
      } else {
        // Keep the lower bound (zero-extended to i32) but widen the upper
        // bound to the full wrap-around range.
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: Lower->getValue().zext(width: 32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: 0))
        };

        WidenLoad->setMetadata(KindID: LLVMContext::MD_range,
                               Node: MDNode::get(Context&: F.getContext(), MDs: LowAndHigh));
      }
    }

    // Truncate the widened value back to the load's original width, then
    // bitcast to restore the original (possibly non-integer) type.
    int TySize = DL.getTypeSizeInBits(Ty: I.getType());
    Type *IntNTy = Builder.getIntNTy(N: TySize);
    Value *ValTrunc = Builder.CreateTrunc(V: WidenLoad, DestTy: IntNTy);
    Value *ValOrig = Builder.CreateBitCast(V: ValTrunc, DestTy: I.getType());
    I.replaceAllUsesWith(V: ValOrig);
    DeadVals.push_back(Elt: &I);
    return true;
  }

  return false;
}
1592
// Match NaN-guarded fract patterns on a select and rewrite them to use the
// fract operation (via applyFractPat/matchFractPat). Handles both operand
// orders of the isnan guard, plus an intermediate clamp-infinity-to-zero
// form. Returns true if the select was replaced.
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
  Value *Cond = I.getCondition();
  Value *TrueVal = I.getTrueValue();
  Value *FalseVal = I.getFalseValue();
  Value *CmpVal;
  CmpPredicate IsNanPred;

  // Match fract pattern with nan check.
  if (!match(V: Cond, P: m_FCmp(Pred&: IsNanPred, L: m_Value(V&: CmpVal), R: m_NonNaN())))
    return false;

  FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: &I);
  if (!FPOp)
    return false;

  IRBuilder<> Builder(&I);
  Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *Fract = nullptr;
  if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
      CmpVal == matchFractPat(V&: *FalseVal)) {
    // isnan(x) ? x : fract(x)
    Fract = applyFractPat(Builder, FractArg: CmpVal);
  } else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
    if (CmpVal == matchFractPat(V&: *TrueVal)) {
      // !isnan(x) ? fract(x) : x
      Fract = applyFractPat(Builder, FractArg: CmpVal);
    } else {
      // Match an intermediate clamp infinity to 0 pattern. i.e.
      // !isnan(x) ? (!isinf(x) ? fract(x) : 0.0) : x
      CmpPredicate PredInf;
      Value *IfNotInf;

      if (!match(V: TrueVal, P: m_Select(C: m_FCmp(Pred&: PredInf, L: m_FAbs(Op0: m_Specific(V: CmpVal)),
                                             R: m_PosInf()),
                                     L: m_Value(V&: IfNotInf), R: m_PosZeroFP())) ||
          PredInf != FCmpInst::FCMP_UNE || CmpVal != matchFractPat(V&: *IfNotInf))
        return false;

      SelectInst *ClampInfSelect = cast<SelectInst>(Val: TrueVal);

      // Insert before the fabs
      Value *InsertPt =
          cast<Instruction>(Val: ClampInfSelect->getCondition())->getOperand(i: 0);

      Builder.SetInsertPoint(cast<Instruction>(Val: InsertPt));
      Value *NewFract = applyFractPat(Builder, FractArg: CmpVal);
      NewFract->takeName(V: TrueVal);

      // Thread the new fract into the inf clamping sequence.
      DeadVals.push_back(Elt: ClampInfSelect->getOperand(i_nocapture: 1));
      ClampInfSelect->setOperand(i_nocapture: 1, Val_nocapture: NewFract);

      // The outer select nan handling is also absorbed into the fract.
      Fract = ClampInfSelect;
    }
  } else
    return false;

  // Replace the original select with the fract-based rewrite.
  Fract->takeName(V: &I);
  I.replaceAllUsesWith(V: Fract);
  DeadVals.push_back(Elt: &I);
  return true;
}
1657
1658static bool areInSameBB(const Value *A, const Value *B) {
1659 const auto *IA = dyn_cast<Instruction>(Val: A);
1660 const auto *IB = dyn_cast<Instruction>(Val: B);
1661 return IA && IB && IA->getParent() == IB->getParent();
1662}
1663
1664// Helper for breaking large PHIs that returns true when an extractelement on V
1665// is likely to be folded away by the DAG combiner.
1666static bool isInterestingPHIIncomingValue(const Value *V) {
1667 const auto *FVT = dyn_cast<FixedVectorType>(Val: V->getType());
1668 if (!FVT)
1669 return false;
1670
1671 const Value *CurVal = V;
1672
1673 // Check for insertelements, keeping track of the elements covered.
1674 BitVector EltsCovered(FVT->getNumElements());
1675 while (const auto *IE = dyn_cast<InsertElementInst>(Val: CurVal)) {
1676 const auto *Idx = dyn_cast<ConstantInt>(Val: IE->getOperand(i_nocapture: 2));
1677
1678 // Non constant index/out of bounds index -> folding is unlikely.
1679 // The latter is more of a sanity check because canonical IR should just
1680 // have replaced those with poison.
1681 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1682 return false;
1683
1684 const auto *VecSrc = IE->getOperand(i_nocapture: 0);
1685
1686 // If the vector source is another instruction, it must be in the same basic
1687 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1688 // unlikely to be able to do anything interesting here.
1689 if (isa<Instruction>(Val: VecSrc) && !areInSameBB(A: VecSrc, B: IE))
1690 return false;
1691
1692 CurVal = VecSrc;
1693 EltsCovered.set(Idx->getZExtValue());
1694
1695 // All elements covered.
1696 if (EltsCovered.all())
1697 return true;
1698 }
1699
1700 // We either didn't find a single insertelement, or the insertelement chain
1701 // ended before all elements were covered. Check for other interesting values.
1702
1703 // Constants are always interesting because we can just constant fold the
1704 // extractelements.
1705 if (isa<Constant>(Val: CurVal))
1706 return true;
1707
1708 // shufflevector is likely to be profitable if either operand is a constant,
1709 // or if either source is in the same block.
1710 // This is because shufflevector is most often lowered as a series of
1711 // insert/extract elements anyway.
1712 if (const auto *SV = dyn_cast<ShuffleVectorInst>(Val: CurVal)) {
1713 return isa<Constant>(Val: SV->getOperand(i_nocapture: 1)) ||
1714 areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 0)) ||
1715 areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 1));
1716 }
1717
1718 return false;
1719}
1720
1721static void collectPHINodes(const PHINode &I,
1722 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1723 const auto [It, Inserted] = SeenPHIs.insert(Ptr: &I);
1724 if (!Inserted)
1725 return;
1726
1727 for (const Value *Inc : I.incoming_values()) {
1728 if (const auto *PhiInc = dyn_cast<PHINode>(Val: Inc))
1729 collectPHINodes(I: *PhiInc, SeenPHIs);
1730 }
1731
1732 for (const User *U : I.users()) {
1733 if (const auto *PhiU = dyn_cast<PHINode>(Val: U))
1734 collectPHINodes(I: *PhiU, SeenPHIs);
1735 }
1736}
1737
// Decide whether PHI \p I (and its whole related PHI chain) is profitable to
// break into smaller pieces. The decision is cached per-PHI so each chain is
// analyzed only once.
bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
  // Check in the cache first.
  if (const auto It = BreakPhiNodesCache.find(Val: &I);
      It != BreakPhiNodesCache.end())
    return It->second;

  // We consider PHI nodes as part of "chains", so given a PHI node I, we
  // recursively consider all its users and incoming values that are also PHI
  // nodes. We then make a decision about all of those PHIs at once. Either they
  // all get broken up, or none of them do. That way, we avoid cases where a
  // single PHI is/is not broken and we end up reforming/exploding a vector
  // multiple times, or even worse, doing it in a loop.
  SmallPtrSet<const PHINode *, 8> WorkList;
  collectPHINodes(I, SeenPHIs&: WorkList);

#ifndef NDEBUG
  // Check that none of the PHI nodes in the worklist are in the map. If some of
  // them are, it means we're not good enough at collecting related PHIs.
  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);
  }
#endif

  // To consider a PHI profitable to break, we need to see some interesting
  // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
  // must have one to consider all PHIs breakable.
  //
  // This threshold has been determined through performance testing.
  //
  // Note that the computation below is equivalent to
  //
  //    (unsigned)ceil((K / 3.0) * 2)
  //
  // It's simply written this way to avoid mixing integral/FP arithmetic.
  const auto Threshold = (alignTo(Value: WorkList.size() * 2, Align: 3) / 3);
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {
    // Don't break PHIs that have no interesting incoming values. That is, where
    // there is no clear opportunity to fold the "extractelement" instructions
    // we would add.
    //
    // Note: IC does not run after this pass, so we're only interested in the
    // foldings that the DAG combiner can do.
    if (any_of(Range: Cur->incoming_values(), P: isInterestingPHIIncomingValue)) {
      // Stop counting as soon as the threshold is reached; the decision is
      // made for the whole chain.
      if (++NumBreakablePHIs >= Threshold) {
        CanBreak = true;
        break;
      }
    }
  }

  // Record the verdict for every PHI in the chain so none is revisited.
  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;

  return CanBreak;
}
1795
1796/// Helper class for "break large PHIs" (visitPHINode).
1797///
1798/// This represents a slice of a PHI's incoming value, which is made up of:
1799/// - The type of the slice (Ty)
1800/// - The index in the incoming value's vector where the slice starts (Idx)
1801/// - The number of elements in the slice (NumElts).
1802/// It also keeps track of the NewPHI node inserted for this particular slice.
1803///
1804/// Slice examples:
1805/// <4 x i64> -> Split into four i64 slices.
1806/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1807/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1808/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1809class VectorSlice {
1810public:
1811 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1812 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1813
1814 Type *Ty = nullptr;
1815 unsigned Idx = 0;
1816 unsigned NumElts = 0;
1817 PHINode *NewPHI = nullptr;
1818
1819 /// Slice \p Inc according to the information contained within this slice.
1820 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1821 /// pair, it returns the same Sliced value as well.
1822 ///
1823 /// Note this *intentionally* does not return the same value for, say,
1824 /// [%bb.0, %0] & [%bb.1, %0] as:
1825 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1826 /// the value in bb.1 may not be reachable from bb.0 if it's its
1827 /// predecessor.)
1828 /// - We also want to make our extract instructions as local as possible so
1829 /// the DAG has better chances of folding them out. Duplicating them like
1830 /// that is beneficial in that regard.
1831 ///
1832 /// This is both a minor optimization to avoid creating duplicate
1833 /// instructions, but also a requirement for correctness. It is not forbidden
1834 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1835 /// returned a new value each time, those previously identical pairs would all
1836 /// have different incoming values (from the same block) and it'd cause a "PHI
1837 /// node has multiple entries for the same basic block with different incoming
1838 /// values!" verifier error.
1839 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1840 Value *&Res = SlicedVals[{BB, Inc}];
1841 if (Res)
1842 return Res;
1843
1844 IRBuilder<> B(BB->getTerminator());
1845 if (Instruction *IncInst = dyn_cast<Instruction>(Val: Inc))
1846 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1847
1848 if (NumElts > 1) {
1849 SmallVector<int, 4> Mask;
1850 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1851 Mask.push_back(Elt: K);
1852 Res = B.CreateShuffleVector(V: Inc, Mask, Name: NewValName);
1853 } else
1854 Res = B.CreateExtractElement(Vec: Inc, Idx, Name: NewValName);
1855
1856 return Res;
1857 }
1858
1859private:
1860 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1861};
1862
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
  // Break-up fixed-vector PHIs into smaller pieces.
  // Default threshold is 32, so it breaks up any vector that's >32 bits into
  // its elements, or into 32-bit pieces (for 8/16 bit elts).
  //
  // This is only helpful for DAGISel because it doesn't handle large PHIs as
  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
  // With large, odd-sized PHIs we may end up needing many `build_vector`
  // operations with most elements being "undef". This inhibits a lot of
  // optimization opportunities and can result in unreasonably high register
  // pressure and the inevitable stack spilling.
  if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
    return false;

  // Only fixed vectors above the size threshold are worth breaking.
  FixedVectorType *FVT = dyn_cast<FixedVectorType>(Val: I.getType());
  if (!FVT || FVT->getNumElements() == 1 ||
      DL.getTypeSizeInBits(Ty: FVT) <= BreakLargePHIsThreshold)
    return false;

  // Profitability check for the whole PHI chain (can be force-overridden).
  if (!ForceBreakLargePHIs && !canBreakPHINode(I))
    return false;

  std::vector<VectorSlice> Slices;

  Type *EltTy = FVT->getElementType();
  {
    unsigned Idx = 0;
    // For 8/16 bits type, don't scalarize fully but break it up into as many
    // 32-bit slices as we can, and scalarize the tail.
    const unsigned EltSize = DL.getTypeSizeInBits(Ty: EltTy);
    const unsigned NumElts = FVT->getNumElements();
    if (EltSize == 8 || EltSize == 16) {
      const unsigned SubVecSize = (32 / EltSize);
      Type *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: SubVecSize);
      for (unsigned End = alignDown(Value: NumElts, Align: SubVecSize); Idx < End;
           Idx += SubVecSize)
        Slices.emplace_back(args&: SubVecTy, args&: Idx, args: SubVecSize);
    }

    // Scalarize all remaining elements.
    for (; Idx < NumElts; ++Idx)
      Slices.emplace_back(args&: EltTy, args&: Idx, args: 1);
  }

  // A single slice would mean nothing was actually broken up.
  assert(Slices.size() > 1);

  // Create one PHI per vector piece. The "VectorSlice" class takes care of
  // creating the necessary instruction to extract the relevant slices of each
  // incoming value.
  IRBuilder<> B(I.getParent());
  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;
  for (VectorSlice &S : Slices) {
    // We need to reset the build on each iteration, because getSlicedVal may
    // have inserted something into I's BB.
    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(Ty: S.Ty, NumReservedValues: I.getNumIncomingValues());

    for (const auto &[Idx, BB] : enumerate(First: I.blocks())) {
      S.NewPHI->addIncoming(V: S.getSlicedVal(BB, Inc: I.getIncomingValue(i: Idx),
                                            NewValName: "largephi.extractslice" +
                                                std::to_string(val: IncNameSuffix++)),
                            BB);
    }
  }

  // And replace this PHI with a vector of all the previous PHI values.
  Value *Vec = PoisonValue::get(T: FVT);
  unsigned NameSuffix = 0;
  for (VectorSlice &S : Slices) {
    const auto ValName = "largephi.insertslice" + std::to_string(val: NameSuffix++);
    if (S.NumElts > 1)
      Vec = B.CreateInsertVector(DstType: FVT, SrcVec: Vec, SubVec: S.NewPHI, Idx: S.Idx, Name: ValName);
    else
      Vec = B.CreateInsertElement(Vec, NewElt: S.NewPHI, Idx: S.Idx, Name: ValName);
  }

  I.replaceAllUsesWith(V: Vec);
  DeadVals.push_back(Elt: &I);
  return true;
}
1945
1946/// \param V Value to check
1947/// \param DL DataLayout
1948/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1949/// \param AS Target Address Space
1950/// \return true if \p V cannot be the null value of \p AS, false otherwise.
1951static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1952 const AMDGPUTargetMachine &TM, unsigned AS) {
1953 // Pointer cannot be null if it's a block address, GV or alloca.
1954 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1955 // it as the symbol could be null in such cases.
1956 if (isa<BlockAddress, GlobalValue, AllocaInst>(Val: V))
1957 return true;
1958
1959 // Check nonnull arguments.
1960 if (const auto *Arg = dyn_cast<Argument>(Val: V); Arg && Arg->hasNonNullAttr())
1961 return true;
1962
1963 // Check nonnull loads.
1964 if (const auto *Load = dyn_cast<LoadInst>(Val: V);
1965 Load && Load->hasMetadata(KindID: LLVMContext::MD_nonnull))
1966 return true;
1967
1968 // getUnderlyingObject may have looked through another addrspacecast, although
1969 // the optimizable situations most likely folded out by now.
1970 if (AS != cast<PointerType>(Val: V->getType())->getAddressSpace())
1971 return false;
1972
1973 // TODO: Calls that return nonnull?
1974
1975 // For all other things, use KnownBits.
1976 // We either use 0 or all bits set to indicate null, so check whether the
1977 // value can be zero or all ones.
1978 //
1979 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
1980 // address spaces have non-zero null values.
1981 auto SrcPtrKB = computeKnownBits(V, DL);
1982 const auto NullVal = AMDGPU::getNullPointerValue(AS);
1983
1984 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
1985 assert((NullVal == 0 || NullVal == -1) &&
1986 "don't know how to check for this null value!");
1987 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
1988}
1989
1990bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
1991 // Intrinsic doesn't support vectors, also it seems that it's often difficult
1992 // to prove that a vector cannot have any nulls in it so it's unclear if it's
1993 // worth supporting.
1994 if (I.getType()->isVectorTy())
1995 return false;
1996
1997 // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
1998 // This is only worthwhile for casts from/to priv/local to flat.
1999 const unsigned SrcAS = I.getSrcAddressSpace();
2000 const unsigned DstAS = I.getDestAddressSpace();
2001
2002 bool CanLower = false;
2003 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2004 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2005 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2006 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2007 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2008 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2009 if (!CanLower)
2010 return false;
2011
2012 SmallVector<const Value *, 4> WorkList;
2013 getUnderlyingObjects(V: I.getOperand(i_nocapture: 0), Objects&: WorkList);
2014 if (!all_of(Range&: WorkList, P: [&](const Value *V) {
2015 return isPtrKnownNeverNull(V, DL, TM, AS: SrcAS);
2016 }))
2017 return false;
2018
2019 IRBuilder<> B(&I);
2020 auto *Intrin = B.CreateIntrinsic(
2021 RetTy: I.getType(), ID: Intrinsic::amdgcn_addrspacecast_nonnull, Args: {I.getOperand(i_nocapture: 0)});
2022 I.replaceAllUsesWith(V: Intrin);
2023 DeadVals.push_back(Elt: &I);
2024 return true;
2025}
2026
2027bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2028 Intrinsic::ID IID = I.getIntrinsicID();
2029 switch (IID) {
2030 case Intrinsic::minnum:
2031 case Intrinsic::minimumnum:
2032 case Intrinsic::minimum:
2033 return visitFMinLike(I);
2034 case Intrinsic::sqrt:
2035 return visitSqrt(I);
2036 case Intrinsic::log:
2037 case Intrinsic::log10:
2038 return visitLog(Log&: cast<FPMathOperator>(Val&: I), IID);
2039 case Intrinsic::log2:
2040 // No reason to handle log2.
2041 return false;
2042 case Intrinsic::amdgcn_mbcnt_lo:
2043 return visitMbcntLo(I);
2044 case Intrinsic::amdgcn_mbcnt_hi:
2045 return visitMbcntHi(I);
2046 default:
2047 return false;
2048 }
2049}
2050
2051/// Match non-nan fract pattern.
2052/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2053/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2054/// minimum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2055///
2056/// If fract is a useful instruction for the subtarget. Does not account for the
2057/// nan handling; the instruction has a nan check on the input value.
2058Value *AMDGPUCodeGenPrepareImpl::matchFractPat(Value &V) {
2059 if (ST.hasFractBug())
2060 return nullptr;
2061
2062 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &V);
2063 if (!II)
2064 return nullptr;
2065
2066 Intrinsic::ID IID = II->getIntrinsicID();
2067
2068 // The value is only used in contexts where we know the input isn't a nan, so
2069 // any of the fmin variants are fine.
2070 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
2071 IID != Intrinsic::minimumnum)
2072 return nullptr;
2073
2074 Type *Ty = V.getType();
2075 if (!isLegalFloatingTy(Ty: Ty->getScalarType()))
2076 return nullptr;
2077
2078 Value *Arg0 = II->getArgOperand(i: 0);
2079 Value *Arg1 = II->getArgOperand(i: 1);
2080
2081 const APFloat *C;
2082 if (!match(V: Arg1, P: m_APFloatAllowPoison(Res&: C)))
2083 return nullptr;
2084
2085 APFloat OneNextDown = APFloat::getOne(Sem: C->getSemantics());
2086 OneNextDown.next(nextDown: true);
2087
2088 // Match nextafter(1.0, -1)
2089 if (OneNextDown != *C)
2090 return nullptr;
2091
2092 Value *FloorSrc;
2093 if (match(V: Arg0, P: m_FSub(L: m_Value(V&: FloorSrc),
2094 R: m_Intrinsic<Intrinsic::floor>(Op0: m_Deferred(V: FloorSrc)))))
2095 return FloorSrc;
2096 return nullptr;
2097}
2098
2099Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2100 Value *FractArg) {
2101 SmallVector<Value *, 4> FractVals;
2102 extractValues(Builder, Values&: FractVals, V: FractArg);
2103
2104 SmallVector<Value *, 4> ResultVals(FractVals.size());
2105
2106 Type *Ty = FractArg->getType()->getScalarType();
2107 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2108 ResultVals[I] =
2109 Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_fract, Types: {Ty}, Args: {FractVals[I]});
2110 }
2111
2112 return insertValues(Builder, Ty: FractArg->getType(), Values&: ResultVals);
2113}
2114
2115bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2116 Value *FractArg = matchFractPat(V&: I);
2117 if (!FractArg)
2118 return false;
2119
2120 // Match pattern for fract intrinsic in contexts where the nan check has been
2121 // optimized out (and hope the knowledge the source can't be nan wasn't lost).
2122 if (!I.hasNoNaNs() && !isKnownNeverNaN(V: FractArg, SQ: SQ.getWithInstruction(I: &I)))
2123 return false;
2124
2125 IRBuilder<> Builder(&I);
2126 FastMathFlags FMF = I.getFastMathFlags();
2127 FMF.setNoNaNs();
2128 Builder.setFastMathFlags(FMF);
2129
2130 Value *Fract = applyFractPat(Builder, FractArg);
2131 Fract->takeName(V: &I);
2132 I.replaceAllUsesWith(V: Fract);
2133 DeadVals.push_back(Elt: &I);
2134 return true;
2135}
2136
// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
  // Only f32, or f16 when the subtarget lacks native 16-bit instructions.
  Type *Ty = Sqrt.getType()->getScalarType();
  if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &Sqrt);
  FastMathFlags SqrtFMF = FPOp->getFastMathFlags();

  // We're trying to handle the fast-but-not-that-fast case only. The lowering
  // of fast llvm.sqrt will give the raw instruction anyway.
  if (SqrtFMF.approxFunc())
    return false;

  const float ReqdAccuracy = FPOp->getFPAccuracy();

  // Defer correctly rounded expansion to codegen.
  if (ReqdAccuracy < 1.0f)
    return false;

  Value *SrcVal = Sqrt.getOperand(i_nocapture: 0);
  bool CanTreatAsDAZ = canIgnoreDenormalInput(V: SrcVal, CtxI: &Sqrt);

  // The raw instruction is 1 ulp, but the correction for denormal handling
  // brings it to 2.
  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
    return false;

  // Scalarize, expand each element, then reassemble the vector (no-op for
  // scalars).
  IRBuilder<> Builder(&Sqrt);
  SmallVector<Value *, 4> SrcVals;
  extractValues(Builder, Values&: SrcVals, V: SrcVal);

  SmallVector<Value *, 4> ResultVals(SrcVals.size());
  for (int I = 0, E = SrcVals.size(); I != E; ++I) {
    if (CanTreatAsDAZ)
      ResultVals[I] = Builder.CreateCall(Callee: getSqrtF32(), Args: SrcVals[I]);
    else
      ResultVals[I] = emitSqrtIEEE2ULP(Builder, Src: SrcVals[I], FMF: SqrtFMF);
  }

  Value *NewSqrt = insertValues(Builder, Ty: Sqrt.getType(), Values&: ResultVals);
  NewSqrt->takeName(V: &Sqrt);
  Sqrt.replaceAllUsesWith(V: NewSqrt);
  DeadVals.push_back(Elt: &Sqrt);
  return true;
}
2183
/// Replace log and log10 intrinsic calls based on fpmath metadata.
bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
                                        Intrinsic::ID IID) {
  // Only the f16 case (with native 16-bit instructions) is handled here.
  Type *Ty = Log.getType();
  if (!Ty->getScalarType()->isHalfTy() || !ST.has16BitInsts())
    return false;

  FastMathFlags FMF = Log.getFastMathFlags();

  // Defer fast math cases to codegen.
  if (FMF.approxFunc())
    return false;

  // Limit experimentally determined from OpenCL conformance test (1.79)
  if (Log.getFPAccuracy() < 1.80f)
    return false;

  IRBuilder<> Builder(&cast<CallInst>(Val&: Log));

  // Use the generic intrinsic for convenience in the vector case. Codegen will
  // recognize the denormal handling is not necessary from the fpext.
  // TODO: Move to generic code
  Value *Log2 =
      Builder.CreateUnaryIntrinsic(ID: Intrinsic::log2, V: Log.getOperand(i: 0), FMFSource: FMF);

  // Change of base: log(x) = log2(x) * ln(2);
  //                 log10(x) = log2(x) * (ln(2) / ln(10)).
  double Log2BaseInverted =
      IID == Intrinsic::log10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
  Value *Mul =
      Builder.CreateFMulFMF(L: Log2, R: ConstantFP::get(Ty, V: Log2BaseInverted), FMFSource: FMF);

  Mul->takeName(V: &Log);

  Log.replaceAllUsesWith(V: Mul);
  DeadVals.push_back(Elt: &Log);
  return true;
}
2220
2221bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2222 if (skipFunction(F))
2223 return false;
2224
2225 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2226 if (!TPC)
2227 return false;
2228
2229 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2230 const TargetLibraryInfo *TLI =
2231 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2232 AssumptionCache *AC =
2233 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2234 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2235 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2236 const UniformityInfo &UA =
2237 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2238 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2239}
2240
2241PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2242 FunctionAnalysisManager &FAM) {
2243 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2244 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(IR&: F);
2245 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(IR&: F);
2246 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F);
2247 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(IR&: F);
2248 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2249 if (!Impl.run())
2250 return PreservedAnalyses::all();
2251 PreservedAnalyses PA = PreservedAnalyses::none();
2252 if (!Impl.FlowChanged)
2253 PA.preserveSet<CFGAnalyses>();
2254 return PA;
2255}
2256
// Legacy pass-manager registration, declaring the analyses this pass depends
// on so the pass manager schedules them first.
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)
2264
2265/// Create a workitem.id.x intrinsic call with range metadata.
2266CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
2267 CallInst *Tid = B.CreateIntrinsic(ID: Intrinsic::amdgcn_workitem_id_x, Args: {});
2268 ST.makeLIDRangeMetadata(I: Tid);
2269 return Tid;
2270}
2271
2272/// Replace the instruction with a direct workitem.id.x call.
2273void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
2274 IRBuilder<> B(&I);
2275 CallInst *Tid = createWorkitemIdX(B);
2276 BasicBlock::iterator BI(&I);
2277 ReplaceInstWithValue(BI, V: Tid);
2278}
2279
2280/// Replace the instruction with (workitem.id.x & mask).
2281void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2282 Instruction &I, unsigned WaveSize) const {
2283 IRBuilder<> B(&I);
2284 CallInst *Tid = createWorkitemIdX(B);
2285 Constant *Mask = ConstantInt::get(Ty: Tid->getType(), V: WaveSize - 1);
2286 Value *AndInst = B.CreateAnd(LHS: Tid, RHS: Mask);
2287 BasicBlock::iterator BI(&I);
2288 ReplaceInstWithValue(BI, V: AndInst);
2289}
2290
2291/// Try to optimize mbcnt instruction by replacing with workitem.id.x when
2292/// work group size allows direct computation of lane ID.
2293/// Returns true if optimization was applied, false otherwise.
2294bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
2295 unsigned Wave) const {
2296 std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, Dim: 0);
2297 if (!MaybeX)
2298 return false;
2299
2300 // When work group size == wave_size, each work group contains exactly one
2301 // wave, so the instruction can be replaced with workitem.id.x directly.
2302 if (*MaybeX == Wave) {
2303 replaceWithWorkitemIdX(I);
2304 return true;
2305 }
2306
2307 // When work group evenly splits into waves, compute lane ID within wave
2308 // using bit masking: lane_id = workitem.id.x & (wave_size - 1).
2309 if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/REquiresUniformYZ: true)) {
2310 replaceWithMaskedWorkitemIdX(I, WaveSize: Wave);
2311 return true;
2312 }
2313
2314 return false;
2315}
2316
2317/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.
2318bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
2319 // This optimization only applies to wave32 targets where mbcnt.lo operates on
2320 // the full execution mask.
2321 if (!ST.isWave32())
2322 return false;
2323
2324 // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with
2325 // lower IDs.
2326 if (!match(V: &I,
2327 P: m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(Op0: m_AllOnes(), Op1: m_Zero())))
2328 return false;
2329
2330 return tryReplaceWithWorkitemId(I, Wave: ST.getWavefrontSize());
2331}
2332
2333/// Optimize mbcnt.hi calls for lane ID computation.
2334bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
2335 // Abort if wave size is not known at compile time.
2336 if (!ST.isWaveSizeKnown())
2337 return false;
2338
2339 unsigned Wave = ST.getWavefrontSize();
2340
2341 // On wave32, the upper 32 bits of execution mask are always 0, so
2342 // mbcnt.hi(mask, val) always returns val unchanged.
2343 if (ST.isWave32()) {
2344 if (auto MaybeX = ST.getReqdWorkGroupSize(F, Dim: 0)) {
2345 // Replace mbcnt.hi(mask, val) with val only when work group size matches
2346 // wave size (single wave per work group).
2347 if (*MaybeX == Wave) {
2348 BasicBlock::iterator BI(&I);
2349 ReplaceInstWithValue(BI, V: I.getArgOperand(i: 1));
2350 return true;
2351 }
2352 }
2353 }
2354
2355 // Optimize the complete lane ID computation pattern:
2356 // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs
2357 // across the full execution mask.
2358 using namespace PatternMatch;
2359
2360 // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
2361 if (!match(V: &I, P: m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
2362 Op0: m_AllOnes(), Op1: m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
2363 Op0: m_AllOnes(), Op1: m_Zero()))))
2364 return false;
2365
2366 return tryReplaceWithWorkitemId(I, Wave);
2367}
2368
// Unique address used by the legacy pass manager to identify this pass.
char AMDGPUCodeGenPrepare::ID = 0;

// Factory for the legacy pass object.
FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}
2374