1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
17#include "SIModeRegisterDefaults.h"
18#include "llvm/ADT/SetVector.h"
19#include "llvm/Analysis/AssumptionCache.h"
20#include "llvm/Analysis/ConstantFolding.h"
21#include "llvm/Analysis/TargetLibraryInfo.h"
22#include "llvm/Analysis/TargetTransformInfo.h"
23#include "llvm/Analysis/UniformityAnalysis.h"
24#include "llvm/Analysis/ValueTracking.h"
25#include "llvm/CodeGen/TargetPassConfig.h"
26#include "llvm/IR/Dominators.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/InstVisitor.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include "llvm/IR/PatternMatch.h"
31#include "llvm/IR/ValueHandle.h"
32#include "llvm/InitializePasses.h"
33#include "llvm/Pass.h"
34#include "llvm/Support/KnownBits.h"
35#include "llvm/Support/KnownFPClass.h"
36#include "llvm/Transforms/Utils/BasicBlockUtils.h"
37#include "llvm/Transforms/Utils/IntegerDivision.h"
38#include "llvm/Transforms/Utils/Local.h"
39
40#define DEBUG_TYPE "amdgpu-codegenprepare"
41
42using namespace llvm;
43using namespace llvm::PatternMatch;
44
45namespace {
46
47static cl::opt<bool> WidenLoads(
48 "amdgpu-codegenprepare-widen-constant-loads",
49 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
50 cl::ReallyHidden,
51 cl::init(Val: false));
52
53static cl::opt<bool>
54 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
55 cl::desc("Break large PHI nodes for DAGISel"),
56 cl::ReallyHidden, cl::init(Val: true));
57
58static cl::opt<bool>
59 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
60 cl::desc("For testing purposes, always break large "
61 "PHIs even if it isn't profitable."),
62 cl::ReallyHidden, cl::init(Val: false));
63
64static cl::opt<unsigned> BreakLargePHIsThreshold(
65 "amdgpu-codegenprepare-break-large-phis-threshold",
66 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
67 cl::ReallyHidden, cl::init(Val: 32));
68
69static cl::opt<bool> UseMul24Intrin(
70 "amdgpu-codegenprepare-mul24",
71 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
72 cl::ReallyHidden,
73 cl::init(Val: true));
74
75// Legalize 64-bit division by using the generic IR expansion.
76static cl::opt<bool> ExpandDiv64InIR(
77 "amdgpu-codegenprepare-expand-div64",
78 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
79 cl::ReallyHidden,
80 cl::init(Val: false));
81
82// Leave all division operations as they are. This supersedes ExpandDiv64InIR
83// and is used for testing the legalizer.
84static cl::opt<bool> DisableIDivExpand(
85 "amdgpu-codegenprepare-disable-idiv-expansion",
86 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
87 cl::ReallyHidden,
88 cl::init(Val: false));
89
90// Disable processing of fdiv so we can better test the backend implementations.
91static cl::opt<bool> DisableFDivExpand(
92 "amdgpu-codegenprepare-disable-fdiv-expansion",
93 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
94 cl::ReallyHidden,
95 cl::init(Val: false));
96
97class AMDGPUCodeGenPrepareImpl
98 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
99public:
100 Function &F;
101 const GCNSubtarget &ST;
102 const AMDGPUTargetMachine &TM;
103 const TargetLibraryInfo *TLI;
104 const UniformityInfo &UA;
105 const DataLayout &DL;
106 SimplifyQuery SQ;
107 const bool HasFP32DenormalFlush;
108 bool FlowChanged = false;
109 mutable Function *SqrtF32 = nullptr;
110 mutable Function *LdexpF32 = nullptr;
111 mutable SmallVector<WeakVH> DeadVals;
112
113 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
114
115 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
116 const TargetLibraryInfo *TLI, AssumptionCache *AC,
117 const DominatorTree *DT, const UniformityInfo &UA)
118 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
119 DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
120 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
121 DenormalMode::getPreserveSign()) {}
122
123 Function *getSqrtF32() const {
124 if (SqrtF32)
125 return SqrtF32;
126
127 LLVMContext &Ctx = F.getContext();
128 SqrtF32 = Intrinsic::getOrInsertDeclaration(
129 M: F.getParent(), id: Intrinsic::amdgcn_sqrt, Tys: {Type::getFloatTy(C&: Ctx)});
130 return SqrtF32;
131 }
132
133 Function *getLdexpF32() const {
134 if (LdexpF32)
135 return LdexpF32;
136
137 LLVMContext &Ctx = F.getContext();
138 LdexpF32 = Intrinsic::getOrInsertDeclaration(
139 M: F.getParent(), id: Intrinsic::ldexp,
140 Tys: {Type::getFloatTy(C&: Ctx), Type::getInt32Ty(C&: Ctx)});
141 return LdexpF32;
142 }
143
144 bool canBreakPHINode(const PHINode &I);
145
146 /// Return true if \p T is a legal scalar floating point type.
147 bool isLegalFloatingTy(const Type *T) const;
148
149 /// Wrapper to pass all the arguments to computeKnownFPClass
150 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
151 const Instruction *CtxI) const {
152 return llvm::computeKnownFPClass(V, InterestedClasses: Interested,
153 SQ: SQ.getWithInstruction(I: CtxI));
154 }
155
156 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
157 return HasFP32DenormalFlush ||
158 computeKnownFPClass(V, Interested: fcSubnormal, CtxI).isKnownNeverSubnormal();
159 }
160
  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// an unsigned integer. Truncating to this size and then zero-extending to
  /// the original size will not change the value.
164 unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;
165
  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// a signed integer. Truncating to this size and then sign-extending to
  /// the original size will not change the value.
169 unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;
170
171 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
172 /// SelectionDAG has an issue where an and asserting the bits are known
173 bool replaceMulWithMul24(BinaryOperator &I) const;
174
  /// Perform the same fold as the equivalently named function in DAGCombiner.
  /// Since we expand some divisions here, we need to perform this before the
  /// expansion obscures the pattern.
177 bool foldBinOpIntoSelect(BinaryOperator &I) const;
178
179 bool divHasSpecialOptimization(BinaryOperator &I,
180 Value *Num, Value *Den) const;
181 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
182 unsigned MaxDivBits, bool Signed) const;
183
184 /// Expands 24 bit div or rem.
185 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
186 Value *Num, Value *Den,
187 bool IsDiv, bool IsSigned) const;
188
189 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
190 Value *Num, Value *Den, unsigned NumBits,
191 bool IsDiv, bool IsSigned) const;
192
193 /// Expands 32 bit div or rem.
194 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
195 Value *Num, Value *Den) const;
196
197 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
198 Value *Num, Value *Den) const;
199 void expandDivRem64(BinaryOperator &I) const;
200
  /// Widen a scalar load.
  ///
  /// \details Widen a uniform, small-type load from constant memory to a full
  /// 32 bits and then truncate the result, so a scalar load can be used
  /// instead of a vector load.
  ///
  /// \returns True if \p I is a candidate for this widening.
  bool canWidenScalarExtLoad(LoadInst &I) const;
210
211 Value *matchFractPat(IntrinsicInst &I);
212 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
213
214 bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;
215
216 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
217 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
218 const Instruction *CtxI) const;
219
220 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
221 FastMathFlags FMF, const Instruction *CtxI) const;
222 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
223 float ReqdAccuracy) const;
224
225 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
226 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
227 Value *RsqOp, const Instruction *FDiv,
228 float ReqdAccuracy) const;
229
230 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
231 Value *Src) const;
232
233 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
234 bool IsNegative) const;
235 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
236 FastMathFlags FMF) const;
237 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
238 FastMathFlags FMF) const;
239 Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
240 FastMathFlags DivFMF, const Instruction *CtxI,
241 bool IsNegative) const;
242
243 CallInst *createWorkitemIdX(IRBuilder<> &B) const;
244 void replaceWithWorkitemIdX(Instruction &I) const;
245 void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
246 bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;
247
248 bool tryNarrowMathIfNoOverflow(Instruction *I);
249
250public:
251 bool visitFDiv(BinaryOperator &I);
252
253 bool visitInstruction(Instruction &I) { return false; }
254 bool visitBinaryOperator(BinaryOperator &I);
255 bool visitLoadInst(LoadInst &I);
256 bool visitSelectInst(SelectInst &I);
257 bool visitPHINode(PHINode &I);
258 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
259
260 bool visitIntrinsicInst(IntrinsicInst &I);
261 bool visitFMinLike(IntrinsicInst &I);
262 bool visitSqrt(IntrinsicInst &I);
263 bool visitMbcntLo(IntrinsicInst &I) const;
264 bool visitMbcntHi(IntrinsicInst &I) const;
265 bool run();
266};
267
268class AMDGPUCodeGenPrepare : public FunctionPass {
269public:
270 static char ID;
271 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
272 void getAnalysisUsage(AnalysisUsage &AU) const override {
273 AU.addRequired<AssumptionCacheTracker>();
274 AU.addRequired<UniformityInfoWrapperPass>();
275 AU.addRequired<TargetLibraryInfoWrapperPass>();
276
277 // FIXME: Division expansion needs to preserve the dominator tree.
278 if (!ExpandDiv64InIR)
279 AU.setPreservesAll();
280 }
281 bool runOnFunction(Function &F) override;
282 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
283};
284
285} // end anonymous namespace
286
287bool AMDGPUCodeGenPrepareImpl::run() {
288 BreakPhiNodesCache.clear();
289 bool MadeChange = false;
290
291 // Need to use make_early_inc_range because integer division expansion is
292 // handled by Transform/Utils, and it can delete instructions such as the
293 // terminator of the BB.
294 for (BasicBlock &BB : reverse(C&: F)) {
295 for (Instruction &I : make_early_inc_range(Range: reverse(C&: BB))) {
296 if (!isInstructionTriviallyDead(I: &I, TLI))
297 MadeChange |= visit(I);
298 }
299 }
300
301 while (!DeadVals.empty()) {
302 if (auto *I = dyn_cast_or_null<Instruction>(Val: DeadVals.pop_back_val()))
303 RecursivelyDeleteTriviallyDeadInstructions(V: I, TLI);
304 }
305
306 return MadeChange;
307}
308
309bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
310 return Ty->isFloatTy() || Ty->isDoubleTy() ||
311 (Ty->isHalfTy() && ST.has16BitInsts());
312}
313
314bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
315 Type *Ty = I.getType();
316 int TySize = DL.getTypeSizeInBits(Ty);
317 Align Alignment = DL.getValueOrABITypeAlignment(Alignment: I.getAlign(), Ty);
318
319 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(I: &I);
320}
321
322unsigned
323AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
324 const Instruction *CtxI) const {
325 return computeKnownBits(V: Op, Q: SQ.getWithInstruction(I: CtxI)).countMaxActiveBits();
326}
327
328unsigned
329AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
330 const Instruction *CtxI) const {
331 return ComputeMaxSignificantBits(Op, DL: SQ.DL, AC: SQ.AC, CxtI: CtxI, DT: SQ.DT);
332}
333
334static void extractValues(IRBuilder<> &Builder,
335 SmallVectorImpl<Value *> &Values, Value *V) {
336 auto *VT = dyn_cast<FixedVectorType>(Val: V->getType());
337 if (!VT) {
338 Values.push_back(Elt: V);
339 return;
340 }
341
342 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
343 Values.push_back(Elt: Builder.CreateExtractElement(Vec: V, Idx: I));
344}
345
346static Value *insertValues(IRBuilder<> &Builder,
347 Type *Ty,
348 SmallVectorImpl<Value *> &Values) {
349 if (!Ty->isVectorTy()) {
350 assert(Values.size() == 1);
351 return Values[0];
352 }
353
354 Value *NewVal = PoisonValue::get(T: Ty);
355 for (int I = 0, E = Values.size(); I != E; ++I)
356 NewVal = Builder.CreateInsertElement(Vec: NewVal, NewElt: Values[I], Idx: I);
357
358 return NewVal;
359}
360
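// For illustration (hypothetical operands), a divergent i32 multiply whose
// operands are both known to fit in 24 bits is rewritten roughly as:
//   %r = mul i32 %a, %b
//     ==>
//   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)
// Vector multiplies are scalarized first and the results reinserted afterwards.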
361bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
362 if (I.getOpcode() != Instruction::Mul)
363 return false;
364
365 Type *Ty = I.getType();
366 unsigned Size = Ty->getScalarSizeInBits();
367 if (Size <= 16 && ST.has16BitInsts())
368 return false;
369
370 // Prefer scalar if this could be s_mul_i32
371 if (UA.isUniform(I: &I))
372 return false;
373
374 Value *LHS = I.getOperand(i_nocapture: 0);
375 Value *RHS = I.getOperand(i_nocapture: 1);
376 IRBuilder<> Builder(&I);
377 Builder.SetCurrentDebugLocation(I.getDebugLoc());
378
379 unsigned LHSBits = 0, RHSBits = 0;
380 bool IsSigned = false;
381
382 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(Op: LHS, CtxI: &I)) <= 24 &&
383 (RHSBits = numBitsUnsigned(Op: RHS, CtxI: &I)) <= 24) {
384 IsSigned = false;
385
386 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(Op: LHS, CtxI: &I)) <= 24 &&
387 (RHSBits = numBitsSigned(Op: RHS, CtxI: &I)) <= 24) {
388 IsSigned = true;
389
390 } else
391 return false;
392
393 SmallVector<Value *, 4> LHSVals;
394 SmallVector<Value *, 4> RHSVals;
395 SmallVector<Value *, 4> ResultVals;
396 extractValues(Builder, Values&: LHSVals, V: LHS);
397 extractValues(Builder, Values&: RHSVals, V: RHS);
398
399 IntegerType *I32Ty = Builder.getInt32Ty();
400 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
401 Type *DstTy = LHSVals[0]->getType();
402
403 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
404 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(V: LHSVals[I], DestTy: I32Ty)
405 : Builder.CreateZExtOrTrunc(V: LHSVals[I], DestTy: I32Ty);
406 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(V: RHSVals[I], DestTy: I32Ty)
407 : Builder.CreateZExtOrTrunc(V: RHSVals[I], DestTy: I32Ty);
408 Intrinsic::ID ID =
409 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
410 Value *Result = Builder.CreateIntrinsic(ID, Types: {IntrinTy}, Args: {LHS, RHS});
411 Result = IsSigned ? Builder.CreateSExtOrTrunc(V: Result, DestTy: DstTy)
412 : Builder.CreateZExtOrTrunc(V: Result, DestTy: DstTy);
413 ResultVals.push_back(Elt: Result);
414 }
415
416 Value *NewVal = insertValues(Builder, Ty, Values&: ResultVals);
417 NewVal->takeName(V: &I);
418 I.replaceAllUsesWith(V: NewVal);
419 DeadVals.push_back(Elt: &I);
420
421 return true;
422}
423
// Find a select instruction, which may have been cast. This is mostly to deal
// with cases where i16 selects were promoted to i32 here.
426static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
427 Cast = nullptr;
428 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: V))
429 return Sel;
430
431 if ((Cast = dyn_cast<CastInst>(Val: V))) {
432 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: Cast->getOperand(i_nocapture: 0)))
433 return Sel;
434 }
435
436 return nullptr;
437}
438
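// For example (hypothetical values), a single-use select of constants feeding
// a division whose other operand is constant folds into a select of the
// folded results:
//   %s = select i1 %c, i32 4, i32 8
//   %d = udiv i32 16, %s
//     ==>
//   %d = select i1 %c, i32 4, i32 2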
439bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
440 // Don't do this unless the old select is going away. We want to eliminate the
441 // binary operator, not replace a binop with a select.
442 int SelOpNo = 0;
443
444 CastInst *CastOp;
445
446 // TODO: Should probably try to handle some cases with multiple
447 // users. Duplicating the select may be profitable for division.
448 SelectInst *Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 0), Cast&: CastOp);
449 if (!Sel || !Sel->hasOneUse()) {
450 SelOpNo = 1;
451 Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 1), Cast&: CastOp);
452 }
453
454 if (!Sel || !Sel->hasOneUse())
455 return false;
456
457 Constant *CT = dyn_cast<Constant>(Val: Sel->getTrueValue());
458 Constant *CF = dyn_cast<Constant>(Val: Sel->getFalseValue());
459 Constant *CBO = dyn_cast<Constant>(Val: BO.getOperand(i_nocapture: SelOpNo ^ 1));
460 if (!CBO || !CT || !CF)
461 return false;
462
463 if (CastOp) {
464 if (!CastOp->hasOneUse())
465 return false;
466 CT = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CT, DestTy: BO.getType(), DL);
467 CF = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CF, DestTy: BO.getType(), DL);
468 }
469
470 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
471 // need to handle divisions here.
472 Constant *FoldedT =
473 SelOpNo ? ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CT, DL)
474 : ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CT, RHS: CBO, DL);
475 if (!FoldedT || isa<ConstantExpr>(Val: FoldedT))
476 return false;
477
478 Constant *FoldedF =
479 SelOpNo ? ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CF, DL)
480 : ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CF, RHS: CBO, DL);
481 if (!FoldedF || isa<ConstantExpr>(Val: FoldedF))
482 return false;
483
484 IRBuilder<> Builder(&BO);
485 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
486 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Val: &BO))
487 Builder.setFastMathFlags(FPOp->getFastMathFlags());
488
489 Value *NewSelect = Builder.CreateSelect(C: Sel->getCondition(),
490 True: FoldedT, False: FoldedF);
491 NewSelect->takeName(V: &BO);
492 BO.replaceAllUsesWith(V: NewSelect);
493 DeadVals.push_back(Elt: &BO);
494 if (CastOp)
495 DeadVals.push_back(Elt: CastOp);
496 DeadVals.push_back(Elt: Sel);
497 return true;
498}
499
500std::pair<Value *, Value *>
501AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
502 Value *Src) const {
503 Type *Ty = Src->getType();
504 Value *Frexp = Builder.CreateIntrinsic(ID: Intrinsic::frexp,
505 Types: {Ty, Builder.getInt32Ty()}, Args: Src);
506 Value *FrexpMant = Builder.CreateExtractValue(Agg: Frexp, Idxs: {0});
507
508 // Bypass the bug workaround for the exponent result since it doesn't matter.
509 // TODO: Does the bug workaround even really need to consider the exponent
510 // result? It's unspecified by the spec.
511
512 Value *FrexpExp =
513 ST.hasFractBug()
514 ? Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_frexp_exp,
515 Types: {Builder.getInt32Ty(), Ty}, Args: Src)
516 : Builder.CreateExtractValue(Agg: Frexp, Idxs: {1});
517 return {FrexpMant, FrexpExp};
518}
519
520/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
521Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
522 Value *Src,
523 bool IsNegative) const {
524 // Same as for 1.0, but expand the sign out of the constant.
525 // -1.0 / x -> rcp (fneg x)
526 if (IsNegative)
527 Src = Builder.CreateFNeg(V: Src);
528
529 // The rcp instruction doesn't support denormals, so scale the input
530 // out of the denormal range and convert at the end.
531 //
532 // Expand as 2^-n * (1.0 / (x * 2^n))
533
534 // TODO: Skip scaling if input is known never denormal and the input
535 // range won't underflow to denormal. The hard part is knowing the
536 // result. We need a range check, the result could be denormal for
537 // 0x1p+126 < den <= 0x1p+127.
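  //
  // Illustrative shape of the emitted sequence (frexp gives x = mant * 2^exp
  // with 0.5 <= |mant| < 1, so mant is never denormal):
  //   %mant, %exp = frexp(x)
  //   %rcp        = amdgcn.rcp(%mant)
  //   %res        = ldexp(%rcp, -%exp)   ; 1/x == (1/mant) * 2^-exp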
538 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
539 Value *ScaleFactor = Builder.CreateNeg(V: FrexpExp);
540 Value *Rcp = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: FrexpMant);
541 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Rcp, ScaleFactor});
542}
543
544/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
545Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
546 Value *RHS,
547 FastMathFlags FMF) const {
  // If we have to work around the fract/frexp bug, we're worse off than using
  // the fdiv.fast expansion. The full safe expansion is faster if we have fast
  // FMA.
551 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
552 (!FMF.noNaNs() || !FMF.noInfs()))
553 return nullptr;
554
  // We scale the LHS to avoid a denormal input, and scale the denominator to
  // avoid large values underflowing the result.
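  //
  // With a = ma * 2^ea and b = mb * 2^eb, a/b == (ma * (1/mb)) * 2^(ea - eb);
  // the final ldexp below reconstructs the 2^(ea - eb) factor. (Derivation
  // added for illustration.)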
557 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, Src: RHS);
558
559 Value *Rcp =
560 Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: FrexpMantRHS);
561
562 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, Src: LHS);
563 Value *Mul = Builder.CreateFMul(L: FrexpMantLHS, R: Rcp);
564
565 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
566 // result.
567 Value *ExpDiff = Builder.CreateSub(LHS: FrexpExpLHS, RHS: FrexpExpRHS);
568 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Mul, ExpDiff});
569}
570
571/// Emit a sqrt that handles denormals and is accurate to 2ulp.
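/// Denormal inputs are scaled up by 2^32 before the sqrt; since
/// sqrt(x * 2^32) == sqrt(x) * 2^16, the result is rescaled with
/// ldexp(..., -16). (Derivation added for illustration; the constants match
/// the code below.)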
572Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
573 Value *Src,
574 FastMathFlags FMF) const {
575 Type *Ty = Src->getType();
576 APFloat SmallestNormal =
577 APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
578 Value *NeedScale =
579 Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
580
581 ConstantInt *Zero = Builder.getInt32(C: 0);
582 Value *InputScaleFactor =
583 Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: 32), False: Zero);
584
585 Value *Scaled = Builder.CreateCall(Callee: getLdexpF32(), Args: {Src, InputScaleFactor});
586
587 Value *Sqrt = Builder.CreateCall(Callee: getSqrtF32(), Args: Scaled);
588
589 Value *OutputScaleFactor =
590 Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: -16), False: Zero);
591 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Sqrt, OutputScaleFactor});
592}
593
594/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
595static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
596 bool IsNegative) {
597 // bool need_scale = x < 0x1p-126f;
598 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
599 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
600 // rsq(x * input_scale) * output_scale;
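  //
  // (Derivation, for illustration: rsq(x * 2^24) == rsq(x) * 2^-12, so the
  //  2^+12 output scale restores the unscaled result; the sign is folded into
  //  the output scale for the negated form.)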
601
602 Type *Ty = Src->getType();
603 APFloat SmallestNormal =
604 APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
605 Value *NeedScale =
606 Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
607 Constant *One = ConstantFP::get(Ty, V: 1.0);
608 Constant *InputScale = ConstantFP::get(Ty, V: 0x1.0p+24);
609 Constant *OutputScale =
610 ConstantFP::get(Ty, V: IsNegative ? -0x1.0p+12 : 0x1.0p+12);
611
612 Value *InputScaleFactor = Builder.CreateSelect(C: NeedScale, True: InputScale, False: One);
613
614 Value *ScaledInput = Builder.CreateFMul(L: Src, R: InputScaleFactor);
615 Value *Rsq = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: ScaledInput);
616 Value *OutputScaleFactor = Builder.CreateSelect(
617 C: NeedScale, True: OutputScale, False: IsNegative ? ConstantFP::get(Ty, V: -1.0) : One);
618
619 return Builder.CreateFMul(L: Rsq, R: OutputScaleFactor);
620}
621
622/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
623/// v_rsq_f64. This should give a 1ulp result.
624Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
625 FastMathFlags SqrtFMF,
626 FastMathFlags DivFMF,
627 const Instruction *CtxI,
628 bool IsNegative) const {
629 // rsq(x):
630 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
631 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
632 // return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
633 //
634 // -rsq(x):
635 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
636 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
637 // return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
638 //
639 // The rsq instruction handles the special cases correctly. We need to check
640 // for the edge case conditions to ensure the special case propagates through
641 // the later instructions.
642
643 Value *Y0 = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: X);
644
645 // Try to elide the edge case check.
646 //
647 // Fast math flags imply:
648 // sqrt ninf => !isinf(x)
649 // fdiv ninf => x != 0, !isinf(x)
650 bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
651 bool MaybeZero = !DivFMF.noInfs();
652
653 DenormalMode DenormMode;
654 FPClassTest Interested = fcNone;
655 if (MaybePosInf)
656 Interested = fcPosInf;
657 if (MaybeZero)
658 Interested |= fcZero;
659
660 if (Interested != fcNone) {
661 KnownFPClass KnownSrc = computeKnownFPClass(V: X, Interested, CtxI);
662 if (KnownSrc.isKnownNeverPosInfinity())
663 MaybePosInf = false;
664
665 DenormMode = F.getDenormalMode(FPType: X->getType()->getFltSemantics());
666 if (KnownSrc.isKnownNeverLogicalZero(Mode: DenormMode))
667 MaybeZero = false;
668 }
669
670 Value *SpecialOrRsq = X;
671 if (MaybeZero || MaybePosInf) {
672 Value *Cond;
673 if (MaybePosInf && MaybeZero) {
674 if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
675 FPClassTest TestMask = fcPosInf | fcZero;
676 if (DenormMode.inputsAreZero())
677 TestMask |= fcSubnormal;
678
679 Cond = Builder.createIsFPClass(FPNum: X, Test: TestMask);
680 } else {
681 // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
682 // doesn't respect the floating-point environment.
683 Value *IsZero =
684 Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getZero(Ty: X->getType()));
685 Value *IsInf =
686 Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getInfinity(Ty: X->getType()));
687 Cond = Builder.CreateOr(LHS: IsZero, RHS: IsInf);
688 }
689 } else if (MaybeZero) {
690 Cond = Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getZero(Ty: X->getType()));
691 } else {
692 Cond = Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getInfinity(Ty: X->getType()));
693 }
694
695 SpecialOrRsq = Builder.CreateSelect(C: Cond, True: Y0, False: X);
696 }
697
698 Value *NegY0 = Builder.CreateFNeg(V: Y0);
699 Value *NegXY0 = Builder.CreateFMul(L: SpecialOrRsq, R: NegY0);
700
701 // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
702 Value *E = Builder.CreateFMA(Factor1: NegXY0, Factor2: Y0, Summand: ConstantFP::get(Ty: X->getType(), V: 1.0));
703
704 Value *Y0E = Builder.CreateFMul(L: E, R: IsNegative ? NegY0 : Y0);
705
706 Value *EFMA = Builder.CreateFMA(Factor1: E, Factor2: ConstantFP::get(Ty: X->getType(), V: 0.375),
707 Summand: ConstantFP::get(Ty: X->getType(), V: 0.5));
708
709 return Builder.CreateFMA(Factor1: Y0E, Factor2: EFMA, Summand: IsNegative ? NegY0 : Y0);
710}
711
712bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
713 FastMathFlags SqrtFMF) const {
714 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
715 // f64.
716 return DivFMF.allowContract() && SqrtFMF.allowContract();
717}
718
719Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
720 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
721 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
722 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
723 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
724
725 // rsq_f16 is accurate to 0.51 ulp.
726 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
727 // rsq_f64 is never accurate.
728 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num);
729 if (!CLHS)
730 return nullptr;
731
732 bool IsNegative = false;
733
734 // TODO: Handle other numerator values with arcp.
735 if (CLHS->isExactlyValue(V: 1.0) || (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
736 // Add in the sqrt flags.
737 IRBuilder<>::FastMathFlagGuard Guard(Builder);
738 Builder.setFastMathFlags(DivFMF | SqrtFMF);
739
740 if (Den->getType()->isFloatTy()) {
741 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
742 canIgnoreDenormalInput(V: Den, CtxI)) {
743 Value *Result =
744 Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: Den);
745 // -1.0 / sqrt(x) -> fneg(rsq(x))
746 return IsNegative ? Builder.CreateFNeg(V: Result) : Result;
747 }
748
749 return emitRsqIEEE1ULP(Builder, Src: Den, IsNegative);
750 }
751
752 if (Den->getType()->isDoubleTy())
753 return emitRsqF64(Builder, X: Den, SqrtFMF, DivFMF, CtxI, IsNegative);
754 }
755
756 return nullptr;
757}
758
759// Optimize fdiv with rcp:
760//
761// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
762// allowed with afn.
763//
// a/b -> a*rcp(b) when arcp is allowed; we only need to provide 1.0 ULP
// accuracy.
765Value *
766AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
767 Value *Den, FastMathFlags FMF,
768 const Instruction *CtxI) const {
769 // rcp_f16 is accurate to 0.51 ulp.
770 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
771 // rcp_f64 is never accurate.
772 assert(Den->getType()->isFloatTy());
773
774 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num)) {
775 bool IsNegative = false;
776 if (CLHS->isExactlyValue(V: 1.0) ||
777 (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
778 Value *Src = Den;
779
780 if (HasFP32DenormalFlush || FMF.approxFunc()) {
781 // -1.0 / x -> 1.0 / fneg(x)
782 if (IsNegative)
783 Src = Builder.CreateFNeg(V: Src);
784
        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation have a worst-case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
        // to use them as long as we aren't trying to use denormals.
789 //
790 // v_rcp_f16 and v_rsq_f16 DO support denormals.
791
792 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
793 // insert rsq intrinsic here.
794
795 // 1.0 / x -> rcp(x)
796 return Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: Src);
797 }
798
799 // TODO: If the input isn't denormal, and we know the input exponent isn't
800 // big enough to introduce a denormal we can avoid the scaling.
801 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
802 }
803 }
804
805 if (FMF.allowReciprocal()) {
806 // x / y -> x * (1.0 / y)
807
808 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
809 // will never underflow.
810 if (HasFP32DenormalFlush || FMF.approxFunc()) {
811 Value *Recip = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: Den);
812 return Builder.CreateFMul(L: Num, R: Recip);
813 }
814
815 Value *Recip = emitRcpIEEE1ULP(Builder, Src: Den, IsNegative: false);
816 return Builder.CreateFMul(L: Num, R: Recip);
817 }
818
819 return nullptr;
820}
821
822// optimize with fdiv.fast:
823//
824// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
825//
826// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
827//
828// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
829Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
830 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
831 // fdiv.fast can achieve 2.5 ULP accuracy.
832 if (ReqdAccuracy < 2.5f)
833 return nullptr;
834
835 // Only have fdiv.fast for f32.
836 assert(Den->getType()->isFloatTy());
837
838 bool NumIsOne = false;
839 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Val: Num)) {
840 if (CNum->isExactlyValue(V: +1.0) || CNum->isExactlyValue(V: -1.0))
841 NumIsOne = true;
842 }
843
  // fdiv.fast does not support denormals, but it is always fine to use for
  // 1.0/x.
845 //
846 // TODO: This works for any value with a specific known exponent range, don't
847 // just limit to constant 1.
848 if (!HasFP32DenormalFlush && !NumIsOne)
849 return nullptr;
850
851 return Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_fdiv_fast, Args: {Num, Den});
852}
853
854Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
855 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
856 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
857 float ReqdDivAccuracy) const {
858 if (RsqOp) {
859 Value *Rsq =
860 optimizeWithRsq(Builder, Num, Den: RsqOp, DivFMF, SqrtFMF, CtxI: FDivInst);
861 if (Rsq)
862 return Rsq;
863 }
864
865 if (!Num->getType()->isFloatTy())
866 return nullptr;
867
868 Value *Rcp = optimizeWithRcp(Builder, Num, Den, FMF: DivFMF, CtxI: FDivInst);
869 if (Rcp)
870 return Rcp;
871
872 // In the basic case fdiv_fast has the same instruction count as the frexp div
873 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
874 // potentially be fused into a user. Also, materialization of the constants
875 // can be reused for multiple instances.
876 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdAccuracy: ReqdDivAccuracy);
877 if (FDivFast)
878 return FDivFast;
879
880 return emitFrexpDiv(Builder, LHS: Num, RHS: Den, FMF: DivFMF);
881}
882
// Optimization is performed based on fpmath, fast math flags, and denormal
// handling to optimize fdiv with either rcp or fdiv.fast.
885//
886// With rcp:
887// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
888// allowed with afn.
889//
890// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
891//
892// With fdiv.fast:
893// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
894//
895// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
896//
897// NOTE: rcp is the preference in cases that both are legal.
898bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
899 if (DisableFDivExpand)
900 return false;
901
902 Type *Ty = FDiv.getType()->getScalarType();
903 const bool IsFloat = Ty->isFloatTy();
904 if (!IsFloat && !Ty->isDoubleTy())
905 return false;
906
907 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
908 // expansion around them in codegen. f16 is good enough to always use.
909
910 const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &FDiv);
911 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
912 const float ReqdAccuracy = FPOp->getFPAccuracy();
913
914 FastMathFlags SqrtFMF;
915
916 Value *Num = FDiv.getOperand(i_nocapture: 0);
917 Value *Den = FDiv.getOperand(i_nocapture: 1);
918
919 Value *RsqOp = nullptr;
920 auto *DenII = dyn_cast<IntrinsicInst>(Val: Den);
921 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
922 DenII->hasOneUse()) {
923 const auto *SqrtOp = cast<FPMathOperator>(Val: DenII);
924 SqrtFMF = SqrtOp->getFastMathFlags();
925 if (canOptimizeWithRsq(DivFMF, SqrtFMF))
926 RsqOp = SqrtOp->getOperand(i: 0);
927 }
928
929 // rcp path not yet implemented for f64.
930 if (!IsFloat && !RsqOp)
931 return false;
932
933 // Inaccurate rcp is allowed with afn.
934 //
935 // Defer to codegen to handle this.
936 //
937 // TODO: Decide on an interpretation for interactions between afn + arcp +
938 // !fpmath, and make it consistent between here and codegen. For now, defer
939 // expansion of afn to codegen. The current interpretation is so aggressive we
940 // don't need any pre-consideration here when we have better information. A
941 // more conservative interpretation could use handling here.
942 const bool AllowInaccurateRcp = DivFMF.approxFunc();
943 if (!RsqOp && AllowInaccurateRcp)
944 return false;
945
946 // Defer the correct implementations to codegen.
947 if (IsFloat && ReqdAccuracy < 1.0f)
948 return false;
949
950 IRBuilder<> Builder(FDiv.getParent(), std::next(x: FDiv.getIterator()));
951 Builder.setFastMathFlags(DivFMF);
952 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
953
954 SmallVector<Value *, 4> NumVals;
955 SmallVector<Value *, 4> DenVals;
956 SmallVector<Value *, 4> RsqDenVals;
957 extractValues(Builder, Values&: NumVals, V: Num);
958 extractValues(Builder, Values&: DenVals, V: Den);
959
960 if (RsqOp)
961 extractValues(Builder, Values&: RsqDenVals, V: RsqOp);
962
963 SmallVector<Value *, 4> ResultVals(NumVals.size());
964 for (int I = 0, E = NumVals.size(); I != E; ++I) {
965 Value *NumElt = NumVals[I];
966 Value *DenElt = DenVals[I];
967 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
968
969 Value *NewElt =
970 visitFDivElement(Builder, Num: NumElt, Den: DenElt, DivFMF, SqrtFMF, RsqOp: RsqDenElt,
971 FDivInst: cast<Instruction>(Val: FPOp), ReqdDivAccuracy: ReqdAccuracy);
972 if (!NewElt) {
973 // Keep the original, but scalarized.
974
975 // This has the unfortunate side effect of sometimes scalarizing when
976 // we're not going to do anything.
977 NewElt = Builder.CreateFDiv(L: NumElt, R: DenElt);
978 if (auto *NewEltInst = dyn_cast<Instruction>(Val: NewElt))
979 NewEltInst->copyMetadata(SrcInst: FDiv);
980 }
981
982 ResultVals[I] = NewElt;
983 }
984
985 Value *NewVal = insertValues(Builder, Ty: FDiv.getType(), Values&: ResultVals);
986
987 if (NewVal) {
988 FDiv.replaceAllUsesWith(V: NewVal);
989 NewVal->takeName(V: &FDiv);
990 DeadVals.push_back(Elt: &FDiv);
991 }
992
993 return true;
994}
995
996static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
997 Value *LHS, Value *RHS) {
998 Type *I32Ty = Builder.getInt32Ty();
999 Type *I64Ty = Builder.getInt64Ty();
1000
1001 Value *LHS_EXT64 = Builder.CreateZExt(V: LHS, DestTy: I64Ty);
1002 Value *RHS_EXT64 = Builder.CreateZExt(V: RHS, DestTy: I64Ty);
1003 Value *MUL64 = Builder.CreateMul(LHS: LHS_EXT64, RHS: RHS_EXT64);
1004 Value *Lo = Builder.CreateTrunc(V: MUL64, DestTy: I32Ty);
1005 Value *Hi = Builder.CreateLShr(LHS: MUL64, RHS: Builder.getInt64(C: 32));
1006 Hi = Builder.CreateTrunc(V: Hi, DestTy: I32Ty);
1007 return std::pair(Lo, Hi);
1008}
1009
1010static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1011 return getMul64(Builder, LHS, RHS).second;
1012}
1013
1014/// Figure out how many bits are really needed for this division.
1015/// \p MaxDivBits is an optimization hint to bypass the second
1016/// ComputeNumSignBits/computeKnownBits call if the first one is
1017/// insufficient.
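/// For example (illustrative numbers), an i64 sdiv whose operands each have at
/// least 41 known sign bits needs only 64 - 41 + 1 = 24 bits, so it qualifies
/// for the 24-bit expansion.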
1018unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1019 Value *Den,
1020 unsigned MaxDivBits,
1021 bool IsSigned) const {
1022 assert(Num->getType()->getScalarSizeInBits() ==
1023 Den->getType()->getScalarSizeInBits());
1024 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1025 if (IsSigned) {
1026 unsigned RHSSignBits = ComputeNumSignBits(Op: Den, DL: SQ.DL, AC: SQ.AC, CxtI: &I, DT: SQ.DT);
1027 // A sign bit needs to be reserved for shrinking.
1028 unsigned DivBits = SSBits - RHSSignBits + 1;
1029 if (DivBits > MaxDivBits)
1030 return SSBits;
1031
1032 unsigned LHSSignBits = ComputeNumSignBits(Op: Num, DL: SQ.DL, AC: SQ.AC, CxtI: &I);
1033
1034 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
1035 DivBits = SSBits - SignBits + 1;
1036 return DivBits;
1037 }
1038
1039 // All bits are used for unsigned division for Num or Den in range
1040 // (SignedMax, UnsignedMax].
1041 KnownBits Known = computeKnownBits(V: Den, Q: SQ.getWithInstruction(I: &I));
1042 if (Known.isNegative() || !Known.isNonNegative())
1043 return SSBits;
1044 unsigned RHSSignBits = Known.countMinLeadingZeros();
1045 unsigned DivBits = SSBits - RHSSignBits;
1046 if (DivBits > MaxDivBits)
1047 return SSBits;
1048
1049 Known = computeKnownBits(V: Num, Q: SQ.getWithInstruction(I: &I));
1050 if (Known.isNegative() || !Known.isNonNegative())
1051 return SSBits;
1052 unsigned LHSSignBits = Known.countMinLeadingZeros();
1053
1054 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
1055 DivBits = SSBits - SignBits;
1056 return DivBits;
1057}
1058
1059// The fractional part of a float is enough to accurately represent up to
1060// a 24-bit signed integer.
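// (An IEEE-754 binary32 value has a 24-bit significand including the implicit
// bit, so such integers are represented exactly.)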
1061Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1062 BinaryOperator &I, Value *Num,
1063 Value *Den, bool IsDiv,
1064 bool IsSigned) const {
1065 unsigned DivBits = getDivNumBits(I, Num, Den, MaxDivBits: 24, IsSigned);
1066 if (DivBits > 24)
1067 return nullptr;
1068 return expandDivRem24Impl(Builder, I, Num, Den, NumBits: DivBits, IsDiv, IsSigned);
1069}
1070
1071Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1072 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1073 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1074 Type *I32Ty = Builder.getInt32Ty();
1075 Num = Builder.CreateTrunc(V: Num, DestTy: I32Ty);
1076 Den = Builder.CreateTrunc(V: Den, DestTy: I32Ty);
1077
1078 Type *F32Ty = Builder.getFloatTy();
1079 ConstantInt *One = Builder.getInt32(C: 1);
1080 Value *JQ = One;
1081
1082 if (IsSigned) {
1083 // char|short jq = ia ^ ib;
1084 JQ = Builder.CreateXor(LHS: Num, RHS: Den);
1085
1086 // jq = jq >> (bitsize - 2)
1087 JQ = Builder.CreateAShr(LHS: JQ, RHS: Builder.getInt32(C: 30));
1088
1089 // jq = jq | 0x1
1090 JQ = Builder.CreateOr(LHS: JQ, RHS: One);
1091 }
1092
1093 // int ia = (int)LHS;
1094 Value *IA = Num;
1095
  // int ib = (int)RHS;
1097 Value *IB = Den;
1098
1099 // float fa = (float)ia;
1100 Value *FA = IsSigned ? Builder.CreateSIToFP(V: IA, DestTy: F32Ty)
1101 : Builder.CreateUIToFP(V: IA, DestTy: F32Ty);
1102
1103 // float fb = (float)ib;
1104 Value *FB = IsSigned ? Builder.CreateSIToFP(V: IB,DestTy: F32Ty)
1105 : Builder.CreateUIToFP(V: IB,DestTy: F32Ty);
1106
1107 Value *RCP = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_rcp,
1108 Types: Builder.getFloatTy(), Args: {FB});
1109 Value *FQM = Builder.CreateFMul(L: FA, R: RCP);
1110
1111 // fq = trunc(fqm);
1112 CallInst *FQ = Builder.CreateUnaryIntrinsic(ID: Intrinsic::trunc, V: FQM);
1113 FQ->copyFastMathFlags(FMF: Builder.getFastMathFlags());
1114
1115 // float fqneg = -fq;
1116 Value *FQNeg = Builder.CreateFNeg(V: FQ);
1117
1118 // float fr = mad(fqneg, fb, fa);
1119 auto FMAD = !ST.hasMadMacF32Insts()
1120 ? Intrinsic::fma
1121 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1122 Value *FR = Builder.CreateIntrinsic(ID: FMAD,
1123 Types: {FQNeg->getType()}, Args: {FQNeg, FB, FA}, FMFSource: FQ);
1124
1125 // int iq = (int)fq;
1126 Value *IQ = IsSigned ? Builder.CreateFPToSI(V: FQ, DestTy: I32Ty)
1127 : Builder.CreateFPToUI(V: FQ, DestTy: I32Ty);
1128
1129 // fr = fabs(fr);
1130 FR = Builder.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: FR, FMFSource: FQ);
1131
1132 // fb = fabs(fb);
1133 FB = Builder.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: FB, FMFSource: FQ);
1134
1135 // int cv = fr >= fb;
1136 Value *CV = Builder.CreateFCmpOGE(LHS: FR, RHS: FB);
1137
1138 // jq = (cv ? jq : 0);
1139 JQ = Builder.CreateSelect(C: CV, True: JQ, False: Builder.getInt32(C: 0));
1140
1141 // dst = iq + jq;
1142 Value *Div = Builder.CreateAdd(LHS: IQ, RHS: JQ);
1143
1144 Value *Res = Div;
1145 if (!IsDiv) {
1146 // Rem needs compensation, it's easier to recompute it
1147 Value *Rem = Builder.CreateMul(LHS: Div, RHS: Den);
1148 Res = Builder.CreateSub(LHS: Num, RHS: Rem);
1149 }
1150
1151 if (DivBits != 0 && DivBits < 32) {
1152 // Extend in register from the number of bits this divide really is.
1153 if (IsSigned) {
1154 int InRegBits = 32 - DivBits;
1155
1156 Res = Builder.CreateShl(LHS: Res, RHS: InRegBits);
1157 Res = Builder.CreateAShr(LHS: Res, RHS: InRegBits);
1158 } else {
1159 ConstantInt *TruncMask
1160 = Builder.getInt32(C: (UINT64_C(1) << DivBits) - 1);
1161 Res = Builder.CreateAnd(LHS: Res, RHS: TruncMask);
1162 }
1163 }
1164
1165 return Res;
1166}
1167
// Try to recognize special cases for which the DAG will emit special, better
// expansions than the general expansion we do here.
1170
1171// TODO: It would be better to just directly handle those optimizations here.
1172bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1173 Value *Num,
1174 Value *Den) const {
1175 if (Constant *C = dyn_cast<Constant>(Val: Den)) {
1176 // Arbitrary constants get a better expansion as long as a wider mulhi is
1177 // legal.
1178 if (C->getType()->getScalarSizeInBits() <= 32)
1179 return true;
1180
1181 // TODO: Sdiv check for not exact for some reason.
1182
1183 // If there's no wider mulhi, there's only a better expansion for powers of
1184 // two.
1185 // TODO: Should really know for each vector element.
1186 if (isKnownToBeAPowerOfTwo(V: C, OrZero: true, Q: SQ.getWithInstruction(I: &I)))
1187 return true;
1188
1189 return false;
1190 }
1191
1192 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Val: Den)) {
1193 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1194 if (BinOpDen->getOpcode() == Instruction::Shl &&
1195 isa<Constant>(Val: BinOpDen->getOperand(i_nocapture: 0)) &&
1196 isKnownToBeAPowerOfTwo(V: BinOpDen->getOperand(i_nocapture: 0), OrZero: true,
1197 Q: SQ.getWithInstruction(I: &I))) {
1198 return true;
1199 }
1200 }
1201
1202 return false;
1203}
1204
1205static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1206 // Check whether the sign can be determined statically.
1207 KnownBits Known = computeKnownBits(V, DL);
1208 if (Known.isNegative())
1209 return Constant::getAllOnesValue(Ty: V->getType());
1210 if (Known.isNonNegative())
1211 return Constant::getNullValue(Ty: V->getType());
1212 return Builder.CreateAShr(LHS: V, RHS: Builder.getInt32(C: 31));
1213}
1214
1215Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1216 BinaryOperator &I, Value *X,
1217 Value *Y) const {
1218 Instruction::BinaryOps Opc = I.getOpcode();
1219 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1220 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1221
1222 FastMathFlags FMF;
1223 FMF.setFast();
1224 Builder.setFastMathFlags(FMF);
1225
1226 if (divHasSpecialOptimization(I, Num: X, Den: Y))
1227 return nullptr; // Keep it for later optimization.
1228
1229 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1230 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1231
1232 Type *Ty = X->getType();
1233 Type *I32Ty = Builder.getInt32Ty();
1234 Type *F32Ty = Builder.getFloatTy();
1235
1236 if (Ty->getScalarSizeInBits() != 32) {
1237 if (IsSigned) {
1238 X = Builder.CreateSExtOrTrunc(V: X, DestTy: I32Ty);
1239 Y = Builder.CreateSExtOrTrunc(V: Y, DestTy: I32Ty);
1240 } else {
1241 X = Builder.CreateZExtOrTrunc(V: X, DestTy: I32Ty);
1242 Y = Builder.CreateZExtOrTrunc(V: Y, DestTy: I32Ty);
1243 }
1244 }
1245
1246 if (Value *Res = expandDivRem24(Builder, I, Num: X, Den: Y, IsDiv, IsSigned)) {
1247 return IsSigned ? Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty) :
1248 Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
1249 }
1250
1251 ConstantInt *Zero = Builder.getInt32(C: 0);
1252 ConstantInt *One = Builder.getInt32(C: 1);
1253
1254 Value *Sign = nullptr;
1255 if (IsSigned) {
1256 Value *SignX = getSign32(V: X, Builder, DL);
1257 Value *SignY = getSign32(V: Y, Builder, DL);
1258 // Remainder sign is the same as LHS
1259 Sign = IsDiv ? Builder.CreateXor(LHS: SignX, RHS: SignY) : SignX;
1260
1261 X = Builder.CreateAdd(LHS: X, RHS: SignX);
1262 Y = Builder.CreateAdd(LHS: Y, RHS: SignY);
1263
1264 X = Builder.CreateXor(LHS: X, RHS: SignX);
1265 Y = Builder.CreateXor(LHS: Y, RHS: SignY);
1266 }
1267
1268 // The algorithm here is based on ideas from "Software Integer Division", Tom
1269 // Rodeheffer, August 2008.
1270 //
1271 // unsigned udiv(unsigned x, unsigned y) {
1272 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1273 // // that this is a lower bound on inv(y), even if some of the calculations
1274 // // round up.
1275 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1276 //
1277 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1278 // // Empirically this is guaranteed to give a "two-y" lower bound on
1279 // // inv(y).
1280 // z += umulh(z, -y * z);
1281 //
1282 // // Quotient/remainder estimate.
1283 // unsigned q = umulh(x, z);
1284 // unsigned r = x - q * y;
1285 //
1286 // // Two rounds of quotient/remainder refinement.
1287 // if (r >= y) {
1288 // ++q;
1289 // r -= y;
1290 // }
1291 // if (r >= y) {
1292 // ++q;
1293 // r -= y;
1294 // }
1295 //
1296 // return q;
1297 // }
1298
1299 // Initial estimate of inv(y).
1300 Value *FloatY = Builder.CreateUIToFP(V: Y, DestTy: F32Ty);
1301 Value *RcpY = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_rcp, Types: F32Ty, Args: {FloatY});
1302 Constant *Scale = ConstantFP::get(Ty: F32Ty, V: llvm::bit_cast<float>(from: 0x4F7FFFFE));
1303 Value *ScaledY = Builder.CreateFMul(L: RcpY, R: Scale);
1304 Value *Z = Builder.CreateFPToUI(V: ScaledY, DestTy: I32Ty);
1305
1306 // One round of UNR.
1307 Value *NegY = Builder.CreateSub(LHS: Zero, RHS: Y);
1308 Value *NegYZ = Builder.CreateMul(LHS: NegY, RHS: Z);
1309 Z = Builder.CreateAdd(LHS: Z, RHS: getMulHu(Builder, LHS: Z, RHS: NegYZ));
1310
1311 // Quotient/remainder estimate.
1312 Value *Q = getMulHu(Builder, LHS: X, RHS: Z);
1313 Value *R = Builder.CreateSub(LHS: X, RHS: Builder.CreateMul(LHS: Q, RHS: Y));
1314
1315 // First quotient/remainder refinement.
1316 Value *Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
1317 if (IsDiv)
1318 Q = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
1319 R = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);
1320
1321 // Second quotient/remainder refinement.
1322 Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
1323 Value *Res;
1324 if (IsDiv)
1325 Res = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
1326 else
1327 Res = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);
1328
1329 if (IsSigned) {
1330 Res = Builder.CreateXor(LHS: Res, RHS: Sign);
1331 Res = Builder.CreateSub(LHS: Res, RHS: Sign);
1332 Res = Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty);
1333 } else {
1334 Res = Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
1335 }
1336 return Res;
1337}
1338
1339Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1340 BinaryOperator &I, Value *Num,
1341 Value *Den) const {
1342 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1343 return nullptr; // Keep it for later optimization.
1344
1345 Instruction::BinaryOps Opc = I.getOpcode();
1346
1347 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1348 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1349
1350 unsigned NumDivBits = getDivNumBits(I, Num, Den, MaxDivBits: 32, IsSigned);
1351 if (NumDivBits > 32)
1352 return nullptr;
1353
1354 Value *Narrowed = nullptr;
1355 if (NumDivBits <= 24) {
1356 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, DivBits: NumDivBits,
1357 IsDiv, IsSigned);
1358 } else if (NumDivBits <= 32) {
1359 Narrowed = expandDivRem32(Builder, I, X: Num, Y: Den);
1360 }
1361
1362 if (Narrowed) {
1363 return IsSigned ? Builder.CreateSExt(V: Narrowed, DestTy: Num->getType()) :
1364 Builder.CreateZExt(V: Narrowed, DestTy: Num->getType());
1365 }
1366
1367 return nullptr;
1368}
1369
1370void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1371 Instruction::BinaryOps Opc = I.getOpcode();
1372 // Do the general expansion.
1373 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1374 expandDivisionUpTo64Bits(Div: &I);
1375 return;
1376 }
1377
1378 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1379 expandRemainderUpTo64Bits(Rem: &I);
1380 return;
1381 }
1382
1383 llvm_unreachable("not a division");
1384}
1385
1386/*
1387This will cause non-byte load in consistency, for example:
1388```
1389 %load = load i1, ptr addrspace(4) %arg, align 4
1390 %zext = zext i1 %load to
1391 i64 %add = add i64 %zext
1392```
1393Instead of creating `s_and_b32 s0, s0, 1`,
1394it will create `s_and_b32 s0, s0, 0xff`.
1395We accept this change since the non-byte load assumes the upper bits
1396within the byte are all 0.
1397*/
1398bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1399 unsigned Opc = I->getOpcode();
1400 Type *OldType = I->getType();
1401
1402 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1403 return false;
1404
1405 unsigned OrigBit = OldType->getScalarSizeInBits();
1406
1407 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1408 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1409 "Instruction::Mul.");
1410
1411 unsigned MaxBitsNeeded = computeKnownBits(V: I, DL).countMaxActiveBits();
1412
1413 MaxBitsNeeded = std::max<unsigned>(a: bit_ceil(Value: MaxBitsNeeded), b: 8);
1414 Type *NewType = DL.getSmallestLegalIntType(C&: I->getContext(), Width: MaxBitsNeeded);
1415 if (!NewType)
1416 return false;
1417 unsigned NewBit = NewType->getIntegerBitWidth();
1418 if (NewBit >= OrigBit)
1419 return false;
1420 NewType = I->getType()->getWithNewBitWidth(NewBitWidth: NewBit);
1421
1422 // Old cost
1423 const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
1424 InstructionCost OldCost =
1425 TTI.getArithmeticInstrCost(Opcode: Opc, Ty: OldType, CostKind: TTI::TCK_RecipThroughput);
1426 // New cost of new op
1427 InstructionCost NewCost =
1428 TTI.getArithmeticInstrCost(Opcode: Opc, Ty: NewType, CostKind: TTI::TCK_RecipThroughput);
1429 // New cost of narrowing 2 operands (use trunc)
1430 int NumOfNonConstOps = 2;
1431 if (isa<Constant>(Val: I->getOperand(i: 0)) || isa<Constant>(Val: I->getOperand(i: 1))) {
    // They cannot both be constant; that case would already have been folded.
1433 NumOfNonConstOps = 1;
1434 }
1435 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Opcode: Instruction::Trunc,
1436 Dst: NewType, Src: OldType,
1437 CCH: TTI.getCastContextHint(I),
1438 CostKind: TTI::TCK_RecipThroughput);
1439 // New cost of zext narrowed result to original type
1440 NewCost +=
1441 TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: OldType, Src: NewType,
1442 CCH: TTI.getCastContextHint(I), CostKind: TTI::TCK_RecipThroughput);
1443 if (NewCost >= OldCost)
1444 return false;
1445
1446 IRBuilder<> Builder(I);
1447 Value *Trunc0 = Builder.CreateTrunc(V: I->getOperand(i: 0), DestTy: NewType);
1448 Value *Trunc1 = Builder.CreateTrunc(V: I->getOperand(i: 1), DestTy: NewType);
1449 Value *Arith =
1450 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opc, LHS: Trunc0, RHS: Trunc1);
1451
1452 Value *Zext = Builder.CreateZExt(V: Arith, DestTy: OldType);
1453 I->replaceAllUsesWith(V: Zext);
1454 DeadVals.push_back(Elt: I);
1455 return true;
1456}
1457
1458bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1459 if (foldBinOpIntoSelect(BO&: I))
1460 return true;
1461
1462 if (UseMul24Intrin && replaceMulWithMul24(I))
1463 return true;
1464 if (tryNarrowMathIfNoOverflow(I: &I))
1465 return true;
1466
1467 bool Changed = false;
1468 Instruction::BinaryOps Opc = I.getOpcode();
1469 Type *Ty = I.getType();
1470 Value *NewDiv = nullptr;
1471 unsigned ScalarSize = Ty->getScalarSizeInBits();
1472
1473 SmallVector<BinaryOperator *, 8> Div64ToExpand;
1474
1475 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1476 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1477 ScalarSize <= 64 &&
1478 !DisableIDivExpand) {
1479 Value *Num = I.getOperand(i_nocapture: 0);
1480 Value *Den = I.getOperand(i_nocapture: 1);
1481 IRBuilder<> Builder(&I);
1482 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1483
1484 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1485 NewDiv = PoisonValue::get(T: VT);
1486
1487 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1488 Value *NumEltN = Builder.CreateExtractElement(Vec: Num, Idx: N);
1489 Value *DenEltN = Builder.CreateExtractElement(Vec: Den, Idx: N);
1490
1491 Value *NewElt;
1492 if (ScalarSize <= 32) {
1493 NewElt = expandDivRem32(Builder, I, X: NumEltN, Y: DenEltN);
1494 if (!NewElt)
1495 NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
1496 } else {
1497 // See if this 64-bit division can be shrunk to 32/24-bits before
1498 // producing the general expansion.
1499 NewElt = shrinkDivRem64(Builder, I, Num: NumEltN, Den: DenEltN);
1500 if (!NewElt) {
1501 // The general 64-bit expansion introduces control flow and doesn't
1502 // return the new value. Just insert a scalar copy and defer
1503 // expanding it.
1504 NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
1505 // CreateBinOp does constant folding. If the operands are constant,
1506 // it will return a Constant instead of a BinaryOperator.
1507 if (auto *NewEltBO = dyn_cast<BinaryOperator>(Val: NewElt))
1508 Div64ToExpand.push_back(Elt: NewEltBO);
1509 }
1510 }
1511
1512 if (auto *NewEltI = dyn_cast<Instruction>(Val: NewElt))
1513 NewEltI->copyIRFlags(V: &I);
1514
1515 NewDiv = Builder.CreateInsertElement(Vec: NewDiv, NewElt, Idx: N);
1516 }
1517 } else {
1518 if (ScalarSize <= 32)
1519 NewDiv = expandDivRem32(Builder, I, X: Num, Y: Den);
1520 else {
1521 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1522 if (!NewDiv)
1523 Div64ToExpand.push_back(Elt: &I);
1524 }
1525 }
1526
1527 if (NewDiv) {
1528 I.replaceAllUsesWith(V: NewDiv);
1529 DeadVals.push_back(Elt: &I);
1530 Changed = true;
1531 }
1532 }
1533
1534 if (ExpandDiv64InIR) {
1535 // TODO: We get much worse code in specially handled constant cases.
1536 for (BinaryOperator *Div : Div64ToExpand) {
1537 expandDivRem64(I&: *Div);
1538 FlowChanged = true;
1539 Changed = true;
1540 }
1541 }
1542
1543 return Changed;
1544}
1545
1546bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1547 if (!WidenLoads)
1548 return false;
1549
1550 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1551 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1552 canWidenScalarExtLoad(I)) {
1553 IRBuilder<> Builder(&I);
1554 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1555
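    // Widen the sub-dword constant-address-space load to a full 32-bit load,
    // then truncate/bitcast back to the original type. Illustrative IR:
    //   %v = load i16, ptr addrspace(4) %p
    // becomes
    //   %w = load i32, ptr addrspace(4) %p
    //   %v = trunc i32 %w to i16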
1556 Type *I32Ty = Builder.getInt32Ty();
1557 LoadInst *WidenLoad = Builder.CreateLoad(Ty: I32Ty, Ptr: I.getPointerOperand());
1558 WidenLoad->copyMetadata(SrcInst: I);
1559
1560 // If we have range metadata, we need to convert the type, and not make
1561 // assumptions about the high bits.
1562 if (auto *Range = WidenLoad->getMetadata(KindID: LLVMContext::MD_range)) {
1563 ConstantInt *Lower =
1564 mdconst::extract<ConstantInt>(MD: Range->getOperand(I: 0));
1565
1566 if (Lower->isNullValue()) {
1567 WidenLoad->setMetadata(KindID: LLVMContext::MD_range, Node: nullptr);
1568 } else {
1569 Metadata *LowAndHigh[] = {
1570 ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: Lower->getValue().zext(width: 32))),
1571 // Don't make assumptions about the high bits.
1572 ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: 0))
1573 };
1574
1575 WidenLoad->setMetadata(KindID: LLVMContext::MD_range,
1576 Node: MDNode::get(Context&: F.getContext(), MDs: LowAndHigh));
1577 }
1578 }
1579
1580 int TySize = DL.getTypeSizeInBits(Ty: I.getType());
1581 Type *IntNTy = Builder.getIntNTy(N: TySize);
1582 Value *ValTrunc = Builder.CreateTrunc(V: WidenLoad, DestTy: IntNTy);
1583 Value *ValOrig = Builder.CreateBitCast(V: ValTrunc, DestTy: I.getType());
1584 I.replaceAllUsesWith(V: ValOrig);
1585 DeadVals.push_back(Elt: &I);
1586 return true;
1587 }
1588
1589 return false;
1590}
1591
1592bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1593 Value *Cond = I.getCondition();
1594 Value *TrueVal = I.getTrueValue();
1595 Value *FalseVal = I.getFalseValue();
1596 Value *CmpVal;
1597 CmpPredicate Pred;
1598
1599 // Match fract pattern with nan check.
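  // Illustrative shape of the select being matched:
  //   %cmp = fcmp uno float %x, 0.0
  //   %sel = select i1 %cmp, float %x, float %fract.idiom
  // where %fract.idiom is the minnum/fsub/floor pattern checked below.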
1600 if (!match(V: Cond, P: m_FCmp(Pred, L: m_Value(V&: CmpVal), R: m_NonNaN())))
1601 return false;
1602
1603 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: &I);
1604 if (!FPOp)
1605 return false;
1606
1607 IRBuilder<> Builder(&I);
1608 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1609
1610 auto *IITrue = dyn_cast<IntrinsicInst>(Val: TrueVal);
1611 auto *IIFalse = dyn_cast<IntrinsicInst>(Val: FalseVal);
1612
1613 Value *Fract = nullptr;
1614 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1615 CmpVal == matchFractPat(I&: *IIFalse)) {
1616 // isnan(x) ? x : fract(x)
1617 Fract = applyFractPat(Builder, FractArg: CmpVal);
1618 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1619 CmpVal == matchFractPat(I&: *IITrue)) {
1620 // !isnan(x) ? fract(x) : x
1621 Fract = applyFractPat(Builder, FractArg: CmpVal);
1622 } else
1623 return false;
1624
1625 Fract->takeName(V: &I);
1626 I.replaceAllUsesWith(V: Fract);
1627 DeadVals.push_back(Elt: &I);
1628 return true;
1629}
1630
1631static bool areInSameBB(const Value *A, const Value *B) {
1632 const auto *IA = dyn_cast<Instruction>(Val: A);
1633 const auto *IB = dyn_cast<Instruction>(Val: B);
1634 return IA && IB && IA->getParent() == IB->getParent();
1635}
1636
1637// Helper for breaking large PHIs that returns true when an extractelement on V
1638// is likely to be folded away by the DAG combiner.
1639static bool isInterestingPHIIncomingValue(const Value *V) {
1640 const auto *FVT = dyn_cast<FixedVectorType>(Val: V->getType());
1641 if (!FVT)
1642 return false;
1643
1644 const Value *CurVal = V;
1645
1646 // Check for insertelements, keeping track of the elements covered.
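  // For example (illustrative), a chain such as:
  //   %v0 = insertelement <2 x i32> poison, i32 %a, i32 0
  //   %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
  // covers every element, so extracts of %v1 fold to %a / %b.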
1647 BitVector EltsCovered(FVT->getNumElements());
1648 while (const auto *IE = dyn_cast<InsertElementInst>(Val: CurVal)) {
1649 const auto *Idx = dyn_cast<ConstantInt>(Val: IE->getOperand(i_nocapture: 2));
1650
1651    // Non-constant or out-of-bounds index -> folding is unlikely.
1652    // The latter is more of a sanity check because canonical IR should
1653    // already have replaced those with poison.
1654 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1655 return false;
1656
1657 const auto *VecSrc = IE->getOperand(i_nocapture: 0);
1658
1659 // If the vector source is another instruction, it must be in the same basic
1660 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1661 // unlikely to be able to do anything interesting here.
1662 if (isa<Instruction>(Val: VecSrc) && !areInSameBB(A: VecSrc, B: IE))
1663 return false;
1664
1665 CurVal = VecSrc;
1666 EltsCovered.set(Idx->getZExtValue());
1667
1668 // All elements covered.
1669 if (EltsCovered.all())
1670 return true;
1671 }
1672
1673 // We either didn't find a single insertelement, or the insertelement chain
1674 // ended before all elements were covered. Check for other interesting values.
1675
1676 // Constants are always interesting because we can just constant fold the
1677 // extractelements.
1678 if (isa<Constant>(Val: CurVal))
1679 return true;
1680
1681 // shufflevector is likely to be profitable if either operand is a constant,
1682 // or if either source is in the same block.
1683 // This is because shufflevector is most often lowered as a series of
1684 // insert/extract elements anyway.
1685 if (const auto *SV = dyn_cast<ShuffleVectorInst>(Val: CurVal)) {
1686 return isa<Constant>(Val: SV->getOperand(i_nocapture: 1)) ||
1687 areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 0)) ||
1688 areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 1));
1689 }
1690
1691 return false;
1692}
1693
1694static void collectPHINodes(const PHINode &I,
1695 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1696 const auto [It, Inserted] = SeenPHIs.insert(Ptr: &I);
1697 if (!Inserted)
1698 return;
1699
1700 for (const Value *Inc : I.incoming_values()) {
1701 if (const auto *PhiInc = dyn_cast<PHINode>(Val: Inc))
1702 collectPHINodes(I: *PhiInc, SeenPHIs);
1703 }
1704
1705 for (const User *U : I.users()) {
1706 if (const auto *PhiU = dyn_cast<PHINode>(Val: U))
1707 collectPHINodes(I: *PhiU, SeenPHIs);
1708 }
1709}
1710
1711bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1712 // Check in the cache first.
1713 if (const auto It = BreakPhiNodesCache.find(Val: &I);
1714 It != BreakPhiNodesCache.end())
1715 return It->second;
1716
1717 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1718 // recursively consider all its users and incoming values that are also PHI
1719 // nodes. We then make a decision about all of those PHIs at once. Either they
1720 // all get broken up, or none of them do. That way, we avoid cases where a
1721 // single PHI is/is not broken and we end up reforming/exploding a vector
1722 // multiple times, or even worse, doing it in a loop.
1723 SmallPtrSet<const PHINode *, 8> WorkList;
1724 collectPHINodes(I, SeenPHIs&: WorkList);
1725
1726#ifndef NDEBUG
1727 // Check that none of the PHI nodes in the worklist are in the map. If some of
1728 // them are, it means we're not good enough at collecting related PHIs.
1729 for (const PHINode *WLP : WorkList) {
1730 assert(BreakPhiNodesCache.count(WLP) == 0);
1731 }
1732#endif
1733
1734  // To consider a PHI profitable to break, we need to see some interesting
1735  // incoming values. At least two-thirds (rounded up) of the PHIs in the
1736  // worklist must have such a value before all of them are considered breakable.
1737 //
1738 // This threshold has been determined through performance testing.
1739 //
1740 // Note that the computation below is equivalent to
1741 //
1742 // (unsigned)ceil((K / 3.0) * 2)
1743 //
1744 // It's simply written this way to avoid mixing integral/FP arithmetic.
1745 const auto Threshold = (alignTo(Value: WorkList.size() * 2, Align: 3) / 3);
1746 unsigned NumBreakablePHIs = 0;
1747 bool CanBreak = false;
1748 for (const PHINode *Cur : WorkList) {
1749 // Don't break PHIs that have no interesting incoming values. That is, where
1750 // there is no clear opportunity to fold the "extractelement" instructions
1751 // we would add.
1752 //
1753    // Note: InstCombine does not run after this pass, so we're only interested
1754    // in the foldings that the DAG combiner can do.
1755 if (any_of(Range: Cur->incoming_values(), P: isInterestingPHIIncomingValue)) {
1756 if (++NumBreakablePHIs >= Threshold) {
1757 CanBreak = true;
1758 break;
1759 }
1760 }
1761 }
1762
1763 for (const PHINode *Cur : WorkList)
1764 BreakPhiNodesCache[Cur] = CanBreak;
1765
1766 return CanBreak;
1767}
1768
1769/// Helper class for "break large PHIs" (visitPHINode).
1770///
1771/// This represents a slice of a PHI's incoming value, which is made up of:
1772/// - The type of the slice (Ty)
1773/// - The index in the incoming value's vector where the slice starts (Idx)
1774/// - The number of elements in the slice (NumElts).
1775/// It also keeps track of the NewPHI node inserted for this particular slice.
1776///
1777/// Slice examples:
1778/// <4 x i64> -> Split into four i64 slices.
1779/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1780/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1781/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1782class VectorSlice {
1783public:
1784 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1785 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1786
1787 Type *Ty = nullptr;
1788 unsigned Idx = 0;
1789 unsigned NumElts = 0;
1790 PHINode *NewPHI = nullptr;
1791
1792 /// Slice \p Inc according to the information contained within this slice.
1793 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1794 /// pair, it returns the same Sliced value as well.
1795 ///
1796 /// Note this *intentionally* does not return the same value for, say,
1797 /// [%bb.0, %0] & [%bb.1, %0] as:
1798  ///  - It could cause issues with dominance (e.g. if bb.1 is seen first, the
1799  ///    value materialized in bb.1 may not dominate the end of bb.0, where that
1800  ///    incoming value is also needed).
1801 /// - We also want to make our extract instructions as local as possible so
1802 /// the DAG has better chances of folding them out. Duplicating them like
1803 /// that is beneficial in that regard.
1804 ///
1805  /// Caching is not only a minor optimization to avoid creating duplicate
1806 /// instructions, but also a requirement for correctness. It is not forbidden
1807 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1808 /// returned a new value each time, those previously identical pairs would all
1809 /// have different incoming values (from the same block) and it'd cause a "PHI
1810 /// node has multiple entries for the same basic block with different incoming
1811 /// values!" verifier error.
1812 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1813 Value *&Res = SlicedVals[{BB, Inc}];
1814 if (Res)
1815 return Res;
1816
1817 IRBuilder<> B(BB->getTerminator());
1818 if (Instruction *IncInst = dyn_cast<Instruction>(Val: Inc))
1819 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1820
1821 if (NumElts > 1) {
1822 SmallVector<int, 4> Mask;
1823 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1824 Mask.push_back(Elt: K);
1825 Res = B.CreateShuffleVector(V: Inc, Mask, Name: NewValName);
1826 } else
1827 Res = B.CreateExtractElement(Vec: Inc, Idx, Name: NewValName);
1828
1829 return Res;
1830 }
1831
1832private:
1833 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1834};
1835
1836bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1837 // Break-up fixed-vector PHIs into smaller pieces.
1838 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1839 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1840 //
1841 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1842 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1843 // With large, odd-sized PHIs we may end up needing many `build_vector`
1844 // operations with most elements being "undef". This inhibits a lot of
1845 // optimization opportunities and can result in unreasonably high register
1846 // pressure and the inevitable stack spilling.
1847 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1848 return false;
1849
1850 FixedVectorType *FVT = dyn_cast<FixedVectorType>(Val: I.getType());
1851 if (!FVT || FVT->getNumElements() == 1 ||
1852 DL.getTypeSizeInBits(Ty: FVT) <= BreakLargePHIsThreshold)
1853 return false;
1854
1855 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1856 return false;
1857
1858 std::vector<VectorSlice> Slices;
1859
1860 Type *EltTy = FVT->getElementType();
1861 {
1862 unsigned Idx = 0;
1863    // For 8/16-bit element types, don't scalarize fully; break the vector into
1864    // as many 32-bit slices as we can and scalarize the tail.
1865 const unsigned EltSize = DL.getTypeSizeInBits(Ty: EltTy);
1866 const unsigned NumElts = FVT->getNumElements();
1867 if (EltSize == 8 || EltSize == 16) {
1868 const unsigned SubVecSize = (32 / EltSize);
1869 Type *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: SubVecSize);
1870 for (unsigned End = alignDown(Value: NumElts, Align: SubVecSize); Idx < End;
1871 Idx += SubVecSize)
1872 Slices.emplace_back(args&: SubVecTy, args&: Idx, args: SubVecSize);
1873 }
1874
1875 // Scalarize all remaining elements.
1876 for (; Idx < NumElts; ++Idx)
1877 Slices.emplace_back(args&: EltTy, args&: Idx, args: 1);
1878 }
1879
1880 assert(Slices.size() > 1);
1881
1882 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1883 // creating the necessary instruction to extract the relevant slices of each
1884 // incoming value.
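  // For example (illustrative), a <4 x i64> PHI becomes four i64 PHIs, each fed
  // by extractelement/shufflevector slices of the original incoming values; the
  // results are recombined below with insertelement/insertvector.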
1885 IRBuilder<> B(I.getParent());
1886 B.SetCurrentDebugLocation(I.getDebugLoc());
1887
1888 unsigned IncNameSuffix = 0;
1889 for (VectorSlice &S : Slices) {
1890    // We need to reset the builder's insert point on each iteration, because
1891    // getSlicedVal may have inserted something into I's BB.
1892 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1893 S.NewPHI = B.CreatePHI(Ty: S.Ty, NumReservedValues: I.getNumIncomingValues());
1894
1895 for (const auto &[Idx, BB] : enumerate(First: I.blocks())) {
1896 S.NewPHI->addIncoming(V: S.getSlicedVal(BB, Inc: I.getIncomingValue(i: Idx),
1897 NewValName: "largephi.extractslice" +
1898 std::to_string(val: IncNameSuffix++)),
1899 BB);
1900 }
1901 }
1902
1903 // And replace this PHI with a vector of all the previous PHI values.
1904 Value *Vec = PoisonValue::get(T: FVT);
1905 unsigned NameSuffix = 0;
1906 for (VectorSlice &S : Slices) {
1907 const auto ValName = "largephi.insertslice" + std::to_string(val: NameSuffix++);
1908 if (S.NumElts > 1)
1909 Vec = B.CreateInsertVector(DstType: FVT, SrcVec: Vec, SubVec: S.NewPHI, Idx: S.Idx, Name: ValName);
1910 else
1911 Vec = B.CreateInsertElement(Vec, NewElt: S.NewPHI, Idx: S.Idx, Name: ValName);
1912 }
1913
1914 I.replaceAllUsesWith(V: Vec);
1915 DeadVals.push_back(Elt: &I);
1916 return true;
1917}
1918
1919/// \param V Value to check
1920/// \param DL DataLayout
1921/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1922/// \param AS Target Address Space
1923/// \return true if \p V cannot be the null value of \p AS, false otherwise.
1924static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1925 const AMDGPUTargetMachine &TM, unsigned AS) {
1926 // Pointer cannot be null if it's a block address, GV or alloca.
1927 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1928 // it as the symbol could be null in such cases.
1929 if (isa<BlockAddress, GlobalValue, AllocaInst>(Val: V))
1930 return true;
1931
1932 // Check nonnull arguments.
1933 if (const auto *Arg = dyn_cast<Argument>(Val: V); Arg && Arg->hasNonNullAttr())
1934 return true;
1935
1936 // Check nonnull loads.
1937 if (const auto *Load = dyn_cast<LoadInst>(Val: V);
1938 Load && Load->hasMetadata(KindID: LLVMContext::MD_nonnull))
1939 return true;
1940
1941 // getUnderlyingObject may have looked through another addrspacecast, although
1942  // such optimizable situations have most likely been folded away by now.
1943 if (AS != cast<PointerType>(Val: V->getType())->getAddressSpace())
1944 return false;
1945
1946 // TODO: Calls that return nonnull?
1947
1948 // For all other things, use KnownBits.
1949 // We either use 0 or all bits set to indicate null, so check whether the
1950 // value can be zero or all ones.
1951 //
1952 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
1953 // address spaces have non-zero null values.
1954 auto SrcPtrKB = computeKnownBits(V, DL);
1955 const auto NullVal = TM.getNullPointerValue(AddrSpace: AS);
1956
1957 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
1958 assert((NullVal == 0 || NullVal == -1) &&
1959 "don't know how to check for this null value!");
1960 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
1961}
1962
1963bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
1964  // The intrinsic doesn't support vectors. It also seems difficult to prove that
1965  // a vector cannot contain any null pointers, so it's unclear whether supporting
1966  // vectors would be worthwhile.
1967 if (I.getType()->isVectorTy())
1968 return false;
1969
1970  // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
1971  // This is only worthwhile for casts between flat and private/local.
1972 const unsigned SrcAS = I.getSrcAddressSpace();
1973 const unsigned DstAS = I.getDestAddressSpace();
1974
1975 bool CanLower = false;
1976 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1977 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
1978 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
1979 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
1980 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1981 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
1982 if (!CanLower)
1983 return false;
1984
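  // If every underlying object of the source pointer is provably non-null for
  // its address space, emit the nonnull variant so codegen can skip the null
  // check. Illustrative IR (type mangling elided):
  //   %c = addrspacecast ptr addrspace(5) %p to ptr
  // becomes
  //   %c = call ptr @llvm.amdgcn.addrspacecast.nonnull(ptr addrspace(5) %p)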
1985 SmallVector<const Value *, 4> WorkList;
1986 getUnderlyingObjects(V: I.getOperand(i_nocapture: 0), Objects&: WorkList);
1987 if (!all_of(Range&: WorkList, P: [&](const Value *V) {
1988 return isPtrKnownNeverNull(V, DL, TM, AS: SrcAS);
1989 }))
1990 return false;
1991
1992 IRBuilder<> B(&I);
1993 auto *Intrin = B.CreateIntrinsic(
1994 RetTy: I.getType(), ID: Intrinsic::amdgcn_addrspacecast_nonnull, Args: {I.getOperand(i_nocapture: 0)});
1995 I.replaceAllUsesWith(V: Intrin);
1996 DeadVals.push_back(Elt: &I);
1997 return true;
1998}
1999
2000bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2001 switch (I.getIntrinsicID()) {
2002 case Intrinsic::minnum:
2003 case Intrinsic::minimumnum:
2004 case Intrinsic::minimum:
2005 return visitFMinLike(I);
2006 case Intrinsic::sqrt:
2007 return visitSqrt(I);
2008 case Intrinsic::amdgcn_mbcnt_lo:
2009 return visitMbcntLo(I);
2010 case Intrinsic::amdgcn_mbcnt_hi:
2011 return visitMbcntHi(I);
2012 default:
2013 return false;
2014 }
2015}
2016
2017/// Match non-nan fract pattern.
2018/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2019/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2020/// minimum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2021///
2022/// Only matches if fract is a useful instruction for the subtarget. Does not
2023/// account for nan handling; the instruction has a nan check on the input value.
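///
/// Illustrative IR for the f32 case:
///   %fl = call float @llvm.floor.f32(float %x)
///   %s  = fsub float %x, %fl
///   %m  = call float @llvm.minnum.f32(float %s, float 0x3FEFFFFFE0000000)
/// where the constant is nextafter(1.0f, -1.0f), the largest float below 1.0.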
2024Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
2025 if (ST.hasFractBug())
2026 return nullptr;
2027
2028 Intrinsic::ID IID = I.getIntrinsicID();
2029
2030 // The value is only used in contexts where we know the input isn't a nan, so
2031 // any of the fmin variants are fine.
2032 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
2033 IID != Intrinsic::minimumnum)
2034 return nullptr;
2035
2036 Type *Ty = I.getType();
2037 if (!isLegalFloatingTy(Ty: Ty->getScalarType()))
2038 return nullptr;
2039
2040 Value *Arg0 = I.getArgOperand(i: 0);
2041 Value *Arg1 = I.getArgOperand(i: 1);
2042
2043 const APFloat *C;
2044 if (!match(V: Arg1, P: m_APFloat(Res&: C)))
2045 return nullptr;
2046
2047 APFloat One(1.0);
2048 bool LosesInfo;
2049 One.convert(ToSemantics: C->getSemantics(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
2050
2051 // Match nextafter(1.0, -1)
2052 One.next(nextDown: true);
2053 if (One != *C)
2054 return nullptr;
2055
2056 Value *FloorSrc;
2057 if (match(V: Arg0, P: m_FSub(L: m_Value(V&: FloorSrc),
2058 R: m_Intrinsic<Intrinsic::floor>(Op0: m_Deferred(V: FloorSrc)))))
2059 return FloorSrc;
2060 return nullptr;
2061}
2062
2063Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2064 Value *FractArg) {
2065 SmallVector<Value *, 4> FractVals;
2066 extractValues(Builder, Values&: FractVals, V: FractArg);
2067
2068 SmallVector<Value *, 4> ResultVals(FractVals.size());
2069
2070 Type *Ty = FractArg->getType()->getScalarType();
2071 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2072 ResultVals[I] =
2073 Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_fract, Types: {Ty}, Args: {FractVals[I]});
2074 }
2075
2076 return insertValues(Builder, Ty: FractArg->getType(), Values&: ResultVals);
2077}
2078
2079bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2080 Value *FractArg = matchFractPat(I);
2081 if (!FractArg)
2082 return false;
2083
2084 // Match pattern for fract intrinsic in contexts where the nan check has been
2085 // optimized out (and hope the knowledge the source can't be nan wasn't lost).
2086 if (!I.hasNoNaNs() && !isKnownNeverNaN(V: FractArg, SQ: SQ.getWithInstruction(I: &I)))
2087 return false;
2088
2089 IRBuilder<> Builder(&I);
2090 FastMathFlags FMF = I.getFastMathFlags();
2091 FMF.setNoNaNs();
2092 Builder.setFastMathFlags(FMF);
2093
2094 Value *Fract = applyFractPat(Builder, FractArg);
2095 Fract->takeName(V: &I);
2096 I.replaceAllUsesWith(V: Fract);
2097 DeadVals.push_back(Elt: &I);
2098 return true;
2099}
2100
2101// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
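// The required accuracy is taken from the call's !fpmath metadata (e.g. a
// "float 2.500000e+00" node requests 2.5 ulps); calls requesting < 1 ulp are
// deferred to the correctly rounded codegen expansion.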
2102bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2103 Type *Ty = Sqrt.getType()->getScalarType();
2104 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2105 return false;
2106
2107 const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &Sqrt);
2108 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2109
2110 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2111 // of fast llvm.sqrt will give the raw instruction anyway.
2112 if (SqrtFMF.approxFunc())
2113 return false;
2114
2115 const float ReqdAccuracy = FPOp->getFPAccuracy();
2116
2117 // Defer correctly rounded expansion to codegen.
2118 if (ReqdAccuracy < 1.0f)
2119 return false;
2120
2121 Value *SrcVal = Sqrt.getOperand(i_nocapture: 0);
2122 bool CanTreatAsDAZ = canIgnoreDenormalInput(V: SrcVal, CtxI: &Sqrt);
2123
2124 // The raw instruction is 1 ulp, but the correction for denormal handling
2125 // brings it to 2.
2126 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2127 return false;
2128
2129 IRBuilder<> Builder(&Sqrt);
2130 SmallVector<Value *, 4> SrcVals;
2131 extractValues(Builder, Values&: SrcVals, V: SrcVal);
2132
2133 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2134 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2135 if (CanTreatAsDAZ)
2136 ResultVals[I] = Builder.CreateCall(Callee: getSqrtF32(), Args: SrcVals[I]);
2137 else
2138 ResultVals[I] = emitSqrtIEEE2ULP(Builder, Src: SrcVals[I], FMF: SqrtFMF);
2139 }
2140
2141 Value *NewSqrt = insertValues(Builder, Ty: Sqrt.getType(), Values&: ResultVals);
2142 NewSqrt->takeName(V: &Sqrt);
2143 Sqrt.replaceAllUsesWith(V: NewSqrt);
2144 DeadVals.push_back(Elt: &Sqrt);
2145 return true;
2146}
2147
2148bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2149 if (skipFunction(F))
2150 return false;
2151
2152 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2153 if (!TPC)
2154 return false;
2155
2156 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2157 const TargetLibraryInfo *TLI =
2158 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2159 AssumptionCache *AC =
2160 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2161 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2162 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2163 const UniformityInfo &UA =
2164 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2165 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2166}
2167
2168PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2169 FunctionAnalysisManager &FAM) {
2170 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2171 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(IR&: F);
2172 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(IR&: F);
2173 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F);
2174 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(IR&: F);
2175 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2176 if (!Impl.run())
2177 return PreservedAnalyses::all();
2178 PreservedAnalyses PA = PreservedAnalyses::none();
2179 if (!Impl.FlowChanged)
2180 PA.preserveSet<CFGAnalyses>();
2181 return PA;
2182}
2183
2184INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2185 "AMDGPU IR optimizations", false, false)
2186INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2187INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2188INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2189INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2190 false, false)
2191
2192/// Create a workitem.id.x intrinsic call with range metadata.
2193CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
2194 CallInst *Tid = B.CreateIntrinsic(ID: Intrinsic::amdgcn_workitem_id_x, Args: {});
2195 ST.makeLIDRangeMetadata(I: Tid);
2196 return Tid;
2197}
2198
2199/// Replace the instruction with a direct workitem.id.x call.
2200void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
2201 IRBuilder<> B(&I);
2202 CallInst *Tid = createWorkitemIdX(B);
2203 BasicBlock::iterator BI(&I);
2204 ReplaceInstWithValue(BI, V: Tid);
2205}
2206
2207/// Replace the instruction with (workitem.id.x & mask).
2208void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2209 Instruction &I, unsigned WaveSize) const {
2210 IRBuilder<> B(&I);
2211 CallInst *Tid = createWorkitemIdX(B);
2212 Constant *Mask = ConstantInt::get(Ty: Tid->getType(), V: WaveSize - 1);
2213 Value *AndInst = B.CreateAnd(LHS: Tid, RHS: Mask);
2214 BasicBlock::iterator BI(&I);
2215 ReplaceInstWithValue(BI, V: AndInst);
2216}
2217
2218/// Try to optimize an mbcnt instruction by replacing it with workitem.id.x
2219/// when the work group size allows direct computation of the lane ID.
2220/// Returns true if the optimization was applied, false otherwise.
2221bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
2222 unsigned Wave) const {
2223 std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, Dim: 0);
2224 if (!MaybeX)
2225 return false;
2226
2227 // When work group size == wave_size, each work group contains exactly one
2228 // wave, so the instruction can be replaced with workitem.id.x directly.
2229 if (*MaybeX == Wave) {
2230 replaceWithWorkitemIdX(I);
2231 return true;
2232 }
2233
2234 // When work group evenly splits into waves, compute lane ID within wave
2235 // using bit masking: lane_id = workitem.id.x & (wave_size - 1).
2236  if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) {
2237 replaceWithMaskedWorkitemIdX(I, WaveSize: Wave);
2238 return true;
2239 }
2240
2241 return false;
2242}
2243
2244/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.
2245bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
2246 // This optimization only applies to wave32 targets where mbcnt.lo operates on
2247 // the full execution mask.
2248 if (!ST.isWave32())
2249 return false;
2250
2251 // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with
2252 // lower IDs.
2253 if (!match(V: &I,
2254 P: m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(Op0: m_AllOnes(), Op1: m_Zero())))
2255 return false;
2256
2257 return tryReplaceWithWorkitemId(I, Wave: ST.getWavefrontSize());
2258}
2259
2260/// Optimize mbcnt.hi calls for lane ID computation.
2261bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
2262 // Abort if wave size is not known at compile time.
2263 if (!ST.isWaveSizeKnown())
2264 return false;
2265
2266 unsigned Wave = ST.getWavefrontSize();
2267
2268  // On wave32, the upper 32 bits of the execution mask are always 0, so
2269 // mbcnt.hi(mask, val) always returns val unchanged.
2270 if (ST.isWave32()) {
2271 if (auto MaybeX = ST.getReqdWorkGroupSize(F, Dim: 0)) {
2272 // Replace mbcnt.hi(mask, val) with val only when work group size matches
2273 // wave size (single wave per work group).
2274 if (*MaybeX == Wave) {
2275 BasicBlock::iterator BI(&I);
2276 ReplaceInstWithValue(BI, V: I.getArgOperand(i: 1));
2277 return true;
2278 }
2279 }
2280 }
2281
2282 // Optimize the complete lane ID computation pattern:
2283 // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs
2284 // across the full execution mask.
2285 using namespace PatternMatch;
2286
2287 // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
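  // Illustrative IR of the matched idiom:
  //   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  //   %id = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  // which becomes workitem.id.x (possibly masked by wave_size - 1) when the
  // work-group shape guarantees the mapping.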
2288 if (!match(V: &I, P: m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
2289 Op0: m_AllOnes(), Op1: m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
2290 Op0: m_AllOnes(), Op1: m_Zero()))))
2291 return false;
2292
2293 return tryReplaceWithWorkitemId(I, Wave);
2294}
2295
2296char AMDGPUCodeGenPrepare::ID = 0;
2297
2298FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2299 return new AMDGPUCodeGenPrepare();
2300}
2301