1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUMemoryUtils.h"
17#include "AMDGPUTargetMachine.h"
18#include "SIModeRegisterDefaults.h"
19#include "llvm/ADT/SetVector.h"
20#include "llvm/Analysis/AssumptionCache.h"
21#include "llvm/Analysis/ConstantFolding.h"
22#include "llvm/Analysis/TargetLibraryInfo.h"
23#include "llvm/Analysis/TargetTransformInfo.h"
24#include "llvm/Analysis/UniformityAnalysis.h"
25#include "llvm/Analysis/ValueTracking.h"
26#include "llvm/CodeGen/TargetPassConfig.h"
27#include "llvm/IR/Dominators.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/InstVisitor.h"
30#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include "llvm/IR/PatternMatch.h"
32#include "llvm/IR/ValueHandle.h"
33#include "llvm/InitializePasses.h"
34#include "llvm/Pass.h"
35#include "llvm/Support/KnownBits.h"
36#include "llvm/Support/KnownFPClass.h"
37#include "llvm/Transforms/Utils/BasicBlockUtils.h"
38#include "llvm/Transforms/Utils/IntegerDivision.h"
39#include "llvm/Transforms/Utils/Local.h"
40
41#define DEBUG_TYPE "amdgpu-codegenprepare"
42
43using namespace llvm;
44using namespace llvm::PatternMatch;
45
46namespace {
47
48static cl::opt<bool> WidenLoads(
49 "amdgpu-codegenprepare-widen-constant-loads",
50 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
51 cl::ReallyHidden,
52 cl::init(Val: false));
53
54static cl::opt<bool>
55 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
56 cl::desc("Break large PHI nodes for DAGISel"),
57 cl::ReallyHidden, cl::init(Val: true));
58
59static cl::opt<bool>
60 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc("For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
63 cl::ReallyHidden, cl::init(Val: false));
64
65static cl::opt<unsigned> BreakLargePHIsThreshold(
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
68 cl::ReallyHidden, cl::init(Val: 32));
69
70static cl::opt<bool> UseMul24Intrin(
71 "amdgpu-codegenprepare-mul24",
72 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
73 cl::ReallyHidden,
74 cl::init(Val: true));
75
76// Legalize 64-bit division by using the generic IR expansion.
77static cl::opt<bool> ExpandDiv64InIR(
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80 cl::ReallyHidden,
81 cl::init(Val: false));
82
83// Leave all division operations as they are. This supersedes ExpandDiv64InIR
84// and is used for testing the legalizer.
85static cl::opt<bool> DisableIDivExpand(
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88 cl::ReallyHidden,
89 cl::init(Val: false));
90
91// Disable processing of fdiv so we can better test the backend implementations.
92static cl::opt<bool> DisableFDivExpand(
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
95 cl::ReallyHidden,
96 cl::init(Val: false));
97
98class AMDGPUCodeGenPrepareImpl
99 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
100public:
101 Function &F;
102 const GCNSubtarget &ST;
103 const AMDGPUTargetMachine &TM;
104 const TargetLibraryInfo *TLI;
105 const UniformityInfo &UA;
106 const DataLayout &DL;
107 SimplifyQuery SQ;
108 const bool HasFP32DenormalFlush;
109 bool FlowChanged = false;
110 mutable Function *SqrtF32 = nullptr;
111 mutable Function *LdexpF32 = nullptr;
112 mutable SmallVector<WeakVH> DeadVals;
113
114 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
115
116 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
117 const TargetLibraryInfo *TLI, AssumptionCache *AC,
118 const DominatorTree *DT, const UniformityInfo &UA)
119 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
120 DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
121 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
122 DenormalMode::getPreserveSign()) {}
123
124 Function *getSqrtF32() const {
125 if (SqrtF32)
126 return SqrtF32;
127
128 LLVMContext &Ctx = F.getContext();
129 SqrtF32 = Intrinsic::getOrInsertDeclaration(
130 M: F.getParent(), id: Intrinsic::amdgcn_sqrt, OverloadTys: {Type::getFloatTy(C&: Ctx)});
131 return SqrtF32;
132 }
133
134 Function *getLdexpF32() const {
135 if (LdexpF32)
136 return LdexpF32;
137
138 LLVMContext &Ctx = F.getContext();
139 LdexpF32 = Intrinsic::getOrInsertDeclaration(
140 M: F.getParent(), id: Intrinsic::ldexp,
141 OverloadTys: {Type::getFloatTy(C&: Ctx), Type::getInt32Ty(C&: Ctx)});
142 return LdexpF32;
143 }
144
145 bool canBreakPHINode(const PHINode &I);
146
147 /// Return true if \p T is a legal scalar floating point type.
148 bool isLegalFloatingTy(const Type *T) const;
149
150 /// Wrapper to pass all the arguments to computeKnownFPClass
151 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
152 const Instruction *CtxI) const {
153 return llvm::computeKnownFPClass(V, InterestedClasses: Interested,
154 SQ: SQ.getWithInstruction(I: CtxI));
155 }
156
157 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
158 return HasFP32DenormalFlush ||
159 computeKnownFPClass(V, Interested: fcSubnormal, CtxI).isKnownNeverSubnormal();
160 }
161
162 /// \returns The minimum number of bits needed to store the value of \Op as an
163 /// unsigned integer. Truncating to this size and then zero-extending to
164 /// the original will not change the value.
165 unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;
166
167 /// \returns The minimum number of bits needed to store the value of \Op as a
168 /// signed integer. Truncating to this size and then sign-extending to
169 /// the original size will not change the value.
170 unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;
171
172 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
173 /// SelectionDAG has an issue where an and asserting the bits are known
174 bool replaceMulWithMul24(BinaryOperator &I) const;
175
176 /// Perform same function as equivalently named function in DAGCombiner. Since
177 /// we expand some divisions here, we need to perform this before obscuring.
178 bool foldBinOpIntoSelect(BinaryOperator &I) const;
179
180 bool divHasSpecialOptimization(BinaryOperator &I,
181 Value *Num, Value *Den) const;
182 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
183 unsigned MaxDivBits, bool Signed) const;
184
185 /// Expands div or rem by using floating-point operations.
186 /// Operands must be in the range [-0x400000,0x3FFFFF]
187 Value *expandDivRemToFloat(IRBuilder<> &Builder, BinaryOperator &I,
188 Value *Num, Value *Den, bool IsDiv,
189 bool IsSigned) const;
190
191 Value *expandDivRemToFloatImpl(IRBuilder<> &Builder, BinaryOperator &I,
192 Value *Num, Value *Den, unsigned NumBits,
193 bool IsDiv, bool IsSigned) const;
194
195 /// Expands 32 bit div or rem.
196 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
197 Value *Num, Value *Den) const;
198
199 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
200 Value *Num, Value *Den) const;
201 void expandDivRem64(BinaryOperator &I) const;
202
203 /// Widen a scalar load.
204 ///
205 /// \details \p Widen scalar load for uniform, small type loads from constant
206 // memory / to a full 32-bits and then truncate the input to allow a scalar
207 // load instead of a vector load.
208 //
209 /// \returns True.
210
211 bool canWidenScalarExtLoad(LoadInst &I) const;
212
213 Value *matchFractPatImpl(Value &V, const APFloat &C) const;
214 Value *matchFractPatNanAvoidant(Value &V);
215 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
216
217 bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;
218
219 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
220 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
221 const Instruction *CtxI) const;
222
223 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
224 FastMathFlags FMF, const Instruction *CtxI) const;
225 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
226 float ReqdAccuracy) const;
227
228 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
229 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
230 Value *RsqOp, const Instruction *FDiv,
231 float ReqdAccuracy) const;
232
233 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
234 Value *Src) const;
235
236 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
237 bool IsNegative) const;
238 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
239 FastMathFlags FMF) const;
240 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
241 FastMathFlags FMF) const;
242 Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
243 FastMathFlags DivFMF, const Instruction *CtxI,
244 bool IsNegative) const;
245
246 CallInst *createWorkitemIdX(IRBuilder<> &B) const;
247 void replaceWithWorkitemIdX(Instruction &I) const;
248 void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
249 bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;
250
251 bool tryNarrowMathIfNoOverflow(Instruction *I);
252
253public:
254 bool visitFDiv(BinaryOperator &I);
255
256 bool visitInstruction(Instruction &I) { return false; }
257 bool visitBinaryOperator(BinaryOperator &I);
258 bool visitLoadInst(LoadInst &I);
259 bool visitSelectInst(SelectInst &I);
260 bool visitPHINode(PHINode &I);
261 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
262
263 bool visitIntrinsicInst(IntrinsicInst &I);
264 bool visitFMinLike(IntrinsicInst &I);
265 bool visitSqrt(IntrinsicInst &I);
266 bool visitLog(FPMathOperator &Log, Intrinsic::ID IID);
267 bool visitMbcntLo(IntrinsicInst &I) const;
268 bool visitMbcntHi(IntrinsicInst &I) const;
269 bool visitVectorReduceAdd(IntrinsicInst &I);
270 bool visitSaturatingAdd(IntrinsicInst &I);
271 bool run();
272};
273
274class AMDGPUCodeGenPrepare : public FunctionPass {
275public:
276 static char ID;
277 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
278 void getAnalysisUsage(AnalysisUsage &AU) const override {
279 AU.addRequired<AssumptionCacheTracker>();
280 AU.addRequired<UniformityInfoWrapperPass>();
281 AU.addRequired<TargetLibraryInfoWrapperPass>();
282
283 // FIXME: Division expansion needs to preserve the dominator tree.
284 if (!ExpandDiv64InIR)
285 AU.setPreservesAll();
286 }
287 bool runOnFunction(Function &F) override;
288 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
289};
290
291} // end anonymous namespace
292
293bool AMDGPUCodeGenPrepareImpl::run() {
294 BreakPhiNodesCache.clear();
295 bool MadeChange = false;
296
297 // Need to use make_early_inc_range because integer division expansion is
298 // handled by Transform/Utils, and it can delete instructions such as the
299 // terminator of the BB.
300 for (BasicBlock &BB : reverse(C&: F)) {
301 for (Instruction &I : make_early_inc_range(Range: reverse(C&: BB))) {
302 if (!isInstructionTriviallyDead(I: &I, TLI))
303 MadeChange |= visit(I);
304 }
305 }
306
307 while (!DeadVals.empty()) {
308 if (auto *I = dyn_cast_or_null<Instruction>(Val: DeadVals.pop_back_val()))
309 RecursivelyDeleteTriviallyDeadInstructions(V: I, TLI);
310 }
311
312 return MadeChange;
313}
314
315bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
316 return Ty->isFloatTy() || Ty->isDoubleTy() ||
317 (Ty->isHalfTy() && ST.has16BitInsts());
318}
319
320bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
321 Type *Ty = I.getType();
322 int TySize = DL.getTypeSizeInBits(Ty);
323 Align Alignment = DL.getValueOrABITypeAlignment(Alignment: I.getAlign(), Ty);
324
325 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniformAtDef(V: &I);
326}
327
328unsigned
329AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
330 const Instruction *CtxI) const {
331 return computeKnownBits(V: Op, Q: SQ.getWithInstruction(I: CtxI)).countMaxActiveBits();
332}
333
334unsigned
335AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
336 const Instruction *CtxI) const {
337 return ComputeMaxSignificantBits(Op, DL: SQ.DL, AC: SQ.AC, CxtI: CtxI, DT: SQ.DT);
338}
339
340static void extractValues(IRBuilder<> &Builder,
341 SmallVectorImpl<Value *> &Values, Value *V) {
342 auto *VT = dyn_cast<FixedVectorType>(Val: V->getType());
343 if (!VT) {
344 Values.push_back(Elt: V);
345 return;
346 }
347
348 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
349 Values.push_back(Elt: Builder.CreateExtractElement(Vec: V, Idx: I));
350}
351
352static Value *insertValues(IRBuilder<> &Builder,
353 Type *Ty,
354 SmallVectorImpl<Value *> &Values) {
355 if (!Ty->isVectorTy()) {
356 assert(Values.size() == 1);
357 return Values[0];
358 }
359
360 Value *NewVal = PoisonValue::get(T: Ty);
361 for (int I = 0, E = Values.size(); I != E; ++I)
362 NewVal = Builder.CreateInsertElement(Vec: NewVal, NewElt: Values[I], Idx: I);
363
364 return NewVal;
365}
366
367bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
368 if (I.getOpcode() != Instruction::Mul)
369 return false;
370
371 Type *Ty = I.getType();
372 unsigned Size = Ty->getScalarSizeInBits();
373 if (Size <= 16 && ST.has16BitInsts())
374 return false;
375
376 // Prefer scalar if this could be s_mul_i32
377 if (UA.isUniformAtDef(V: &I))
378 return false;
379
380 Value *LHS = I.getOperand(i_nocapture: 0);
381 Value *RHS = I.getOperand(i_nocapture: 1);
382 IRBuilder<> Builder(&I);
383 Builder.SetCurrentDebugLocation(I.getDebugLoc());
384
385 unsigned LHSBits = 0, RHSBits = 0;
386 bool IsSigned = false;
387
388 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(Op: LHS, CtxI: &I)) <= 24 &&
389 (RHSBits = numBitsUnsigned(Op: RHS, CtxI: &I)) <= 24) {
390 IsSigned = false;
391
392 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(Op: LHS, CtxI: &I)) <= 24 &&
393 (RHSBits = numBitsSigned(Op: RHS, CtxI: &I)) <= 24) {
394 IsSigned = true;
395
396 } else
397 return false;
398
399 SmallVector<Value *, 4> LHSVals;
400 SmallVector<Value *, 4> RHSVals;
401 SmallVector<Value *, 4> ResultVals;
402 extractValues(Builder, Values&: LHSVals, V: LHS);
403 extractValues(Builder, Values&: RHSVals, V: RHS);
404
405 IntegerType *I32Ty = Builder.getInt32Ty();
406 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
407 Type *DstTy = LHSVals[0]->getType();
408
409 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
410 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(V: LHSVals[I], DestTy: I32Ty)
411 : Builder.CreateZExtOrTrunc(V: LHSVals[I], DestTy: I32Ty);
412 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(V: RHSVals[I], DestTy: I32Ty)
413 : Builder.CreateZExtOrTrunc(V: RHSVals[I], DestTy: I32Ty);
414 Intrinsic::ID ID =
415 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
416 Value *Result = Builder.CreateIntrinsic(ID, OverloadTypes: {IntrinTy}, Args: {LHS, RHS});
417 Result = IsSigned ? Builder.CreateSExtOrTrunc(V: Result, DestTy: DstTy)
418 : Builder.CreateZExtOrTrunc(V: Result, DestTy: DstTy);
419 ResultVals.push_back(Elt: Result);
420 }
421
422 Value *NewVal = insertValues(Builder, Ty, Values&: ResultVals);
423 NewVal->takeName(V: &I);
424 I.replaceAllUsesWith(V: NewVal);
425 DeadVals.push_back(Elt: &I);
426
427 return true;
428}
429
430// Find a select instruction, which may have been casted. This is mostly to deal
431// with cases where i16 selects were promoted here to i32.
432static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
433 Cast = nullptr;
434 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: V))
435 return Sel;
436
437 if ((Cast = dyn_cast<CastInst>(Val: V))) {
438 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: Cast->getOperand(i_nocapture: 0)))
439 return Sel;
440 }
441
442 return nullptr;
443}
444
445bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
446 // Don't do this unless the old select is going away. We want to eliminate the
447 // binary operator, not replace a binop with a select.
448 int SelOpNo = 0;
449
450 CastInst *CastOp;
451
452 // TODO: Should probably try to handle some cases with multiple
453 // users. Duplicating the select may be profitable for division.
454 SelectInst *Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 0), Cast&: CastOp);
455 if (!Sel || !Sel->hasOneUse()) {
456 SelOpNo = 1;
457 Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 1), Cast&: CastOp);
458 }
459
460 if (!Sel || !Sel->hasOneUse())
461 return false;
462
463 Constant *CT = dyn_cast<Constant>(Val: Sel->getTrueValue());
464 Constant *CF = dyn_cast<Constant>(Val: Sel->getFalseValue());
465 Constant *CBO = dyn_cast<Constant>(Val: BO.getOperand(i_nocapture: SelOpNo ^ 1));
466 if (!CBO || !CT || !CF)
467 return false;
468
469 if (CastOp) {
470 if (!CastOp->hasOneUse())
471 return false;
472 CT = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CT, DestTy: BO.getType(), DL);
473 CF = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CF, DestTy: BO.getType(), DL);
474 }
475
476 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
477 // need to handle divisions here.
478 Constant *FoldedT =
479 SelOpNo ? ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CT, DL)
480 : ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CT, RHS: CBO, DL);
481 if (!FoldedT || isa<ConstantExpr>(Val: FoldedT))
482 return false;
483
484 Constant *FoldedF =
485 SelOpNo ? ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CF, DL)
486 : ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CF, RHS: CBO, DL);
487 if (!FoldedF || isa<ConstantExpr>(Val: FoldedF))
488 return false;
489
490 IRBuilder<> Builder(&BO);
491 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
492 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Val: &BO))
493 Builder.setFastMathFlags(FPOp->getFastMathFlags());
494
495 Value *NewSelect = Builder.CreateSelect(C: Sel->getCondition(),
496 True: FoldedT, False: FoldedF);
497 NewSelect->takeName(V: &BO);
498 BO.replaceAllUsesWith(V: NewSelect);
499 DeadVals.push_back(Elt: &BO);
500 if (CastOp)
501 DeadVals.push_back(Elt: CastOp);
502 DeadVals.push_back(Elt: Sel);
503 return true;
504}
505
506std::pair<Value *, Value *>
507AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
508 Value *Src) const {
509 Type *Ty = Src->getType();
510 Value *Frexp = Builder.CreateIntrinsic(ID: Intrinsic::frexp,
511 OverloadTypes: {Ty, Builder.getInt32Ty()}, Args: Src);
512 Value *FrexpMant = Builder.CreateExtractValue(Agg: Frexp, Idxs: {0});
513
514 // Bypass the bug workaround for the exponent result since it doesn't matter.
515 // TODO: Does the bug workaround even really need to consider the exponent
516 // result? It's unspecified by the spec.
517
518 Value *FrexpExp =
519 ST.hasFractBug()
520 ? Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_frexp_exp,
521 OverloadTypes: {Builder.getInt32Ty(), Ty}, Args: Src)
522 : Builder.CreateExtractValue(Agg: Frexp, Idxs: {1});
523 return {FrexpMant, FrexpExp};
524}
525
526/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
527Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
528 Value *Src,
529 bool IsNegative) const {
530 // Same as for 1.0, but expand the sign out of the constant.
531 // -1.0 / x -> rcp (fneg x)
532 if (IsNegative)
533 Src = Builder.CreateFNeg(V: Src);
534
535 // The rcp instruction doesn't support denormals, so scale the input
536 // out of the denormal range and convert at the end.
537 //
538 // Expand as 2^-n * (1.0 / (x * 2^n))
539
540 // TODO: Skip scaling if input is known never denormal and the input
541 // range won't underflow to denormal. The hard part is knowing the
542 // result. We need a range check, the result could be denormal for
543 // 0x1p+126 < den <= 0x1p+127.
544 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
545 Value *ScaleFactor = Builder.CreateNeg(V: FrexpExp);
546 Value *Rcp = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, Op: FrexpMant);
547 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Rcp, ScaleFactor});
548}
549
550/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
551Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
552 Value *RHS,
553 FastMathFlags FMF) const {
554 // If we have have to work around the fract/frexp bug, we're worse off than
555 // using the fdiv.fast expansion. The full safe expansion is faster if we have
556 // fast FMA.
557 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
558 (!FMF.noNaNs() || !FMF.noInfs()))
559 return nullptr;
560
561 // We're scaling the LHS to avoid a denormal input, and scale the denominator
562 // to avoid large values underflowing the result.
563 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, Src: RHS);
564
565 Value *Rcp =
566 Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, Op: FrexpMantRHS);
567
568 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, Src: LHS);
569 Value *Mul = Builder.CreateFMul(L: FrexpMantLHS, R: Rcp);
570
571 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
572 // result.
573 Value *ExpDiff = Builder.CreateSub(LHS: FrexpExpLHS, RHS: FrexpExpRHS);
574 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Mul, ExpDiff});
575}
576
577/// Emit a sqrt that handles denormals and is accurate to 2ulp.
578Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
579 Value *Src,
580 FastMathFlags FMF) const {
581 Type *Ty = Src->getType();
582 APFloat SmallestNormal =
583 APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
584 Value *NeedScale =
585 Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
586
587 ConstantInt *Zero = Builder.getInt32(C: 0);
588 Value *InputScaleFactor =
589 Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: 32), False: Zero);
590
591 Value *Scaled = Builder.CreateCall(Callee: getLdexpF32(), Args: {Src, InputScaleFactor});
592
593 Value *Sqrt = Builder.CreateCall(Callee: getSqrtF32(), Args: Scaled);
594
595 Value *OutputScaleFactor =
596 Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: -16), False: Zero);
597 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Sqrt, OutputScaleFactor});
598}
599
600/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
601static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
602 bool IsNegative) {
603 // bool need_scale = x < 0x1p-126f;
604 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
605 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
606 // rsq(x * input_scale) * output_scale;
607
608 Type *Ty = Src->getType();
609 APFloat SmallestNormal =
610 APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
611 Value *NeedScale =
612 Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
613 Constant *One = ConstantFP::get(Ty, V: 1.0);
614 Constant *InputScale = ConstantFP::get(Ty, V: 0x1.0p+24);
615 Constant *OutputScale =
616 ConstantFP::get(Ty, V: IsNegative ? -0x1.0p+12 : 0x1.0p+12);
617
618 Value *InputScaleFactor = Builder.CreateSelect(C: NeedScale, True: InputScale, False: One);
619
620 Value *ScaledInput = Builder.CreateFMul(L: Src, R: InputScaleFactor);
621 Value *Rsq = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, Op: ScaledInput);
622 Value *OutputScaleFactor = Builder.CreateSelect(
623 C: NeedScale, True: OutputScale, False: IsNegative ? ConstantFP::get(Ty, V: -1.0) : One);
624
625 return Builder.CreateFMul(L: Rsq, R: OutputScaleFactor);
626}
627
628/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
629/// v_rsq_f64. This should give a 1ulp result.
630Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
631 FastMathFlags SqrtFMF,
632 FastMathFlags DivFMF,
633 const Instruction *CtxI,
634 bool IsNegative) const {
635 // rsq(x):
636 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
637 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
638 // return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
639 //
640 // -rsq(x):
641 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
642 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
643 // return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
644 //
645 // The rsq instruction handles the special cases correctly. We need to check
646 // for the edge case conditions to ensure the special case propagates through
647 // the later instructions.
648
649 Value *Y0 = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, Op: X);
650
651 // Try to elide the edge case check.
652 //
653 // Fast math flags imply:
654 // sqrt ninf => !isinf(x)
655 // fdiv ninf => x != 0, !isinf(x)
656 bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
657 bool MaybeZero = !DivFMF.noInfs();
658
659 DenormalMode DenormMode;
660 FPClassTest Interested = fcNone;
661 if (MaybePosInf)
662 Interested = fcPosInf;
663 if (MaybeZero)
664 Interested |= fcZero;
665
666 if (Interested != fcNone) {
667 KnownFPClass KnownSrc = computeKnownFPClass(V: X, Interested, CtxI);
668 if (KnownSrc.isKnownNeverPosInfinity())
669 MaybePosInf = false;
670
671 DenormMode = F.getDenormalMode(FPType: X->getType()->getFltSemantics());
672 if (KnownSrc.isKnownNeverLogicalZero(Mode: DenormMode))
673 MaybeZero = false;
674 }
675
676 Value *SpecialOrRsq = X;
677 if (MaybeZero || MaybePosInf) {
678 Value *Cond;
679 if (MaybePosInf && MaybeZero) {
680 if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
681 FPClassTest TestMask = fcPosInf | fcZero;
682 if (DenormMode.inputsAreZero())
683 TestMask |= fcSubnormal;
684
685 Cond = Builder.createIsFPClass(FPNum: X, Test: TestMask);
686 } else {
687 // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
688 // doesn't respect the floating-point environment.
689 Value *IsZero =
690 Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getZero(Ty: X->getType()));
691 Value *IsInf =
692 Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getInfinity(Ty: X->getType()));
693 Cond = Builder.CreateOr(LHS: IsZero, RHS: IsInf);
694 }
695 } else if (MaybeZero) {
696 Cond = Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getZero(Ty: X->getType()));
697 } else {
698 Cond = Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getInfinity(Ty: X->getType()));
699 }
700
701 SpecialOrRsq = Builder.CreateSelect(C: Cond, True: Y0, False: X);
702 }
703
704 Value *NegY0 = Builder.CreateFNeg(V: Y0);
705 Value *NegXY0 = Builder.CreateFMul(L: SpecialOrRsq, R: NegY0);
706
707 // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
708 Value *E = Builder.CreateFMA(Factor1: NegXY0, Factor2: Y0, Summand: ConstantFP::get(Ty: X->getType(), V: 1.0));
709
710 Value *Y0E = Builder.CreateFMul(L: E, R: IsNegative ? NegY0 : Y0);
711
712 Value *EFMA = Builder.CreateFMA(Factor1: E, Factor2: ConstantFP::get(Ty: X->getType(), V: 0.375),
713 Summand: ConstantFP::get(Ty: X->getType(), V: 0.5));
714
715 return Builder.CreateFMA(Factor1: Y0E, Factor2: EFMA, Summand: IsNegative ? NegY0 : Y0);
716}
717
718bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
719 FastMathFlags SqrtFMF) const {
720 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
721 // f64.
722 return DivFMF.allowContract() && SqrtFMF.allowContract();
723}
724
725Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
726 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
727 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
728 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
729 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
730
731 // rsq_f16 is accurate to 0.51 ulp.
732 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
733 // rsq_f64 is never accurate.
734 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num);
735 if (!CLHS)
736 return nullptr;
737
738 bool IsNegative = false;
739
740 // TODO: Handle other numerator values with arcp.
741 if (CLHS->isOne() || (IsNegative = CLHS->isMinusOne())) {
742 // Add in the sqrt flags.
743 IRBuilder<>::FastMathFlagGuard Guard(Builder);
744 Builder.setFastMathFlags(DivFMF | SqrtFMF);
745
746 if (Den->getType()->isFloatTy()) {
747 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
748 canIgnoreDenormalInput(V: Den, CtxI)) {
749 Value *Result =
750 Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, Op: Den);
751 // -1.0 / sqrt(x) -> fneg(rsq(x))
752 return IsNegative ? Builder.CreateFNeg(V: Result) : Result;
753 }
754
755 return emitRsqIEEE1ULP(Builder, Src: Den, IsNegative);
756 }
757
758 if (Den->getType()->isDoubleTy())
759 return emitRsqF64(Builder, X: Den, SqrtFMF, DivFMF, CtxI, IsNegative);
760 }
761
762 return nullptr;
763}
764
765// Optimize fdiv with rcp:
766//
767// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
768// allowed with afn.
769//
770// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
771Value *
772AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
773 Value *Den, FastMathFlags FMF,
774 const Instruction *CtxI) const {
775 // rcp_f16 is accurate to 0.51 ulp.
776 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
777 // rcp_f64 is never accurate.
778 assert(Den->getType()->isFloatTy());
779
780 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num)) {
781 bool IsNegative = false;
782 if (CLHS->isOne() || (IsNegative = CLHS->isMinusOne())) {
783 Value *Src = Den;
784
785 if (HasFP32DenormalFlush || FMF.approxFunc()) {
786 // -1.0 / x -> 1.0 / fneg(x)
787 if (IsNegative)
788 Src = Builder.CreateFNeg(V: Src);
789
790 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
791 // the CI documentation has a worst case error of 1 ulp.
792 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
793 // to use it as long as we aren't trying to use denormals.
794 //
795 // v_rcp_f16 and v_rsq_f16 DO support denormals.
796
797 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
798 // insert rsq intrinsic here.
799
800 // 1.0 / x -> rcp(x)
801 return Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, Op: Src);
802 }
803
804 // TODO: If the input isn't denormal, and we know the input exponent isn't
805 // big enough to introduce a denormal we can avoid the scaling.
806 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
807 }
808 }
809
810 if (FMF.allowReciprocal()) {
811 // x / y -> x * (1.0 / y)
812
813 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
814 // will never underflow.
815 if (HasFP32DenormalFlush || FMF.approxFunc()) {
816 Value *Recip = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, Op: Den);
817 return Builder.CreateFMul(L: Num, R: Recip);
818 }
819
820 Value *Recip = emitRcpIEEE1ULP(Builder, Src: Den, IsNegative: false);
821 return Builder.CreateFMul(L: Num, R: Recip);
822 }
823
824 return nullptr;
825}
826
827// optimize with fdiv.fast:
828//
829// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
830//
831// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
832//
833// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
834Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
835 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
836 // fdiv.fast can achieve 2.5 ULP accuracy.
837 if (ReqdAccuracy < 2.5f)
838 return nullptr;
839
840 // Only have fdiv.fast for f32.
841 assert(Den->getType()->isFloatTy());
842
843 bool NumIsOne = false;
844 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Val: Num)) {
845 if (CNum->isOne() || CNum->isMinusOne())
846 NumIsOne = true;
847 }
848
849 // fdiv does not support denormals. But 1.0/x is always fine to use it.
850 //
851 // TODO: This works for any value with a specific known exponent range, don't
852 // just limit to constant 1.
853 if (!HasFP32DenormalFlush && !NumIsOne)
854 return nullptr;
855
856 return Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_fdiv_fast, Args: {Num, Den});
857}
858
859Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
860 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
861 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
862 float ReqdDivAccuracy) const {
863 if (RsqOp) {
864 Value *Rsq =
865 optimizeWithRsq(Builder, Num, Den: RsqOp, DivFMF, SqrtFMF, CtxI: FDivInst);
866 if (Rsq)
867 return Rsq;
868 }
869
870 if (!Num->getType()->isFloatTy())
871 return nullptr;
872
873 Value *Rcp = optimizeWithRcp(Builder, Num, Den, FMF: DivFMF, CtxI: FDivInst);
874 if (Rcp)
875 return Rcp;
876
877 // In the basic case fdiv_fast has the same instruction count as the frexp div
878 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
879 // potentially be fused into a user. Also, materialization of the constants
880 // can be reused for multiple instances.
881 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdAccuracy: ReqdDivAccuracy);
882 if (FDivFast)
883 return FDivFast;
884
885 return emitFrexpDiv(Builder, LHS: Num, RHS: Den, FMF: DivFMF);
886}
887
888// Optimizations is performed based on fpmath, fast math flags as well as
889// denormals to optimize fdiv with either rcp or fdiv.fast.
890//
891// With rcp:
892// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
893// allowed with afn.
894//
895// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
896//
897// With fdiv.fast:
898// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
899//
900// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
901//
902// NOTE: rcp is the preference in cases that both are legal.
903bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
904 if (DisableFDivExpand)
905 return false;
906
907 Type *Ty = FDiv.getType()->getScalarType();
908 const bool IsFloat = Ty->isFloatTy();
909 if (!IsFloat && !Ty->isDoubleTy())
910 return false;
911
912 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
913 // expansion around them in codegen. f16 is good enough to always use.
914
915 const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &FDiv);
916 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
917 const float ReqdAccuracy = FPOp->getFPAccuracy();
918
919 FastMathFlags SqrtFMF;
920
921 Value *Num = FDiv.getOperand(i_nocapture: 0);
922 Value *Den = FDiv.getOperand(i_nocapture: 1);
923
924 Value *RsqOp = nullptr;
925 auto *DenII = dyn_cast<IntrinsicInst>(Val: Den);
926 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
927 DenII->hasOneUse()) {
928 const auto *SqrtOp = cast<FPMathOperator>(Val: DenII);
929 SqrtFMF = SqrtOp->getFastMathFlags();
930 if (canOptimizeWithRsq(DivFMF, SqrtFMF))
931 RsqOp = SqrtOp->getOperand(i: 0);
932 }
933
934 // rcp path not yet implemented for f64.
935 if (!IsFloat && !RsqOp)
936 return false;
937
938 // Inaccurate rcp is allowed with afn.
939 //
940 // Defer to codegen to handle this.
941 //
942 // TODO: Decide on an interpretation for interactions between afn + arcp +
943 // !fpmath, and make it consistent between here and codegen. For now, defer
944 // expansion of afn to codegen. The current interpretation is so aggressive we
945 // don't need any pre-consideration here when we have better information. A
946 // more conservative interpretation could use handling here.
947 const bool AllowInaccurateRcp = DivFMF.approxFunc();
948 if (!RsqOp && AllowInaccurateRcp)
949 return false;
950
951 // Defer the correct implementations to codegen.
952 if (IsFloat && ReqdAccuracy < 1.0f)
953 return false;
954
955 IRBuilder<> Builder(FDiv.getParent(), std::next(x: FDiv.getIterator()));
956 Builder.setFastMathFlags(DivFMF);
957 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
958
959 SmallVector<Value *, 4> NumVals;
960 SmallVector<Value *, 4> DenVals;
961 SmallVector<Value *, 4> RsqDenVals;
962 extractValues(Builder, Values&: NumVals, V: Num);
963 extractValues(Builder, Values&: DenVals, V: Den);
964
965 if (RsqOp)
966 extractValues(Builder, Values&: RsqDenVals, V: RsqOp);
967
968 SmallVector<Value *, 4> ResultVals(NumVals.size());
969 for (int I = 0, E = NumVals.size(); I != E; ++I) {
970 Value *NumElt = NumVals[I];
971 Value *DenElt = DenVals[I];
972 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
973
974 Value *NewElt =
975 visitFDivElement(Builder, Num: NumElt, Den: DenElt, DivFMF, SqrtFMF, RsqOp: RsqDenElt,
976 FDivInst: cast<Instruction>(Val: FPOp), ReqdDivAccuracy: ReqdAccuracy);
977 if (!NewElt) {
978 // Keep the original, but scalarized.
979
980 // This has the unfortunate side effect of sometimes scalarizing when
981 // we're not going to do anything.
982 NewElt = Builder.CreateFDiv(L: NumElt, R: DenElt);
983 if (auto *NewEltInst = dyn_cast<Instruction>(Val: NewElt))
984 NewEltInst->copyMetadata(SrcInst: FDiv);
985 }
986
987 ResultVals[I] = NewElt;
988 }
989
990 Value *NewVal = insertValues(Builder, Ty: FDiv.getType(), Values&: ResultVals);
991
992 if (NewVal) {
993 FDiv.replaceAllUsesWith(V: NewVal);
994 NewVal->takeName(V: &FDiv);
995 DeadVals.push_back(Elt: &FDiv);
996 }
997
998 return true;
999}
1000
1001static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1002 Value *LHS, Value *RHS) {
1003 Type *I32Ty = Builder.getInt32Ty();
1004 Type *I64Ty = Builder.getInt64Ty();
1005
1006 Value *LHS_EXT64 = Builder.CreateZExt(V: LHS, DestTy: I64Ty);
1007 Value *RHS_EXT64 = Builder.CreateZExt(V: RHS, DestTy: I64Ty);
1008 Value *MUL64 = Builder.CreateMul(LHS: LHS_EXT64, RHS: RHS_EXT64);
1009 Value *Lo = Builder.CreateTrunc(V: MUL64, DestTy: I32Ty);
1010 Value *Hi = Builder.CreateLShr(LHS: MUL64, RHS: Builder.getInt64(C: 32));
1011 Hi = Builder.CreateTrunc(V: Hi, DestTy: I32Ty);
1012 return std::pair(Lo, Hi);
1013}
1014
1015static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1016 return getMul64(Builder, LHS, RHS).second;
1017}
1018
1019/// Figure out how many bits are really needed for this division.
1020/// \p MaxDivBits is an optimization hint to bypass the second
1021/// ComputeNumSignBits/computeKnownBits call if the first one is
1022/// insufficient.
1023unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1024 Value *Den,
1025 unsigned MaxDivBits,
1026 bool IsSigned) const {
1027 assert(Num->getType()->getScalarSizeInBits() ==
1028 Den->getType()->getScalarSizeInBits());
1029 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1030 if (IsSigned) {
1031 unsigned RHSSignBits = ComputeNumSignBits(Op: Den, DL: SQ.DL, AC: SQ.AC, CxtI: &I, DT: SQ.DT);
1032 // A sign bit needs to be reserved for shrinking.
1033 unsigned DivBits = SSBits - RHSSignBits + 1;
1034 if (DivBits > MaxDivBits)
1035 return SSBits;
1036
1037 unsigned LHSSignBits = ComputeNumSignBits(Op: Num, DL: SQ.DL, AC: SQ.AC, CxtI: &I);
1038
1039 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
1040 DivBits = SSBits - SignBits + 1;
1041 return DivBits;
1042 }
1043
1044 // All bits are used for unsigned division for Num or Den in range
1045 // (SignedMax, UnsignedMax].
1046 KnownBits Known = computeKnownBits(V: Den, Q: SQ.getWithInstruction(I: &I));
1047 unsigned RHSBits = Known.countMaxActiveBits();
1048 if (RHSBits > MaxDivBits)
1049 return SSBits;
1050
1051 Known = computeKnownBits(V: Num, Q: SQ.getWithInstruction(I: &I));
1052 unsigned LHSBits = Known.countMaxActiveBits();
1053
1054 unsigned DivBits = std::max(a: LHSBits, b: RHSBits);
1055 return DivBits;
1056}
1057
1058Value *AMDGPUCodeGenPrepareImpl::expandDivRemToFloat(IRBuilder<> &Builder,
1059 BinaryOperator &I,
1060 Value *Num, Value *Den,
1061 bool IsDiv,
1062 bool IsSigned) const {
1063 unsigned DivBits = getDivNumBits(I, Num, Den, MaxDivBits: 23, IsSigned);
1064
1065 if (DivBits > (IsSigned ? 23 : 22))
1066 return nullptr;
1067 return expandDivRemToFloatImpl(Builder, I, Num, Den, NumBits: DivBits, IsDiv,
1068 IsSigned);
1069}
1070
1071Value *AMDGPUCodeGenPrepareImpl::expandDivRemToFloatImpl(
1072 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1073 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1074
1075 // v_rcp_f32(float(X)) can have an error of 1 ulp.
1076 // This would cause incorrect calculation of Y/X if:
1077 // Y = (0x7FFFFF/X)*(X-0)-1
1078 // were allowed.
1079 //
1080 // For example,
1081 // (0x7FF6D3/0x000FE7) would erroneously produce 2060 instead of 2059.
1082 // (0x7FF8F5/0x007EFB) would erroneously produce 258 instead of 257.
1083 //
1084 // Thus, we conservatively restrict expandDivRemToFloatImpl to
1085 // [-0x40000,0x3FFFFF] for IsSigned
1086 // [0x000000,0x3FFFFF] for !IsSigned.
1087 assert(0 < DivBits && DivBits <= (IsSigned ? 23 : 22) &&
1088 "abs(Num) must be <= than 0x40000 for expandDivRemToFloatImpl to work "
1089 "correctly");
1090
1091 Type *I32Ty = Builder.getInt32Ty();
1092 Num = Builder.CreateTrunc(V: Num, DestTy: I32Ty);
1093 Den = Builder.CreateTrunc(V: Den, DestTy: I32Ty);
1094
1095 Type *F32Ty = Builder.getFloatTy();
1096 ConstantInt *One = Builder.getInt32(C: 1);
1097 Value *JQ = One;
1098
1099 if (IsSigned) {
1100 // char|short jq = ia ^ ib;
1101 JQ = Builder.CreateXor(LHS: Num, RHS: Den);
1102
1103 // jq = jq >> (bitsize - 2)
1104 JQ = Builder.CreateAShr(LHS: JQ, RHS: Builder.getInt32(C: 30));
1105
1106 // jq = jq | 0x1
1107 JQ = Builder.CreateOr(LHS: JQ, RHS: One);
1108 }
1109
1110 // int ia = (int)LHS;
1111 Value *IA = Num;
1112
1113 // int ib, (int)RHS;
1114 Value *IB = Den;
1115
1116 // float fa = (float)ia;
1117 Value *FA = IsSigned ? Builder.CreateSIToFP(V: IA, DestTy: F32Ty)
1118 : Builder.CreateUIToFP(V: IA, DestTy: F32Ty);
1119
1120 // float fb = (float)ib;
1121 Value *FB = IsSigned ? Builder.CreateSIToFP(V: IB,DestTy: F32Ty)
1122 : Builder.CreateUIToFP(V: IB,DestTy: F32Ty);
1123
1124 Value *RCP = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_rcp,
1125 OverloadTypes: Builder.getFloatTy(), Args: {FB});
1126 Value *FQM = Builder.CreateFMul(L: FA, R: RCP);
1127
1128 // fq = trunc(fqm);
1129 Value *FQ = Builder.CreateUnaryIntrinsic(ID: Intrinsic::trunc, Op: FQM);
1130 auto *FQI = dyn_cast<Instruction>(Val: FQ);
1131 if (FQI)
1132 FQI->copyFastMathFlags(FMF: Builder.getFastMathFlags());
1133
1134 // float fqneg = -fq;
1135 Value *FQNeg = Builder.CreateFNeg(V: FQ);
1136
1137 // float fr = mad(fqneg, fb, fa);
1138 auto FMAD = !ST.hasMadMacF32Insts()
1139 ? Intrinsic::fma
1140 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1141 Value *FR =
1142 Builder.CreateIntrinsic(ID: FMAD, OverloadTypes: {FQNeg->getType()}, Args: {FQNeg, FB, FA}, FMFSource: FQI);
1143
1144 // int iq = (int)fq;
1145 Value *IQ = IsSigned ? Builder.CreateFPToSI(V: FQ, DestTy: I32Ty)
1146 : Builder.CreateFPToUI(V: FQ, DestTy: I32Ty);
1147
1148 // fr = fabs(fr);
1149 FR = Builder.CreateFAbs(V: FR, FMFSource: FQI);
1150
1151 // fb = fabs(fb);
1152 FB = Builder.CreateFAbs(V: FB, FMFSource: FQI);
1153
1154 // int cv = fr >= fb;
1155 Value *CV = Builder.CreateFCmpOGE(LHS: FR, RHS: FB);
1156
1157 // jq = (cv ? jq : 0);
1158 JQ = Builder.CreateSelect(C: CV, True: JQ, False: Builder.getInt32(C: 0));
1159
1160 // dst = iq + jq;
1161 Value *Div = Builder.CreateAdd(LHS: IQ, RHS: JQ);
1162
1163 Value *Res = Div;
1164 if (!IsDiv) {
1165 // Rem needs compensation, it's easier to recompute it
1166 Value *Rem = Builder.CreateMul(LHS: Div, RHS: Den);
1167 Res = Builder.CreateSub(LHS: Num, RHS: Rem);
1168 }
1169
1170 return Res;
1171}
1172
1173// Try to recognize special cases the DAG will emit special, better expansions
1174// than the general expansion we do here.
1175
1176// TODO: It would be better to just directly handle those optimizations here.
1177bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1178 Value *Num,
1179 Value *Den) const {
1180 if (Constant *C = dyn_cast<Constant>(Val: Den)) {
1181 // Arbitrary constants get a better expansion as long as a wider mulhi is
1182 // legal.
1183 if (C->getType()->getScalarSizeInBits() <= 32)
1184 return true;
1185
1186 // TODO: Sdiv check for not exact for some reason.
1187
1188 // If there's no wider mulhi, there's only a better expansion for powers of
1189 // two.
1190 // TODO: Should really know for each vector element.
1191 if (isKnownToBeAPowerOfTwo(V: C, OrZero: true, Q: SQ.getWithInstruction(I: &I)))
1192 return true;
1193
1194 return false;
1195 }
1196
1197 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Val: Den)) {
1198 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1199 if (BinOpDen->getOpcode() == Instruction::Shl &&
1200 isa<Constant>(Val: BinOpDen->getOperand(i_nocapture: 0)) &&
1201 isKnownToBeAPowerOfTwo(V: BinOpDen->getOperand(i_nocapture: 0), OrZero: true,
1202 Q: SQ.getWithInstruction(I: &I))) {
1203 return true;
1204 }
1205 }
1206
1207 return false;
1208}
1209
1210static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1211 // Check whether the sign can be determined statically.
1212 KnownBits Known = computeKnownBits(V, DL);
1213 if (Known.isNegative())
1214 return Constant::getAllOnesValue(Ty: V->getType());
1215 if (Known.isNonNegative())
1216 return Constant::getNullValue(Ty: V->getType());
1217 return Builder.CreateAShr(LHS: V, RHS: Builder.getInt32(C: 31));
1218}
1219
1220Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1221 BinaryOperator &I, Value *X,
1222 Value *Y) const {
1223 Instruction::BinaryOps Opc = I.getOpcode();
1224 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1225 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1226
1227 FastMathFlags FMF;
1228 FMF.setFast();
1229 Builder.setFastMathFlags(FMF);
1230
1231 if (divHasSpecialOptimization(I, Num: X, Den: Y))
1232 return nullptr; // Keep it for later optimization.
1233
1234 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1235 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1236
1237 Type *Ty = X->getType();
1238 Type *I32Ty = Builder.getInt32Ty();
1239 Type *F32Ty = Builder.getFloatTy();
1240
1241 if (Ty->getScalarSizeInBits() != 32) {
1242 if (IsSigned) {
1243 X = Builder.CreateSExtOrTrunc(V: X, DestTy: I32Ty);
1244 Y = Builder.CreateSExtOrTrunc(V: Y, DestTy: I32Ty);
1245 } else {
1246 X = Builder.CreateZExtOrTrunc(V: X, DestTy: I32Ty);
1247 Y = Builder.CreateZExtOrTrunc(V: Y, DestTy: I32Ty);
1248 }
1249 }
1250
1251 if (Value *Res = expandDivRemToFloat(Builder, I, Num: X, Den: Y, IsDiv, IsSigned)) {
1252 return IsSigned ? Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty) :
1253 Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
1254 }
1255
1256 ConstantInt *Zero = Builder.getInt32(C: 0);
1257 ConstantInt *One = Builder.getInt32(C: 1);
1258
1259 Value *Sign = nullptr;
1260 if (IsSigned) {
1261 Value *SignX = getSign32(V: X, Builder, DL);
1262 Value *SignY = getSign32(V: Y, Builder, DL);
1263 // Remainder sign is the same as LHS
1264 Sign = IsDiv ? Builder.CreateXor(LHS: SignX, RHS: SignY) : SignX;
1265
1266 X = Builder.CreateAdd(LHS: X, RHS: SignX);
1267 Y = Builder.CreateAdd(LHS: Y, RHS: SignY);
1268
1269 X = Builder.CreateXor(LHS: X, RHS: SignX);
1270 Y = Builder.CreateXor(LHS: Y, RHS: SignY);
1271 }
1272
1273 // The algorithm here is based on ideas from "Software Integer Division", Tom
1274 // Rodeheffer, August 2008.
1275 //
1276 // unsigned udiv(unsigned x, unsigned y) {
1277 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1278 // // that this is a lower bound on inv(y), even if some of the calculations
1279 // // round up.
1280 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1281 //
1282 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1283 // // Empirically this is guaranteed to give a "two-y" lower bound on
1284 // // inv(y).
1285 // z += umulh(z, -y * z);
1286 //
1287 // // Quotient/remainder estimate.
1288 // unsigned q = umulh(x, z);
1289 // unsigned r = x - q * y;
1290 //
1291 // // Two rounds of quotient/remainder refinement.
1292 // if (r >= y) {
1293 // ++q;
1294 // r -= y;
1295 // }
1296 // if (r >= y) {
1297 // ++q;
1298 // r -= y;
1299 // }
1300 //
1301 // return q;
1302 // }
1303
1304 // Initial estimate of inv(y).
1305 Value *FloatY = Builder.CreateUIToFP(V: Y, DestTy: F32Ty);
1306 Value *RcpY = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_rcp, OverloadTypes: F32Ty, Args: {FloatY});
1307 Constant *Scale = ConstantFP::get(Ty: F32Ty, V: llvm::bit_cast<float>(from: 0x4F7FFFFE));
1308 Value *ScaledY = Builder.CreateFMul(L: RcpY, R: Scale);
1309 Value *Z = Builder.CreateFPToUI(V: ScaledY, DestTy: I32Ty);
1310
1311 // One round of UNR.
1312 Value *NegY = Builder.CreateSub(LHS: Zero, RHS: Y);
1313 Value *NegYZ = Builder.CreateMul(LHS: NegY, RHS: Z);
1314 Z = Builder.CreateAdd(LHS: Z, RHS: getMulHu(Builder, LHS: Z, RHS: NegYZ));
1315
1316 // Quotient/remainder estimate.
1317 Value *Q = getMulHu(Builder, LHS: X, RHS: Z);
1318 Value *R = Builder.CreateSub(LHS: X, RHS: Builder.CreateMul(LHS: Q, RHS: Y));
1319
1320 // First quotient/remainder refinement.
1321 Value *Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
1322 if (IsDiv)
1323 Q = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
1324 R = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);
1325
1326 // Second quotient/remainder refinement.
1327 Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
1328 Value *Res;
1329 if (IsDiv)
1330 Res = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
1331 else
1332 Res = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);
1333
1334 if (IsSigned) {
1335 Res = Builder.CreateXor(LHS: Res, RHS: Sign);
1336 Res = Builder.CreateSub(LHS: Res, RHS: Sign);
1337 Res = Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty);
1338 } else {
1339 Res = Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
1340 }
1341 return Res;
1342}
1343
1344Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1345 BinaryOperator &I, Value *Num,
1346 Value *Den) const {
1347 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1348 return nullptr; // Keep it for later optimization.
1349
1350 Instruction::BinaryOps Opc = I.getOpcode();
1351
1352 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1353 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1354
1355 unsigned NumDivBits = getDivNumBits(I, Num, Den, MaxDivBits: 32, IsSigned);
1356 if (NumDivBits > 32)
1357 return nullptr;
1358
1359 Value *Narrowed = nullptr;
1360 if (NumDivBits <= (IsSigned ? 23 : 22)) {
1361 Narrowed = expandDivRemToFloatImpl(Builder, I, Num, Den, DivBits: NumDivBits, IsDiv,
1362 IsSigned);
1363 } else if (NumDivBits <= (IsSigned ? 31 : 32)) {
1364 // Do not use 32-bit division if dividend may be -2147483648.
1365 // Otherwise 32-bit division cannot be used safely.
1366 // -2147483648/1 and -2147483648/-1 are not equal,
1367 // but they produce the same lower 32-bit result.
1368 Narrowed = expandDivRem32(Builder, I, X: Num, Y: Den);
1369 }
1370
1371 if (Narrowed) {
1372 return IsSigned ? Builder.CreateSExt(V: Narrowed, DestTy: Num->getType()) :
1373 Builder.CreateZExt(V: Narrowed, DestTy: Num->getType());
1374 }
1375
1376 return nullptr;
1377}
1378
1379void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1380 Instruction::BinaryOps Opc = I.getOpcode();
1381 // Do the general expansion.
1382 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1383 expandDivisionUpTo64Bits(Div: &I);
1384 return;
1385 }
1386
1387 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1388 expandRemainderUpTo64Bits(Rem: &I);
1389 return;
1390 }
1391
1392 llvm_unreachable("not a division");
1393}
1394
1395/*
1396This will cause non-byte load in consistency, for example:
1397```
1398 %load = load i1, ptr addrspace(4) %arg, align 4
1399 %zext = zext i1 %load to
1400 i64 %add = add i64 %zext
1401```
1402Instead of creating `s_and_b32 s0, s0, 1`,
1403it will create `s_and_b32 s0, s0, 0xff`.
1404We accept this change since the non-byte load assumes the upper bits
1405within the byte are all 0.
1406*/
1407bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1408 unsigned Opc = I->getOpcode();
1409 Type *OldType = I->getType();
1410
1411 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1412 return false;
1413
1414 unsigned OrigBit = OldType->getScalarSizeInBits();
1415
1416 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1417 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1418 "Instruction::Mul.");
1419
1420 unsigned MaxBitsNeeded = computeKnownBits(V: I, DL).countMaxActiveBits();
1421
1422 MaxBitsNeeded = std::max<unsigned>(a: bit_ceil(Value: MaxBitsNeeded), b: 8);
1423 Type *NewType = DL.getSmallestLegalIntType(C&: I->getContext(), Width: MaxBitsNeeded);
1424 if (!NewType)
1425 return false;
1426 unsigned NewBit = NewType->getIntegerBitWidth();
1427 if (NewBit >= OrigBit)
1428 return false;
1429 NewType = I->getType()->getWithNewBitWidth(NewBitWidth: NewBit);
1430
1431 // Old cost
1432 const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
1433 InstructionCost OldCost =
1434 TTI.getArithmeticInstrCost(Opcode: Opc, Ty: OldType, CostKind: TTI::TCK_RecipThroughput);
1435 // New cost of new op
1436 InstructionCost NewCost =
1437 TTI.getArithmeticInstrCost(Opcode: Opc, Ty: NewType, CostKind: TTI::TCK_RecipThroughput);
1438 // New cost of narrowing 2 operands (use trunc)
1439 int NumOfNonConstOps = 2;
1440 if (isa<Constant>(Val: I->getOperand(i: 0)) || isa<Constant>(Val: I->getOperand(i: 1))) {
1441 // Cannot be both constant, should be propagated
1442 NumOfNonConstOps = 1;
1443 }
1444 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Opcode: Instruction::Trunc,
1445 Dst: NewType, Src: OldType,
1446 CCH: TTI.getCastContextHint(I),
1447 CostKind: TTI::TCK_RecipThroughput);
1448 // New cost of zext narrowed result to original type
1449 NewCost +=
1450 TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: OldType, Src: NewType,
1451 CCH: TTI.getCastContextHint(I), CostKind: TTI::TCK_RecipThroughput);
1452 if (NewCost >= OldCost)
1453 return false;
1454
1455 IRBuilder<> Builder(I);
1456 Value *Trunc0 = Builder.CreateTrunc(V: I->getOperand(i: 0), DestTy: NewType);
1457 Value *Trunc1 = Builder.CreateTrunc(V: I->getOperand(i: 1), DestTy: NewType);
1458 Value *Arith =
1459 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opc, LHS: Trunc0, RHS: Trunc1);
1460
1461 Value *Zext = Builder.CreateZExt(V: Arith, DestTy: OldType);
1462 I->replaceAllUsesWith(V: Zext);
1463 DeadVals.push_back(Elt: I);
1464 return true;
1465}
1466
1467bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1468 if (foldBinOpIntoSelect(BO&: I))
1469 return true;
1470
1471 if (UseMul24Intrin && replaceMulWithMul24(I))
1472 return true;
1473 if (tryNarrowMathIfNoOverflow(I: &I))
1474 return true;
1475
1476 bool Changed = false;
1477 Instruction::BinaryOps Opc = I.getOpcode();
1478 Type *Ty = I.getType();
1479 Value *NewDiv = nullptr;
1480 unsigned ScalarSize = Ty->getScalarSizeInBits();
1481
1482 SmallVector<BinaryOperator *, 8> Div64ToExpand;
1483
1484 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1485 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1486 ScalarSize <= 64 &&
1487 !DisableIDivExpand) {
1488 Value *Num = I.getOperand(i_nocapture: 0);
1489 Value *Den = I.getOperand(i_nocapture: 1);
1490 IRBuilder<> Builder(&I);
1491 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1492
1493 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1494 NewDiv = PoisonValue::get(T: VT);
1495
1496 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1497 Value *NumEltN = Builder.CreateExtractElement(Vec: Num, Idx: N);
1498 Value *DenEltN = Builder.CreateExtractElement(Vec: Den, Idx: N);
1499
1500 Value *NewElt;
1501 if (ScalarSize <= 32) {
1502 NewElt = expandDivRem32(Builder, I, X: NumEltN, Y: DenEltN);
1503 if (!NewElt)
1504 NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
1505 } else {
1506 // See if this 64-bit division can be shrunk to 32/24-bits before
1507 // producing the general expansion.
1508 NewElt = shrinkDivRem64(Builder, I, Num: NumEltN, Den: DenEltN);
1509 if (!NewElt) {
1510 // The general 64-bit expansion introduces control flow and doesn't
1511 // return the new value. Just insert a scalar copy and defer
1512 // expanding it.
1513 NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
1514 // CreateBinOp does constant folding. If the operands are constant,
1515 // it will return a Constant instead of a BinaryOperator.
1516 if (auto *NewEltBO = dyn_cast<BinaryOperator>(Val: NewElt))
1517 Div64ToExpand.push_back(Elt: NewEltBO);
1518 }
1519 }
1520
1521 if (auto *NewEltI = dyn_cast<Instruction>(Val: NewElt))
1522 NewEltI->copyIRFlags(V: &I);
1523
1524 NewDiv = Builder.CreateInsertElement(Vec: NewDiv, NewElt, Idx: N);
1525 }
1526 } else {
1527 if (ScalarSize <= 32)
1528 NewDiv = expandDivRem32(Builder, I, X: Num, Y: Den);
1529 else {
1530 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1531 if (!NewDiv)
1532 Div64ToExpand.push_back(Elt: &I);
1533 }
1534 }
1535
1536 if (NewDiv) {
1537 I.replaceAllUsesWith(V: NewDiv);
1538 DeadVals.push_back(Elt: &I);
1539 Changed = true;
1540 }
1541 }
1542
1543 if (ExpandDiv64InIR) {
1544 // TODO: We get much worse code in specially handled constant cases.
1545 for (BinaryOperator *Div : Div64ToExpand) {
1546 expandDivRem64(I&: *Div);
1547 FlowChanged = true;
1548 Changed = true;
1549 }
1550 }
1551
1552 return Changed;
1553}
1554
1555bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1556 if (!WidenLoads)
1557 return false;
1558
1559 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1560 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1561 canWidenScalarExtLoad(I)) {
1562 IRBuilder<> Builder(&I);
1563 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1564
1565 Type *I32Ty = Builder.getInt32Ty();
1566 LoadInst *WidenLoad = Builder.CreateLoad(Ty: I32Ty, Ptr: I.getPointerOperand());
1567 AMDGPU::copyMetadataForWidenedLoad(Dest&: *WidenLoad, Source: I);
1568
1569 // The widened load reads the original bytes in the low bits, so a !range
1570 // lower bound still holds. Convert it to the new type and don't make
1571 // assumptions about the high bits.
1572 if (auto *Range = I.getMetadata(KindID: LLVMContext::MD_range)) {
1573 ConstantInt *Lower = mdconst::extract<ConstantInt>(MD: Range->getOperand(I: 0));
1574
1575 if (!Lower->isNullValue()) {
1576 Metadata *LowAndHigh[] = {
1577 ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: Lower->getValue().zext(width: 32))),
1578 // Don't make assumptions about the high bits.
1579 ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: 0))
1580 };
1581
1582 WidenLoad->setMetadata(KindID: LLVMContext::MD_range,
1583 Node: MDNode::get(Context&: F.getContext(), MDs: LowAndHigh));
1584 }
1585 }
1586
1587 int TySize = DL.getTypeSizeInBits(Ty: I.getType());
1588 Type *IntNTy = Builder.getIntNTy(N: TySize);
1589 Value *ValTrunc = Builder.CreateTrunc(V: WidenLoad, DestTy: IntNTy);
1590 Value *ValOrig = Builder.CreateBitCast(V: ValTrunc, DestTy: I.getType());
1591 I.replaceAllUsesWith(V: ValOrig);
1592 DeadVals.push_back(Elt: &I);
1593 return true;
1594 }
1595
1596 return false;
1597}
1598
1599bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1600 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: &I);
1601 if (!FPOp)
1602 return false;
1603
1604 Value *X;
1605 Value *Fract = nullptr;
1606
1607 // Match:
1608 // (x - floor(x)) >= MIN_CONSTANT ? MIN_CONSTANT : (x - floor(x))
1609 //
1610 // This is the preferred way to implement fract.
1611 // TODO: Could also match with compare against 1.0
1612 const APFloat *C;
1613 if (match(V: &I, P: m_UnordFMin(L: m_Value(V&: X), R: m_APFloatAllowPoison(Res&: C)))) {
1614 Value *FractSrc = matchFractPatImpl(V&: *X, C: *C);
1615 if (!FractSrc)
1616 return false;
1617 IRBuilder<> Builder(&I);
1618 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1619 Fract = applyFractPat(Builder, FractArg: FractSrc);
1620 } else {
1621 // Match patterns which may appear in legacy implementations of the fract()
1622 // function, built around the nan-avoidant minnum intrinsic. These are the
1623 // core pattern plus additional clamping of inf and nan values on the
1624 // result.
1625 Value *Cond = I.getCondition();
1626 Value *TrueVal = I.getTrueValue();
1627 Value *FalseVal = I.getFalseValue();
1628 Value *CmpVal;
1629 CmpPredicate IsNanPred;
1630
1631 // Match fract pattern with nan check.
1632 if (!match(V: Cond, P: m_FCmp(Pred&: IsNanPred, L: m_Value(V&: CmpVal), R: m_NonNaN())))
1633 return false;
1634
1635 IRBuilder<> Builder(&I);
1636 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1637
1638 if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
1639 CmpVal == matchFractPatNanAvoidant(V&: *FalseVal)) {
1640 // isnan(x) ? x : fract(x)
1641 Fract = applyFractPat(Builder, FractArg: CmpVal);
1642 } else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
1643 if (CmpVal == matchFractPatNanAvoidant(V&: *TrueVal)) {
1644 // !isnan(x) ? fract(x) : x
1645 Fract = applyFractPat(Builder, FractArg: CmpVal);
1646 } else {
1647 // Match an intermediate clamp infinity to 0 pattern. i.e.
1648 // !isnan(x) ? (!isinf(x) ? fract(x) : 0.0) : x
1649 CmpPredicate PredInf;
1650 Value *IfNotInf;
1651
1652 if (!match(V: TrueVal, P: m_Select(C: m_FCmp(Pred&: PredInf, L: m_FAbs(Op0: m_Specific(V: CmpVal)),
1653 R: m_PosInf()),
1654 L: m_Value(V&: IfNotInf), R: m_PosZeroFP())) ||
1655 PredInf != FCmpInst::FCMP_UNE ||
1656 CmpVal != matchFractPatNanAvoidant(V&: *IfNotInf))
1657 return false;
1658
1659 SelectInst *ClampInfSelect = cast<SelectInst>(Val: TrueVal);
1660
1661 // Insert before the fabs
1662 Value *InsertPt =
1663 cast<Instruction>(Val: ClampInfSelect->getCondition())->getOperand(i: 0);
1664
1665 Builder.SetInsertPoint(cast<Instruction>(Val: InsertPt));
1666 Value *NewFract = applyFractPat(Builder, FractArg: CmpVal);
1667 NewFract->takeName(V: TrueVal);
1668
1669 // Thread the new fract into the inf clamping sequence.
1670 DeadVals.push_back(Elt: ClampInfSelect->getOperand(i_nocapture: 1));
1671 ClampInfSelect->setOperand(i_nocapture: 1, Val_nocapture: NewFract);
1672
1673 // The outer select nan handling is also absorbed into the fract.
1674 Fract = ClampInfSelect;
1675 }
1676 } else
1677 return false;
1678 }
1679
1680 Fract->takeName(V: &I);
1681 I.replaceAllUsesWith(V: Fract);
1682 DeadVals.push_back(Elt: &I);
1683 return true;
1684}
1685
1686static bool areInSameBB(const Value *A, const Value *B) {
1687 const auto *IA = dyn_cast<Instruction>(Val: A);
1688 const auto *IB = dyn_cast<Instruction>(Val: B);
1689 return IA && IB && IA->getParent() == IB->getParent();
1690}
1691
1692// Helper for breaking large PHIs that returns true when an extractelement on V
1693// is likely to be folded away by the DAG combiner.
1694static bool isInterestingPHIIncomingValue(const Value *V) {
1695 const auto *FVT = dyn_cast<FixedVectorType>(Val: V->getType());
1696 if (!FVT)
1697 return false;
1698
1699 const Value *CurVal = V;
1700
1701 // Check for insertelements, keeping track of the elements covered.
1702 BitVector EltsCovered(FVT->getNumElements());
1703 while (const auto *IE = dyn_cast<InsertElementInst>(Val: CurVal)) {
1704 const auto *Idx = dyn_cast<ConstantInt>(Val: IE->getOperand(i_nocapture: 2));
1705
1706 // Non constant index/out of bounds index -> folding is unlikely.
1707 // The latter is more of a sanity check because canonical IR should just
1708 // have replaced those with poison.
1709 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1710 return false;
1711
1712 const auto *VecSrc = IE->getOperand(i_nocapture: 0);
1713
1714 // If the vector source is another instruction, it must be in the same basic
1715 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1716 // unlikely to be able to do anything interesting here.
1717 if (isa<Instruction>(Val: VecSrc) && !areInSameBB(A: VecSrc, B: IE))
1718 return false;
1719
1720 CurVal = VecSrc;
1721 EltsCovered.set(Idx->getZExtValue());
1722
1723 // All elements covered.
1724 if (EltsCovered.all())
1725 return true;
1726 }
1727
1728 // We either didn't find a single insertelement, or the insertelement chain
1729 // ended before all elements were covered. Check for other interesting values.
1730
1731 // Constants are always interesting because we can just constant fold the
1732 // extractelements.
1733 if (isa<Constant>(Val: CurVal))
1734 return true;
1735
1736 // shufflevector is likely to be profitable if either operand is a constant,
1737 // or if either source is in the same block.
1738 // This is because shufflevector is most often lowered as a series of
1739 // insert/extract elements anyway.
1740 if (const auto *SV = dyn_cast<ShuffleVectorInst>(Val: CurVal)) {
1741 return isa<Constant>(Val: SV->getOperand(i_nocapture: 1)) ||
1742 areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 0)) ||
1743 areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 1));
1744 }
1745
1746 return false;
1747}
1748
1749static void collectPHINodes(const PHINode &I,
1750 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1751 const auto [It, Inserted] = SeenPHIs.insert(Ptr: &I);
1752 if (!Inserted)
1753 return;
1754
1755 for (const Value *Inc : I.incoming_values()) {
1756 if (const auto *PhiInc = dyn_cast<PHINode>(Val: Inc))
1757 collectPHINodes(I: *PhiInc, SeenPHIs);
1758 }
1759
1760 for (const User *U : I.users()) {
1761 if (const auto *PhiU = dyn_cast<PHINode>(Val: U))
1762 collectPHINodes(I: *PhiU, SeenPHIs);
1763 }
1764}
1765
1766bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1767 // Check in the cache first.
1768 if (const auto It = BreakPhiNodesCache.find(Val: &I);
1769 It != BreakPhiNodesCache.end())
1770 return It->second;
1771
1772 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1773 // recursively consider all its users and incoming values that are also PHI
1774 // nodes. We then make a decision about all of those PHIs at once. Either they
1775 // all get broken up, or none of them do. That way, we avoid cases where a
1776 // single PHI is/is not broken and we end up reforming/exploding a vector
1777 // multiple times, or even worse, doing it in a loop.
1778 SmallPtrSet<const PHINode *, 8> WorkList;
1779 collectPHINodes(I, SeenPHIs&: WorkList);
1780
1781#ifndef NDEBUG
1782 // Check that none of the PHI nodes in the worklist are in the map. If some of
1783 // them are, it means we're not good enough at collecting related PHIs.
1784 for (const PHINode *WLP : WorkList) {
1785 assert(BreakPhiNodesCache.count(WLP) == 0);
1786 }
1787#endif
1788
1789 // To consider a PHI profitable to break, we need to see some interesting
1790 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1791 // must have one to consider all PHIs breakable.
1792 //
1793 // This threshold has been determined through performance testing.
1794 //
1795 // Note that the computation below is equivalent to
1796 //
1797 // (unsigned)ceil((K / 3.0) * 2)
1798 //
1799 // It's simply written this way to avoid mixing integral/FP arithmetic.
1800 const auto Threshold = (alignTo(Value: WorkList.size() * 2, Align: 3) / 3);
1801 unsigned NumBreakablePHIs = 0;
1802 bool CanBreak = false;
1803 for (const PHINode *Cur : WorkList) {
1804 // Don't break PHIs that have no interesting incoming values. That is, where
1805 // there is no clear opportunity to fold the "extractelement" instructions
1806 // we would add.
1807 //
1808 // Note: IC does not run after this pass, so we're only interested in the
1809 // foldings that the DAG combiner can do.
1810 if (any_of(Range: Cur->incoming_values(), P: isInterestingPHIIncomingValue)) {
1811 if (++NumBreakablePHIs >= Threshold) {
1812 CanBreak = true;
1813 break;
1814 }
1815 }
1816 }
1817
1818 for (const PHINode *Cur : WorkList)
1819 BreakPhiNodesCache[Cur] = CanBreak;
1820
1821 return CanBreak;
1822}
1823
1824/// Helper class for "break large PHIs" (visitPHINode).
1825///
1826/// This represents a slice of a PHI's incoming value, which is made up of:
1827/// - The type of the slice (Ty)
1828/// - The index in the incoming value's vector where the slice starts (Idx)
1829/// - The number of elements in the slice (NumElts).
1830/// It also keeps track of the NewPHI node inserted for this particular slice.
1831///
1832/// Slice examples:
1833/// <4 x i64> -> Split into four i64 slices.
1834/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1835/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1836/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1837class VectorSlice {
1838public:
1839 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1840 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1841
1842 Type *Ty = nullptr;
1843 unsigned Idx = 0;
1844 unsigned NumElts = 0;
1845 PHINode *NewPHI = nullptr;
1846
1847 /// Slice \p Inc according to the information contained within this slice.
1848 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1849 /// pair, it returns the same Sliced value as well.
1850 ///
1851 /// Note this *intentionally* does not return the same value for, say,
1852 /// [%bb.0, %0] & [%bb.1, %0] as:
1853 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1854 /// the value in bb.1 may not be reachable from bb.0 if it's its
1855 /// predecessor.)
1856 /// - We also want to make our extract instructions as local as possible so
1857 /// the DAG has better chances of folding them out. Duplicating them like
1858 /// that is beneficial in that regard.
1859 ///
1860 /// This is both a minor optimization to avoid creating duplicate
1861 /// instructions, but also a requirement for correctness. It is not forbidden
1862 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1863 /// returned a new value each time, those previously identical pairs would all
1864 /// have different incoming values (from the same block) and it'd cause a "PHI
1865 /// node has multiple entries for the same basic block with different incoming
1866 /// values!" verifier error.
1867 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1868 Value *&Res = SlicedVals[{BB, Inc}];
1869 if (Res)
1870 return Res;
1871
1872 IRBuilder<> B(BB->getTerminator());
1873 if (Instruction *IncInst = dyn_cast<Instruction>(Val: Inc))
1874 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1875
1876 if (NumElts > 1) {
1877 SmallVector<int, 4> Mask;
1878 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1879 Mask.push_back(Elt: K);
1880 Res = B.CreateShuffleVector(V: Inc, Mask, Name: NewValName);
1881 } else
1882 Res = B.CreateExtractElement(Vec: Inc, Idx, Name: NewValName);
1883
1884 return Res;
1885 }
1886
1887private:
1888 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1889};
1890
1891bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1892 // Break-up fixed-vector PHIs into smaller pieces.
1893 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1894 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1895 //
1896 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1897 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1898 // With large, odd-sized PHIs we may end up needing many `build_vector`
1899 // operations with most elements being "undef". This inhibits a lot of
1900 // optimization opportunities and can result in unreasonably high register
1901 // pressure and the inevitable stack spilling.
1902 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption ==
1903 cl::boolOrDefault::BOU_TRUE)
1904 return false;
1905
1906 FixedVectorType *FVT = dyn_cast<FixedVectorType>(Val: I.getType());
1907 if (!FVT || FVT->getNumElements() == 1 ||
1908 DL.getTypeSizeInBits(Ty: FVT) <= BreakLargePHIsThreshold)
1909 return false;
1910
1911 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1912 return false;
1913
1914 std::vector<VectorSlice> Slices;
1915
1916 Type *EltTy = FVT->getElementType();
1917 {
1918 unsigned Idx = 0;
1919 // For 8/16 bits type, don't scalarize fully but break it up into as many
1920 // 32-bit slices as we can, and scalarize the tail.
1921 const unsigned EltSize = DL.getTypeSizeInBits(Ty: EltTy);
1922 const unsigned NumElts = FVT->getNumElements();
1923 if (EltSize == 8 || EltSize == 16) {
1924 const unsigned SubVecSize = (32 / EltSize);
1925 Type *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: SubVecSize);
1926 for (unsigned End = alignDown(Value: NumElts, Align: SubVecSize); Idx < End;
1927 Idx += SubVecSize)
1928 Slices.emplace_back(args&: SubVecTy, args&: Idx, args: SubVecSize);
1929 }
1930
1931 // Scalarize all remaining elements.
1932 for (; Idx < NumElts; ++Idx)
1933 Slices.emplace_back(args&: EltTy, args&: Idx, args: 1);
1934 }
1935
1936 assert(Slices.size() > 1);
1937
1938 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1939 // creating the necessary instruction to extract the relevant slices of each
1940 // incoming value.
1941 IRBuilder<> B(I.getParent());
1942 B.SetCurrentDebugLocation(I.getDebugLoc());
1943
1944 unsigned IncNameSuffix = 0;
1945 for (VectorSlice &S : Slices) {
1946 // We need to reset the build on each iteration, because getSlicedVal may
1947 // have inserted something into I's BB.
1948 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1949 S.NewPHI = B.CreatePHI(Ty: S.Ty, NumReservedValues: I.getNumIncomingValues());
1950
1951 for (const auto &[Idx, BB] : enumerate(First: I.blocks())) {
1952 S.NewPHI->addIncoming(V: S.getSlicedVal(BB, Inc: I.getIncomingValue(i: Idx),
1953 NewValName: "largephi.extractslice" +
1954 std::to_string(val: IncNameSuffix++)),
1955 BB);
1956 }
1957 }
1958
1959 // And replace this PHI with a vector of all the previous PHI values.
1960 Value *Vec = PoisonValue::get(T: FVT);
1961 unsigned NameSuffix = 0;
1962 for (VectorSlice &S : Slices) {
1963 const auto ValName = "largephi.insertslice" + std::to_string(val: NameSuffix++);
1964 if (S.NumElts > 1)
1965 Vec = B.CreateInsertVector(DstType: FVT, SrcVec: Vec, SubVec: S.NewPHI, Idx: S.Idx, Name: ValName);
1966 else
1967 Vec = B.CreateInsertElement(Vec, NewElt: S.NewPHI, Idx: S.Idx, Name: ValName);
1968 }
1969
1970 I.replaceAllUsesWith(V: Vec);
1971 DeadVals.push_back(Elt: &I);
1972 return true;
1973}
1974
1975/// \param V Value to check
1976/// \param DL DataLayout
1977/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1978/// \param AS Target Address Space
1979/// \return true if \p V cannot be the null value of \p AS, false otherwise.
1980static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1981 const AMDGPUTargetMachine &TM, unsigned AS) {
1982 // Pointer cannot be null if it's a block address, GV or alloca.
1983 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1984 // it as the symbol could be null in such cases.
1985 if (isa<BlockAddress, GlobalValue, AllocaInst>(Val: V))
1986 return true;
1987
1988 // Check nonnull arguments.
1989 if (const auto *Arg = dyn_cast<Argument>(Val: V); Arg && Arg->hasNonNullAttr())
1990 return true;
1991
1992 // Check nonnull loads.
1993 if (const auto *Load = dyn_cast<LoadInst>(Val: V);
1994 Load && Load->hasMetadata(KindID: LLVMContext::MD_nonnull))
1995 return true;
1996
1997 // getUnderlyingObject may have looked through another addrspacecast, although
1998 // the optimizable situations most likely folded out by now.
1999 if (AS != cast<PointerType>(Val: V->getType())->getAddressSpace())
2000 return false;
2001
2002 // TODO: Calls that return nonnull?
2003
2004 // For all other things, use KnownBits.
2005 // We either use 0 or all bits set to indicate null, so check whether the
2006 // value can be zero or all ones.
2007 //
2008 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2009 // address spaces have non-zero null values.
2010 auto SrcPtrKB = computeKnownBits(V, DL);
2011 const auto NullVal = AMDGPU::getNullPointerValue(AS);
2012
2013 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2014 assert((NullVal == 0 || NullVal == -1) &&
2015 "don't know how to check for this null value!");
2016 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2017}
2018
2019bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2020 // Intrinsic doesn't support vectors, also it seems that it's often difficult
2021 // to prove that a vector cannot have any nulls in it so it's unclear if it's
2022 // worth supporting.
2023 if (I.getType()->isVectorTy())
2024 return false;
2025
2026 // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
2027 // This is only worthwhile for casts from/to priv/local to flat.
2028 const unsigned SrcAS = I.getSrcAddressSpace();
2029 const unsigned DstAS = I.getDestAddressSpace();
2030
2031 bool CanLower = false;
2032 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2033 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2034 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2035 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2036 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2037 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2038 if (!CanLower)
2039 return false;
2040
2041 SmallVector<const Value *, 4> WorkList;
2042 getUnderlyingObjects(V: I.getOperand(i_nocapture: 0), Objects&: WorkList);
2043 if (!all_of(Range&: WorkList, P: [&](const Value *V) {
2044 return isPtrKnownNeverNull(V, DL, TM, AS: SrcAS);
2045 }))
2046 return false;
2047
2048 IRBuilder<> B(&I);
2049 auto *Intrin = B.CreateIntrinsic(
2050 RetTy: I.getType(), ID: Intrinsic::amdgcn_addrspacecast_nonnull, Args: {I.getOperand(i_nocapture: 0)});
2051 I.replaceAllUsesWith(V: Intrin);
2052 DeadVals.push_back(Elt: &I);
2053 return true;
2054}
2055
2056bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2057 Intrinsic::ID IID = I.getIntrinsicID();
2058 switch (IID) {
2059 case Intrinsic::minnum:
2060 case Intrinsic::minimumnum:
2061 case Intrinsic::minimum:
2062 return visitFMinLike(I);
2063 case Intrinsic::sqrt:
2064 return visitSqrt(I);
2065 case Intrinsic::log:
2066 case Intrinsic::log10:
2067 return visitLog(Log&: cast<FPMathOperator>(Val&: I), IID);
2068 case Intrinsic::log2:
2069 // No reason to handle log2.
2070 return false;
2071 case Intrinsic::amdgcn_mbcnt_lo:
2072 return visitMbcntLo(I);
2073 case Intrinsic::amdgcn_mbcnt_hi:
2074 return visitMbcntHi(I);
2075 case Intrinsic::vector_reduce_add:
2076 return visitVectorReduceAdd(I);
2077 case Intrinsic::uadd_sat:
2078 case Intrinsic::sadd_sat:
2079 return visitSaturatingAdd(I);
2080 default:
2081 return false;
2082 }
2083}
2084
2085/// Match the core sequence in the fract pattern (x - floor(x), which doesn't
2086/// need to consider edge case handling.
2087Value *AMDGPUCodeGenPrepareImpl::matchFractPatImpl(Value &FractSrc,
2088 const APFloat &C) const {
2089 if (ST.hasFractBug())
2090 return nullptr;
2091
2092 Type *Ty = FractSrc.getType();
2093 if (!isLegalFloatingTy(Ty: Ty->getScalarType()))
2094 return nullptr;
2095
2096 APFloat OneNextDown = APFloat::getOne(Sem: C.getSemantics());
2097 OneNextDown.next(nextDown: true);
2098
2099 // Match nextafter(1.0, -1)
2100 if (OneNextDown != C)
2101 return nullptr;
2102
2103 Value *FloorSrc;
2104 if (match(V: &FractSrc, P: m_FSub(L: m_Value(V&: FloorSrc), R: m_Intrinsic<Intrinsic::floor>(
2105 Op0: m_Deferred(V: FloorSrc)))))
2106 return FloorSrc;
2107 return nullptr;
2108}
2109
2110/// Match non-nan fract pattern.
2111// MIN_CONSTANT = nextafter(1.0, -1.0)
2112/// minnum(fsub(x, floor(x)), MIN_CONSTANT)
2113/// minimumnum(fsub(x, floor(x)), MIN_CONSTANT)
2114/// minimum(fsub(x, floor(x)), MIN_CONSTANT)
2115
2116// x_sub_floor >= MIN_CONSTANT ? MIN_CONSTANT : x_sub_floor;
2117///
2118/// If fract is a useful instruction for the subtarget. Does not account for the
2119/// nan handling; the instruction has a nan check on the input value.
2120Value *AMDGPUCodeGenPrepareImpl::matchFractPatNanAvoidant(Value &V) {
2121 Value *Arg0;
2122 const APFloat *C;
2123
2124 // The value is only used in contexts where we know the input isn't a nan, so
2125 // any of the fmin variants are fine.
2126 if (!match(V: &V,
2127 P: m_CombineOr(Ps: m_FMinNum_or_FMinimumNum(Op0: m_Value(V&: Arg0),
2128 Op1: m_APFloatAllowPoison(Res&: C)),
2129 Ps: m_FMinimum(Op0: m_Value(V&: Arg0), Op1: m_APFloatAllowPoison(Res&: C)))))
2130 return nullptr;
2131
2132 return matchFractPatImpl(FractSrc&: *Arg0, C: *C);
2133}
2134
2135Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2136 Value *FractArg) {
2137 SmallVector<Value *, 4> FractVals;
2138 extractValues(Builder, Values&: FractVals, V: FractArg);
2139
2140 SmallVector<Value *, 4> ResultVals(FractVals.size());
2141
2142 Type *Ty = FractArg->getType()->getScalarType();
2143 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2144 ResultVals[I] =
2145 Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_fract, OverloadTypes: {Ty}, Args: {FractVals[I]});
2146 }
2147
2148 return insertValues(Builder, Ty: FractArg->getType(), Values&: ResultVals);
2149}
2150
2151bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2152 const APFloat *C;
2153 Value *FractArg;
2154
2155 // minimum(x - floor(x), MIN_CONSTANT)
2156 Value *X;
2157 if (!ST.hasFractBug() &&
2158 match(V: &I, P: m_FMinimum(Op0: m_Value(V&: X), Op1: m_APFloatAllowPoison(Res&: C)))) {
2159 FractArg = matchFractPatImpl(FractSrc&: *X, C: *C);
2160 if (!FractArg)
2161 return false;
2162 } else {
2163 // minnum(x - floor(x), MIN_CONSTANT)
2164 FractArg = matchFractPatNanAvoidant(V&: I);
2165 if (!FractArg)
2166 return false;
2167
2168 // Match pattern for fract intrinsic in contexts where the nan check has
2169 // been optimized out (and hope the knowledge the source can't be nan wasn't
2170 // lost).
2171 if (!I.hasNoNaNs() && !isKnownNeverNaN(V: FractArg, SQ: SQ.getWithInstruction(I: &I)))
2172 return false;
2173 }
2174
2175 IRBuilder<> Builder(&I);
2176 FastMathFlags FMF = I.getFastMathFlags();
2177 FMF.setNoNaNs();
2178 Builder.setFastMathFlags(FMF);
2179
2180 Value *Fract = applyFractPat(Builder, FractArg);
2181 Fract->takeName(V: &I);
2182 I.replaceAllUsesWith(V: Fract);
2183 DeadVals.push_back(Elt: &I);
2184 return true;
2185}
2186
2187// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2188bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2189 Type *Ty = Sqrt.getType()->getScalarType();
2190 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2191 return false;
2192
2193 const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &Sqrt);
2194 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2195
2196 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2197 // of fast llvm.sqrt will give the raw instruction anyway.
2198 if (SqrtFMF.approxFunc())
2199 return false;
2200
2201 const float ReqdAccuracy = FPOp->getFPAccuracy();
2202
2203 // Defer correctly rounded expansion to codegen.
2204 if (ReqdAccuracy < 1.0f)
2205 return false;
2206
2207 Value *SrcVal = Sqrt.getOperand(i_nocapture: 0);
2208 bool CanTreatAsDAZ = canIgnoreDenormalInput(V: SrcVal, CtxI: &Sqrt);
2209
2210 // The raw instruction is 1 ulp, but the correction for denormal handling
2211 // brings it to 2.
2212 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2213 return false;
2214
2215 IRBuilder<> Builder(&Sqrt);
2216 SmallVector<Value *, 4> SrcVals;
2217 extractValues(Builder, Values&: SrcVals, V: SrcVal);
2218
2219 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2220 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2221 if (CanTreatAsDAZ)
2222 ResultVals[I] = Builder.CreateCall(Callee: getSqrtF32(), Args: SrcVals[I]);
2223 else
2224 ResultVals[I] = emitSqrtIEEE2ULP(Builder, Src: SrcVals[I], FMF: SqrtFMF);
2225 }
2226
2227 Value *NewSqrt = insertValues(Builder, Ty: Sqrt.getType(), Values&: ResultVals);
2228 NewSqrt->takeName(V: &Sqrt);
2229 Sqrt.replaceAllUsesWith(V: NewSqrt);
2230 DeadVals.push_back(Elt: &Sqrt);
2231 return true;
2232}
2233
2234/// Replace log and log10 intrinsic calls based on fpmath metadata.
2235bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
2236 Intrinsic::ID IID) {
2237 Type *Ty = Log.getType();
2238 if (!Ty->getScalarType()->isHalfTy() || !ST.has16BitInsts())
2239 return false;
2240
2241 FastMathFlags FMF = Log.getFastMathFlags();
2242
2243 // Defer fast math cases to codegen.
2244 if (FMF.approxFunc())
2245 return false;
2246
2247 // Limit experimentally determined from OpenCL conformance test (1.79)
2248 if (Log.getFPAccuracy() < 1.80f)
2249 return false;
2250
2251 IRBuilder<> Builder(&cast<CallInst>(Val&: Log));
2252
2253 // Use the generic intrinsic for convenience in the vector case. Codegen will
2254 // recognize the denormal handling is not necessary from the fpext.
2255 // TODO: Move to generic code
2256 Value *Log2 =
2257 Builder.CreateUnaryIntrinsic(ID: Intrinsic::log2, Op: Log.getOperand(i: 0), FMFSource: FMF);
2258
2259 double Log2BaseInverted =
2260 IID == Intrinsic::log10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2261 Value *Mul =
2262 Builder.CreateFMulFMF(L: Log2, R: ConstantFP::get(Ty, V: Log2BaseInverted), FMFSource: FMF);
2263
2264 Mul->takeName(V: &Log);
2265
2266 Log.replaceAllUsesWith(V: Mul);
2267 DeadVals.push_back(Elt: &Log);
2268 return true;
2269}
2270
2271bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2272 if (skipFunction(F))
2273 return false;
2274
2275 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2276 if (!TPC)
2277 return false;
2278
2279 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2280 const TargetLibraryInfo *TLI =
2281 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2282 AssumptionCache *AC =
2283 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2284 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2285 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2286 const UniformityInfo &UA =
2287 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2288 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2289}
2290
2291PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2292 FunctionAnalysisManager &FAM) {
2293 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2294 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(IR&: F);
2295 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(IR&: F);
2296 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F);
2297 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(IR&: F);
2298 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2299 if (!Impl.run())
2300 return PreservedAnalyses::all();
2301 PreservedAnalyses PA = PreservedAnalyses::none();
2302 if (!Impl.FlowChanged)
2303 PA.preserveSet<CFGAnalyses>();
2304 return PA;
2305}
2306
2307INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2308 "AMDGPU IR optimizations", false, false)
2309INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2310INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2311INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2312INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2313 false, false)
2314
2315/// Create a workitem.id.x intrinsic call with range metadata.
2316CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
2317 CallInst *Tid =
2318 B.CreateIntrinsicWithoutFolding(ID: Intrinsic::amdgcn_workitem_id_x, Args: {});
2319 ST.makeLIDRangeMetadata(I: Tid);
2320 return Tid;
2321}
2322
2323/// Replace the instruction with a direct workitem.id.x call.
2324void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
2325 IRBuilder<> B(&I);
2326 CallInst *Tid = createWorkitemIdX(B);
2327 BasicBlock::iterator BI(&I);
2328 ReplaceInstWithValue(BI, V: Tid);
2329}
2330
2331/// Replace the instruction with (workitem.id.x & mask).
2332void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2333 Instruction &I, unsigned WaveSize) const {
2334 IRBuilder<> B(&I);
2335 CallInst *Tid = createWorkitemIdX(B);
2336 Constant *Mask = ConstantInt::get(Ty: Tid->getType(), V: WaveSize - 1);
2337 Value *AndInst = B.CreateAnd(LHS: Tid, RHS: Mask);
2338 BasicBlock::iterator BI(&I);
2339 ReplaceInstWithValue(BI, V: AndInst);
2340}
2341
2342/// Try to optimize mbcnt instruction by replacing with workitem.id.x when
2343/// work group size allows direct computation of lane ID.
2344/// Returns true if optimization was applied, false otherwise.
2345bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
2346 unsigned Wave) const {
2347 std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, Dim: 0);
2348 if (!MaybeX)
2349 return false;
2350
2351 // When work group size == wave_size, each work group contains exactly one
2352 // wave, so the instruction can be replaced with workitem.id.x directly.
2353 if (*MaybeX == Wave) {
2354 replaceWithWorkitemIdX(I);
2355 return true;
2356 }
2357
2358 // When work group evenly splits into waves, compute lane ID within wave
2359 // using bit masking: lane_id = workitem.id.x & (wave_size - 1).
2360 if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/REquiresUniformYZ: true)) {
2361 replaceWithMaskedWorkitemIdX(I, WaveSize: Wave);
2362 return true;
2363 }
2364
2365 return false;
2366}
2367
2368/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.
2369bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
2370 // This optimization only applies to wave32 targets where mbcnt.lo operates on
2371 // the full execution mask.
2372 if (!ST.isWave32())
2373 return false;
2374
2375 // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with
2376 // lower IDs.
2377 if (!match(V: &I,
2378 P: m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(Op0: m_AllOnes(), Op1: m_Zero())))
2379 return false;
2380
2381 return tryReplaceWithWorkitemId(I, Wave: ST.getWavefrontSize());
2382}
2383
2384/// Optimize mbcnt.hi calls for lane ID computation.
2385bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
2386 // Abort if wave size is not known at compile time.
2387 if (!ST.isWaveSizeKnown())
2388 return false;
2389
2390 unsigned Wave = ST.getWavefrontSize();
2391
2392 // On wave32, the upper 32 bits of execution mask are always 0, so
2393 // mbcnt.hi(mask, val) always returns val unchanged.
2394 if (ST.isWave32()) {
2395 if (auto MaybeX = ST.getReqdWorkGroupSize(F, Dim: 0)) {
2396 // Replace mbcnt.hi(mask, val) with val only when work group size matches
2397 // wave size (single wave per work group).
2398 if (*MaybeX == Wave) {
2399 BasicBlock::iterator BI(&I);
2400 ReplaceInstWithValue(BI, V: I.getArgOperand(i: 1));
2401 return true;
2402 }
2403 }
2404 }
2405
2406 // Optimize the complete lane ID computation pattern:
2407 // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs
2408 // across the full execution mask.
2409 using namespace PatternMatch;
2410
2411 // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
2412 if (!match(V: &I, P: m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
2413 Op0: m_AllOnes(), Op1: m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
2414 Op0: m_AllOnes(), Op1: m_Zero()))))
2415 return false;
2416
2417 return tryReplaceWithWorkitemId(I, Wave);
2418}
2419
2420/// Check if type is <4 x i8>.
2421static bool isV4I8(Type *Ty) {
2422 FixedVectorType *VTy = dyn_cast<FixedVectorType>(Val: Ty);
2423 return VTy && VTy->getNumElements() == 4 &&
2424 VTy->getElementType()->isIntegerTy(BitWidth: 8);
2425}
2426
2427/// Helper to match the dot4 pattern: mul(zext/sext <4 x i8>, zext/sext <4 x
2428/// i8>) Returns true if pattern matches and signedness matches IsSigned.
2429/// Sets A, B to the <4 x i8> sources.
2430static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B,
2431 bool IsSigned) {
2432 Value *Src0, *Src1;
2433 if (!match(V: MulOp, P: m_Mul(L: m_Value(V&: Src0), R: m_Value(V&: Src1))))
2434 return false;
2435
2436 // Check that result type is <4 x i32>
2437 FixedVectorType *MulTy = dyn_cast<FixedVectorType>(Val: MulOp->getType());
2438 if (!MulTy || MulTy->getNumElements() != 4 ||
2439 !MulTy->getElementType()->isIntegerTy(BitWidth: 32))
2440 return false;
2441
2442 // Match zext or sext based on IsSigned
2443 Value *ExtSrc0, *ExtSrc1;
2444 if (IsSigned) {
2445 if (!match(V: Src0, P: m_SExt(Op: m_Value(V&: ExtSrc0))) || !isV4I8(Ty: ExtSrc0->getType()))
2446 return false;
2447 if (!match(V: Src1, P: m_SExt(Op: m_Value(V&: ExtSrc1))) || !isV4I8(Ty: ExtSrc1->getType()))
2448 return false;
2449 } else {
2450 if (!match(V: Src0, P: m_ZExt(Op: m_Value(V&: ExtSrc0))) || !isV4I8(Ty: ExtSrc0->getType()))
2451 return false;
2452 if (!match(V: Src1, P: m_ZExt(Op: m_Value(V&: ExtSrc1))) || !isV4I8(Ty: ExtSrc1->getType()))
2453 return false;
2454 }
2455
2456 A = ExtSrc0;
2457 B = ExtSrc1;
2458 return true;
2459}
2460
2461/// Try to convert vector.reduce.add(mul(zext/sext <4 x i8>, zext/sext <4 x
2462/// i8>)) to a dot4 intrinsic call (non-saturating case only).
2463bool AMDGPUCodeGenPrepareImpl::visitVectorReduceAdd(IntrinsicInst &I) {
2464 // Check if we have dot4 instructions available
2465 if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
2466 return false;
2467
2468 Value *A = nullptr, *B = nullptr;
2469
2470 // Try unsigned first, then signed
2471 bool IsSigned = false;
2472 if (!matchDot4Pattern(MulOp: I.getArgOperand(i: 0), A, B, /*IsSigned=*/false)) {
2473 if (!matchDot4Pattern(MulOp: I.getArgOperand(i: 0), A, B, /*IsSigned=*/true))
2474 return false;
2475 IsSigned = true;
2476 }
2477
2478 LLVMContext &Ctx = I.getContext();
2479 Type *I32Ty = Type::getInt32Ty(C&: Ctx);
2480 IRBuilder<> Builder(&I);
2481
2482 // Bitcast <4 x i8> to i32
2483 Value *ASrc = Builder.CreateBitCast(V: A, DestTy: I32Ty);
2484 Value *BSrc = Builder.CreateBitCast(V: B, DestTy: I32Ty);
2485
2486 // Non-saturating case: accumulator is 0, clamp is false
2487 Value *Acc = ConstantInt::get(Ty: I32Ty, V: 0);
2488 Value *Clamp = ConstantInt::getFalse(Context&: Ctx);
2489
2490 Intrinsic::ID DotIID =
2491 IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
2492
2493 Value *Dot = Builder.CreateIntrinsic(ID: DotIID, OverloadTypes: {}, Args: {ASrc, BSrc, Acc, Clamp});
2494 Dot->takeName(V: &I);
2495
2496 I.replaceAllUsesWith(V: Dot);
2497 DeadVals.push_back(Elt: &I);
2498
2499 return true;
2500}
2501
2502/// Try to convert uadd.sat/sadd.sat(vector.reduce.add(mul(...)), c) to a
2503/// saturating dot4 intrinsic. This combine starts at the root (saturating add)
2504/// and looks at its operands.
2505bool AMDGPUCodeGenPrepareImpl::visitSaturatingAdd(IntrinsicInst &I) {
2506 // Check if we have dot4 instructions available
2507 if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
2508 return false;
2509
2510 Intrinsic::ID IID = I.getIntrinsicID();
2511 bool IsSigned = (IID == Intrinsic::sadd_sat);
2512
2513 // Look for vector.reduce.add as one of the operands (commutative match)
2514 Value *Op0 = I.getArgOperand(i: 0);
2515 Value *Op1 = I.getArgOperand(i: 1);
2516 Value *MulOp = nullptr;
2517 Value *Accum = nullptr;
2518 IntrinsicInst *ReduceInst = nullptr;
2519
2520 if (match(V: Op0, P: m_Intrinsic<Intrinsic::vector_reduce_add>(Op0: m_Value(V&: MulOp)))) {
2521 ReduceInst = cast<IntrinsicInst>(Val: Op0);
2522 Accum = Op1;
2523 } else if (match(V: Op1,
2524 P: m_Intrinsic<Intrinsic::vector_reduce_add>(Op0: m_Value(V&: MulOp)))) {
2525 ReduceInst = cast<IntrinsicInst>(Val: Op1);
2526 Accum = Op0;
2527 } else {
2528 return false;
2529 }
2530
2531 Value *A = nullptr, *B = nullptr;
2532
2533 if (!matchDot4Pattern(MulOp, A, B, IsSigned))
2534 return false;
2535
2536 LLVMContext &Ctx = I.getContext();
2537 Type *I32Ty = Type::getInt32Ty(C&: Ctx);
2538 IRBuilder<> Builder(&I);
2539
2540 // Bitcast <4 x i8> to i32
2541 Value *ASrc = Builder.CreateBitCast(V: A, DestTy: I32Ty);
2542 Value *BSrc = Builder.CreateBitCast(V: B, DestTy: I32Ty);
2543
2544 // Saturating case: use the accumulator and set clamp to true
2545 Value *Clamp = ConstantInt::getTrue(Context&: Ctx);
2546
2547 Intrinsic::ID DotIID =
2548 IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
2549
2550 Value *Dot = Builder.CreateIntrinsic(ID: DotIID, OverloadTypes: {}, Args: {ASrc, BSrc, Accum, Clamp});
2551 Dot->takeName(V: &I);
2552
2553 I.replaceAllUsesWith(V: Dot);
2554 DeadVals.push_back(Elt: &I);
2555 // The reduce.add will be dead after this and cleaned up later
2556 if (ReduceInst->use_empty())
2557 DeadVals.push_back(Elt: ReduceInst);
2558
2559 return true;
2560}
2561
2562char AMDGPUCodeGenPrepare::ID = 0;
2563
2564FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2565 return new AMDGPUCodeGenPrepare();
2566}
2567