1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
17#include "SIModeRegisterDefaults.h"
18#include "llvm/ADT/SetVector.h"
19#include "llvm/Analysis/AssumptionCache.h"
20#include "llvm/Analysis/ConstantFolding.h"
21#include "llvm/Analysis/TargetLibraryInfo.h"
22#include "llvm/Analysis/TargetTransformInfo.h"
23#include "llvm/Analysis/UniformityAnalysis.h"
24#include "llvm/Analysis/ValueTracking.h"
25#include "llvm/CodeGen/TargetPassConfig.h"
26#include "llvm/IR/Dominators.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/InstVisitor.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include "llvm/IR/PatternMatch.h"
31#include "llvm/IR/ValueHandle.h"
32#include "llvm/InitializePasses.h"
33#include "llvm/Pass.h"
34#include "llvm/Support/KnownBits.h"
35#include "llvm/Support/KnownFPClass.h"
36#include "llvm/Transforms/Utils/BasicBlockUtils.h"
37#include "llvm/Transforms/Utils/IntegerDivision.h"
38#include "llvm/Transforms/Utils/Local.h"
39
40#define DEBUG_TYPE "amdgpu-codegenprepare"
41
42using namespace llvm;
43using namespace llvm::PatternMatch;
44
45namespace {
46
47static cl::opt<bool> WidenLoads(
48 "amdgpu-codegenprepare-widen-constant-loads",
49 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
50 cl::ReallyHidden,
51 cl::init(Val: false));
52
53static cl::opt<bool>
54 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
55 cl::desc("Break large PHI nodes for DAGISel"),
56 cl::ReallyHidden, cl::init(Val: true));
57
58static cl::opt<bool>
59 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
60 cl::desc("For testing purposes, always break large "
61 "PHIs even if it isn't profitable."),
62 cl::ReallyHidden, cl::init(Val: false));
63
64static cl::opt<unsigned> BreakLargePHIsThreshold(
65 "amdgpu-codegenprepare-break-large-phis-threshold",
66 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
67 cl::ReallyHidden, cl::init(Val: 32));
68
69static cl::opt<bool> UseMul24Intrin(
70 "amdgpu-codegenprepare-mul24",
71 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
72 cl::ReallyHidden,
73 cl::init(Val: true));
74
75// Legalize 64-bit division by using the generic IR expansion.
76static cl::opt<bool> ExpandDiv64InIR(
77 "amdgpu-codegenprepare-expand-div64",
78 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
79 cl::ReallyHidden,
80 cl::init(Val: false));
81
82// Leave all division operations as they are. This supersedes ExpandDiv64InIR
83// and is used for testing the legalizer.
84static cl::opt<bool> DisableIDivExpand(
85 "amdgpu-codegenprepare-disable-idiv-expansion",
86 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
87 cl::ReallyHidden,
88 cl::init(Val: false));
89
90// Disable processing of fdiv so we can better test the backend implementations.
91static cl::opt<bool> DisableFDivExpand(
92 "amdgpu-codegenprepare-disable-fdiv-expansion",
93 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
94 cl::ReallyHidden,
95 cl::init(Val: false));
96
/// Implements the IR rewrites for this pass. Instructions are dispatched via
/// InstVisitor; each visit* returns true if it changed the IR.
class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
  Function &F;                   ///< Function being processed.
  const GCNSubtarget &ST;        ///< Subtarget queried from \p TM for \p F.
  const AMDGPUTargetMachine &TM;
  const TargetLibraryInfo *TLI;
  const UniformityInfo &UA;      ///< Uniformity/divergence information.
  const DataLayout &DL;
  SimplifyQuery SQ;              ///< Bundles DL/TLI/DT/AC for value tracking.
  /// True if the function's FP32 denormal mode is preserve-sign (denormals
  /// flushed), per SIModeRegisterDefaults.
  const bool HasFP32DenormalFlush;
  bool FlowChanged = false;
  /// Lazily created declaration of llvm.amdgcn.sqrt.f32 (see getSqrtF32).
  mutable Function *SqrtF32 = nullptr;
  /// Lazily created declaration of llvm.ldexp.f32.i32 (see getLdexpF32).
  mutable Function *LdexpF32 = nullptr;
  /// Values replaced during the visit; drained in run() and deleted if still
  /// trivially dead.
  mutable SmallVector<WeakVH> DeadVals;

  /// Memoized answers for canBreakPHINode; cleared at the start of run().
  DenseMap<const PHINode *, bool> BreakPhiNodesCache;

  AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
                           const TargetLibraryInfo *TLI, AssumptionCache *AC,
                           const DominatorTree *DT, const UniformityInfo &UA)
      : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
        DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
        HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
                             DenormalMode::getPreserveSign()) {}

  /// Get (creating on first use) the llvm.amdgcn.sqrt.f32 declaration.
  Function *getSqrtF32() const {
    if (SqrtF32)
      return SqrtF32;

    LLVMContext &Ctx = F.getContext();
    SqrtF32 = Intrinsic::getOrInsertDeclaration(
        M: F.getParent(), id: Intrinsic::amdgcn_sqrt, Tys: {Type::getFloatTy(C&: Ctx)});
    return SqrtF32;
  }

  /// Get (creating on first use) the llvm.ldexp.f32.i32 declaration.
  Function *getLdexpF32() const {
    if (LdexpF32)
      return LdexpF32;

    LLVMContext &Ctx = F.getContext();
    LdexpF32 = Intrinsic::getOrInsertDeclaration(
        M: F.getParent(), id: Intrinsic::ldexp,
        Tys: {Type::getFloatTy(C&: Ctx), Type::getInt32Ty(C&: Ctx)});
    return LdexpF32;
  }

  bool canBreakPHINode(const PHINode &I);

  /// Return true if \p T is a legal scalar floating point type.
  bool isLegalFloatingTy(const Type *T) const;

  /// Wrapper to pass all the arguments to computeKnownFPClass, using the
  /// member SimplifyQuery anchored at \p CtxI.
  KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
                                   const Instruction *CtxI) const {
    return llvm::computeKnownFPClass(V, InterestedClasses: Interested,
                                     SQ: SQ.getWithInstruction(I: CtxI));
  }

  /// \returns true if the subnormal-ness of \p V does not matter: either FP32
  /// denormals are flushed for this function, or \p V is known never subnormal.
  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||
           computeKnownFPClass(V, Interested: fcSubnormal, CtxI).isKnownNeverSubnormal();
  }

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// an unsigned integer. Truncating to this size and then zero-extending to
  /// the original will not change the value.
  unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;

  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// a signed integer. Truncating to this size and then sign-extending to
  /// the original size will not change the value.
  unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;

  /// Replace mul instructions whose operands fit in 24 bits with
  /// llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Perform same function as equivalently named function in DAGCombiner. Since
  /// we expand some divisions here, we need to perform this before obscuring.
  bool foldBinOpIntoSelect(BinaryOperator &I) const;

  bool divHasSpecialOptimization(BinaryOperator &I,
                                 Value *Num, Value *Den) const;
  unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
                         unsigned MaxDivBits, bool Signed) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned NumBits,
                            bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  void expandDivRem64(BinaryOperator &I) const;

  /// \returns True if the scalar load \p I is a candidate for widening to a
  /// full 32-bit load: simple, sub-dword, aligned to at least 4, and uniform
  /// (so a scalar load can be used instead of a vector load).
  bool canWidenScalarExtLoad(LoadInst &I) const;

  // Pattern matching/rewriting helpers for fract — presumably matchFractPat
  // recognizes the fract idiom and applyFractPat emits the replacement;
  // definitions are outside this chunk, so confirm there.
  Value *matchFractPat(IntrinsicInst &I);
  Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);

  bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;

  Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                         const Instruction *CtxI) const;

  Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags FMF, const Instruction *CtxI) const;
  Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
                              float ReqdAccuracy) const;

  Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                          Value *RsqOp, const Instruction *FDiv,
                          float ReqdAccuracy) const;

  /// Emit llvm.frexp on \p Src and return the {mantissa, exponent} values.
  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
                                              Value *Src) const;

  Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                         bool IsNegative) const;
  Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                      FastMathFlags FMF) const;
  Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
                          FastMathFlags FMF) const;
  Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
                    FastMathFlags DivFMF, const Instruction *CtxI,
                    bool IsNegative) const;

  // Workitem-id rewrites; definitions are outside this chunk.
  CallInst *createWorkitemIdX(IRBuilder<> &B) const;
  void replaceWithWorkitemIdX(Instruction &I) const;
  void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
  bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;

  bool tryNarrowMathIfNoOverflow(Instruction *I);

public:
  bool visitFDiv(BinaryOperator &I);

  // InstVisitor hooks. The default hook reports "no change".
  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitSelectInst(SelectInst &I);
  bool visitPHINode(PHINode &I);
  bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitFMinLike(IntrinsicInst &I);
  bool visitSqrt(IntrinsicInst &I);
  bool visitLog(FPMathOperator &Log, Intrinsic::ID IID);
  bool visitMbcntLo(IntrinsicInst &I) const;
  bool visitMbcntHi(IntrinsicInst &I) const;
  /// Entry point: visit every instruction in \p F. \returns true if the IR
  /// was changed.
  bool run();
};
268
/// Legacy pass-manager wrapper that gathers the analyses and drives
/// AMDGPUCodeGenPrepareImpl.
class AMDGPUCodeGenPrepare : public FunctionPass {
public:
  static char ID; // Pass identification.
  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();

    // FIXME: Division expansion needs to preserve the dominator tree.
    if (!ExpandDiv64InIR)
      AU.setPreservesAll();
  }
  bool runOnFunction(Function &F) override;
  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
};
285
286} // end anonymous namespace
287
288bool AMDGPUCodeGenPrepareImpl::run() {
289 BreakPhiNodesCache.clear();
290 bool MadeChange = false;
291
292 // Need to use make_early_inc_range because integer division expansion is
293 // handled by Transform/Utils, and it can delete instructions such as the
294 // terminator of the BB.
295 for (BasicBlock &BB : reverse(C&: F)) {
296 for (Instruction &I : make_early_inc_range(Range: reverse(C&: BB))) {
297 if (!isInstructionTriviallyDead(I: &I, TLI))
298 MadeChange |= visit(I);
299 }
300 }
301
302 while (!DeadVals.empty()) {
303 if (auto *I = dyn_cast_or_null<Instruction>(Val: DeadVals.pop_back_val()))
304 RecursivelyDeleteTriviallyDeadInstructions(V: I, TLI);
305 }
306
307 return MadeChange;
308}
309
310bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
311 return Ty->isFloatTy() || Ty->isDoubleTy() ||
312 (Ty->isHalfTy() && ST.has16BitInsts());
313}
314
315bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
316 Type *Ty = I.getType();
317 int TySize = DL.getTypeSizeInBits(Ty);
318 Align Alignment = DL.getValueOrABITypeAlignment(Alignment: I.getAlign(), Ty);
319
320 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(I: &I);
321}
322
323unsigned
324AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
325 const Instruction *CtxI) const {
326 return computeKnownBits(V: Op, Q: SQ.getWithInstruction(I: CtxI)).countMaxActiveBits();
327}
328
329unsigned
330AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
331 const Instruction *CtxI) const {
332 return ComputeMaxSignificantBits(Op, DL: SQ.DL, AC: SQ.AC, CxtI: CtxI, DT: SQ.DT);
333}
334
335static void extractValues(IRBuilder<> &Builder,
336 SmallVectorImpl<Value *> &Values, Value *V) {
337 auto *VT = dyn_cast<FixedVectorType>(Val: V->getType());
338 if (!VT) {
339 Values.push_back(Elt: V);
340 return;
341 }
342
343 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
344 Values.push_back(Elt: Builder.CreateExtractElement(Vec: V, Idx: I));
345}
346
347static Value *insertValues(IRBuilder<> &Builder,
348 Type *Ty,
349 SmallVectorImpl<Value *> &Values) {
350 if (!Ty->isVectorTy()) {
351 assert(Values.size() == 1);
352 return Values[0];
353 }
354
355 Value *NewVal = PoisonValue::get(T: Ty);
356 for (int I = 0, E = Values.size(); I != E; ++I)
357 NewVal = Builder.CreateInsertElement(Vec: NewVal, NewElt: Values[I], Idx: I);
358
359 return NewVal;
360}
361
// Rewrite a divergent integer mul whose operands provably fit in 24 bits into
// llvm.amdgcn.mul.u24 / llvm.amdgcn.mul.s24, element-wise for vectors.
// Returns true if the mul was replaced.
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  // Leave sub-word multiplies alone when native 16-bit instructions exist.
  if (Size <= 16 && ST.has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (UA.isUniform(I: &I))
    return false;

  Value *LHS = I.getOperand(i_nocapture: 0);
  Value *RHS = I.getOperand(i_nocapture: 1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  // Try the unsigned form first; fall back to the signed form. Note the
  // in-condition assignments: LHSBits/RHSBits are only meaningful when the
  // corresponding branch is taken.
  if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(Op: LHS, CtxI: &I)) <= 24 &&
      (RHSBits = numBitsUnsigned(Op: RHS, CtxI: &I)) <= 24) {
    IsSigned = false;

  } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(Op: LHS, CtxI: &I)) <= 24 &&
             (RHSBits = numBitsSigned(Op: RHS, CtxI: &I)) <= 24) {
    IsSigned = true;

  } else
    return false;

  // Scalarize, emit one mul24 per element, then reassemble.
  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, Values&: LHSVals, V: LHS);
  extractValues(Builder, Values&: RHSVals, V: RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  // Use the 64-bit intrinsic form when the original type is wider than 32.
  IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
  Type *DstTy = LHSVals[0]->getType();

  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    // Normalize each operand to i32 with the matching extension kind.
    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(V: LHSVals[I], DestTy: I32Ty)
                          : Builder.CreateZExtOrTrunc(V: LHSVals[I], DestTy: I32Ty);
    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(V: RHSVals[I], DestTy: I32Ty)
                          : Builder.CreateZExtOrTrunc(V: RHSVals[I], DestTy: I32Ty);
    Intrinsic::ID ID =
        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
    Value *Result = Builder.CreateIntrinsic(ID, Types: {IntrinTy}, Args: {LHS, RHS});
    // Convert back to the original element type.
    Result = IsSigned ? Builder.CreateSExtOrTrunc(V: Result, DestTy: DstTy)
                      : Builder.CreateZExtOrTrunc(V: Result, DestTy: DstTy);
    ResultVals.push_back(Elt: Result);
  }

  Value *NewVal = insertValues(Builder, Ty, Values&: ResultVals);
  NewVal->takeName(V: &I);
  I.replaceAllUsesWith(V: NewVal);
  // Defer deletion of the original mul to the end of run().
  DeadVals.push_back(Elt: &I);

  return true;
}
424
425// Find a select instruction, which may have been casted. This is mostly to deal
426// with cases where i16 selects were promoted here to i32.
427static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
428 Cast = nullptr;
429 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: V))
430 return Sel;
431
432 if ((Cast = dyn_cast<CastInst>(Val: V))) {
433 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: Cast->getOperand(i_nocapture: 0)))
434 return Sel;
435 }
436
437 return nullptr;
438}
439
// Fold `binop (select c, K1, K2), K` into `select c, (binop K1, K),
// (binop K2, K)` when both folds produce plain constants, eliminating the
// binary operator entirely. Returns true on success.
bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;

  CastInst *CastOp;

  // TODO: Should probably try to handle some cases with multiple
  // users. Duplicating the select may be profitable for division.
  SelectInst *Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 0), Cast&: CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    // Operand 0 didn't give a single-use select; try operand 1.
    SelOpNo = 1;
    Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 1), Cast&: CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
    return false;

  // Both select arms and the other binop operand must be constants.
  Constant *CT = dyn_cast<Constant>(Val: Sel->getTrueValue());
  Constant *CF = dyn_cast<Constant>(Val: Sel->getFalseValue());
  Constant *CBO = dyn_cast<Constant>(Val: BO.getOperand(i_nocapture: SelOpNo ^ 1));
  if (!CBO || !CT || !CF)
    return false;

  if (CastOp) {
    // The cast must also die, and the select arms must fold through it.
    if (!CastOp->hasOneUse())
      return false;
    CT = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CT, DestTy: BO.getType(), DL);
    CF = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CF, DestTy: BO.getType(), DL);
  }

  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  // SelOpNo picks the operand order: the select value may be either LHS or RHS
  // of the binop, and the fold must respect that for non-commutative ops.
  Constant *FoldedT =
      SelOpNo ? ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CT, DL)
              : ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CT, RHS: CBO, DL);
  if (!FoldedT || isa<ConstantExpr>(Val: FoldedT))
    return false;

  Constant *FoldedF =
      SelOpNo ? ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CF, DL)
              : ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CF, RHS: CBO, DL);
  if (!FoldedF || isa<ConstantExpr>(Val: FoldedF))
    return false;

  IRBuilder<> Builder(&BO);
  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
  // Preserve the binop's FP flags on the new select.
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Val: &BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect = Builder.CreateSelect(C: Sel->getCondition(),
                                          True: FoldedT, False: FoldedF);
  NewSelect->takeName(V: &BO);
  BO.replaceAllUsesWith(V: NewSelect);
  // Defer deletion of the binop, the cast (if any), and the old select.
  DeadVals.push_back(Elt: &BO);
  if (CastOp)
    DeadVals.push_back(Elt: CastOp);
  DeadVals.push_back(Elt: Sel);
  return true;
}
500
501std::pair<Value *, Value *>
502AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
503 Value *Src) const {
504 Type *Ty = Src->getType();
505 Value *Frexp = Builder.CreateIntrinsic(ID: Intrinsic::frexp,
506 Types: {Ty, Builder.getInt32Ty()}, Args: Src);
507 Value *FrexpMant = Builder.CreateExtractValue(Agg: Frexp, Idxs: {0});
508
509 // Bypass the bug workaround for the exponent result since it doesn't matter.
510 // TODO: Does the bug workaround even really need to consider the exponent
511 // result? It's unspecified by the spec.
512
513 Value *FrexpExp =
514 ST.hasFractBug()
515 ? Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_frexp_exp,
516 Types: {Builder.getInt32Ty(), Ty}, Args: Src)
517 : Builder.CreateExtractValue(Agg: Frexp, Idxs: {1});
518 return {FrexpMant, FrexpExp};
519}
520
521/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
522Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
523 Value *Src,
524 bool IsNegative) const {
525 // Same as for 1.0, but expand the sign out of the constant.
526 // -1.0 / x -> rcp (fneg x)
527 if (IsNegative)
528 Src = Builder.CreateFNeg(V: Src);
529
530 // The rcp instruction doesn't support denormals, so scale the input
531 // out of the denormal range and convert at the end.
532 //
533 // Expand as 2^-n * (1.0 / (x * 2^n))
534
535 // TODO: Skip scaling if input is known never denormal and the input
536 // range won't underflow to denormal. The hard part is knowing the
537 // result. We need a range check, the result could be denormal for
538 // 0x1p+126 < den <= 0x1p+127.
539 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
540 Value *ScaleFactor = Builder.CreateNeg(V: FrexpExp);
541 Value *Rcp = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: FrexpMant);
542 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Rcp, ScaleFactor});
543}
544
545/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
546Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
547 Value *RHS,
548 FastMathFlags FMF) const {
549 // If we have have to work around the fract/frexp bug, we're worse off than
550 // using the fdiv.fast expansion. The full safe expansion is faster if we have
551 // fast FMA.
552 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
553 (!FMF.noNaNs() || !FMF.noInfs()))
554 return nullptr;
555
556 // We're scaling the LHS to avoid a denormal input, and scale the denominator
557 // to avoid large values underflowing the result.
558 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, Src: RHS);
559
560 Value *Rcp =
561 Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: FrexpMantRHS);
562
563 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, Src: LHS);
564 Value *Mul = Builder.CreateFMul(L: FrexpMantLHS, R: Rcp);
565
566 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
567 // result.
568 Value *ExpDiff = Builder.CreateSub(LHS: FrexpExpLHS, RHS: FrexpExpRHS);
569 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Mul, ExpDiff});
570}
571
572/// Emit a sqrt that handles denormals and is accurate to 2ulp.
573Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
574 Value *Src,
575 FastMathFlags FMF) const {
576 Type *Ty = Src->getType();
577 APFloat SmallestNormal =
578 APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
579 Value *NeedScale =
580 Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
581
582 ConstantInt *Zero = Builder.getInt32(C: 0);
583 Value *InputScaleFactor =
584 Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: 32), False: Zero);
585
586 Value *Scaled = Builder.CreateCall(Callee: getLdexpF32(), Args: {Src, InputScaleFactor});
587
588 Value *Sqrt = Builder.CreateCall(Callee: getSqrtF32(), Args: Scaled);
589
590 Value *OutputScaleFactor =
591 Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: -16), False: Zero);
592 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Sqrt, OutputScaleFactor});
593}
594
595/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
596static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
597 bool IsNegative) {
598 // bool need_scale = x < 0x1p-126f;
599 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
600 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
601 // rsq(x * input_scale) * output_scale;
602
603 Type *Ty = Src->getType();
604 APFloat SmallestNormal =
605 APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
606 Value *NeedScale =
607 Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
608 Constant *One = ConstantFP::get(Ty, V: 1.0);
609 Constant *InputScale = ConstantFP::get(Ty, V: 0x1.0p+24);
610 Constant *OutputScale =
611 ConstantFP::get(Ty, V: IsNegative ? -0x1.0p+12 : 0x1.0p+12);
612
613 Value *InputScaleFactor = Builder.CreateSelect(C: NeedScale, True: InputScale, False: One);
614
615 Value *ScaledInput = Builder.CreateFMul(L: Src, R: InputScaleFactor);
616 Value *Rsq = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: ScaledInput);
617 Value *OutputScaleFactor = Builder.CreateSelect(
618 C: NeedScale, True: OutputScale, False: IsNegative ? ConstantFP::get(Ty, V: -1.0) : One);
619
620 return Builder.CreateFMul(L: Rsq, R: OutputScaleFactor);
621}
622
/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
/// v_rsq_f64. This should give a 1ulp result.
Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
                                            FastMathFlags SqrtFMF,
                                            FastMathFlags DivFMF,
                                            const Instruction *CtxI,
                                            bool IsNegative) const {
  // rsq(x):
  //   double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
  //   double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
  //   return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
  //
  // -rsq(x):
  //   double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
  //   double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
  //   return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
  //
  // The rsq instruction handles the special cases correctly. We need to check
  // for the edge case conditions to ensure the special case propagates through
  // the later instructions.

  Value *Y0 = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: X);

  // Try to elide the edge case check.
  //
  // Fast math flags imply:
  //   sqrt ninf => !isinf(x)
  //   fdiv ninf => x != 0, !isinf(x)
  bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
  bool MaybeZero = !DivFMF.noInfs();

  // Refine the flag-based conclusions with known-fpclass analysis.
  DenormalMode DenormMode;
  FPClassTest Interested = fcNone;
  if (MaybePosInf)
    Interested = fcPosInf;
  if (MaybeZero)
    Interested |= fcZero;

  if (Interested != fcNone) {
    KnownFPClass KnownSrc = computeKnownFPClass(V: X, Interested, CtxI);
    if (KnownSrc.isKnownNeverPosInfinity())
      MaybePosInf = false;

    // "Logical" zero accounts for denormal inputs flushing to zero.
    DenormMode = F.getDenormalMode(FPType: X->getType()->getFltSemantics());
    if (KnownSrc.isKnownNeverLogicalZero(Mode: DenormMode))
      MaybeZero = false;
  }

  // Operand that carries the special-case result (y0) when x is +inf or zero,
  // and x itself otherwise; feeds the Newton-correction multiply below.
  Value *SpecialOrRsq = X;
  if (MaybeZero || MaybePosInf) {
    Value *Cond;
    if (MaybePosInf && MaybeZero) {
      if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
        FPClassTest TestMask = fcPosInf | fcZero;
        // If denormal inputs flush to zero, they behave as zero here too.
        if (DenormMode.inputsAreZero())
          TestMask |= fcSubnormal;

        Cond = Builder.createIsFPClass(FPNum: X, Test: TestMask);
      } else {
        // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
        // doesn't respect the floating-point environment.
        Value *IsZero =
            Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getZero(Ty: X->getType()));
        Value *IsInf =
            Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getInfinity(Ty: X->getType()));
        Cond = Builder.CreateOr(LHS: IsZero, RHS: IsInf);
      }
    } else if (MaybeZero) {
      Cond = Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getZero(Ty: X->getType()));
    } else {
      Cond = Builder.CreateFCmpOEQ(LHS: X, RHS: ConstantFP::getInfinity(Ty: X->getType()));
    }

    SpecialOrRsq = Builder.CreateSelect(C: Cond, True: Y0, False: X);
  }

  // One Newton-style correction step on top of the rsq estimate.
  Value *NegY0 = Builder.CreateFNeg(V: Y0);
  Value *NegXY0 = Builder.CreateFMul(L: SpecialOrRsq, R: NegY0);

  // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
  Value *E = Builder.CreateFMA(Factor1: NegXY0, Factor2: Y0, Summand: ConstantFP::get(Ty: X->getType(), V: 1.0));

  // IsNegative folds the -rsq(x) variant in by negating y0 in both uses.
  Value *Y0E = Builder.CreateFMul(L: E, R: IsNegative ? NegY0 : Y0);

  Value *EFMA = Builder.CreateFMA(Factor1: E, Factor2: ConstantFP::get(Ty: X->getType(), V: 0.375),
                                 Summand: ConstantFP::get(Ty: X->getType(), V: 0.5));

  return Builder.CreateFMA(Factor1: Y0E, Factor2: EFMA, Summand: IsNegative ? NegY0 : Y0);
}
712
713bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
714 FastMathFlags SqrtFMF) const {
715 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
716 // f64.
717 return DivFMF.allowContract() && SqrtFMF.allowContract();
718}
719
// Try to rewrite (+/-)1.0 / sqrt(Den) using rsq. Returns the replacement
// value, or null if the pattern/accuracy constraints don't allow it.
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
    const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  assert(DivFMF.allowContract() && SqrtFMF.allowContract());

  // rsq_f16 is accurate to 0.51 ulp.
  // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rsq_f64 is never accurate.
  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num);
  if (!CLHS)
    return nullptr;

  bool IsNegative = false;

  // TODO: Handle other numerator values with arcp.
  // Only a numerator of exactly +1.0 or -1.0 is handled; IsNegative records
  // which (assigned inside the condition).
  if (CLHS->isExactlyValue(V: 1.0) || (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
    // Add in the sqrt flags.
    IRBuilder<>::FastMathFlagGuard Guard(Builder);
    Builder.setFastMathFlags(DivFMF | SqrtFMF);

    if (Den->getType()->isFloatTy()) {
      // Raw rsq is usable when approximation is allowed or when denormal
      // inputs can't occur; otherwise use the scaled 1ulp expansion.
      if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
          canIgnoreDenormalInput(V: Den, CtxI)) {
        Value *Result =
            Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: Den);
        // -1.0 / sqrt(x) -> fneg(rsq(x))
        return IsNegative ? Builder.CreateFNeg(V: Result) : Result;
      }

      return emitRsqIEEE1ULP(Builder, Src: Den, IsNegative);
    }

    // f64 needs the correction sequence on top of v_rsq_f64.
    if (Den->getType()->isDoubleTy())
      return emitRsqF64(Builder, X: Den, SqrtFMF, DivFMF, CtxI, IsNegative);
  }

  return nullptr;
}
759
// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
// allowed with afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
//
// Returns the replacement value, or null if no rcp-based form applies.
Value *
AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
                                          Value *Den, FastMathFlags FMF,
                                          const Instruction *CtxI) const {
  // rcp_f16 is accurate to 0.51 ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  assert(Den->getType()->isFloatTy());

  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num)) {
    bool IsNegative = false;
    // Numerator of exactly +/-1.0 (IsNegative assigned inside the condition).
    if (CLHS->isExactlyValue(V: 1.0) ||
        (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
      Value *Src = Den;

      if (HasFP32DenormalFlush || FMF.approxFunc()) {
        // -1.0 / x -> 1.0 / fneg(x)
        if (IsNegative)
          Src = Builder.CreateFNeg(V: Src);

        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
        // to use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        // insert rsq intrinsic here.

        // 1.0 / x -> rcp(x)
        return Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: Src);
      }

      // TODO: If the input isn't denormal, and we know the input exponent isn't
      // big enough to introduce a denormal we can avoid the scaling.
      return emitRcpIEEE1ULP(Builder, Src, IsNegative);
    }
  }

  if (FMF.allowReciprocal()) {
    // x / y -> x * (1.0 / y)

    // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
    // will never underflow.
    if (HasFP32DenormalFlush || FMF.approxFunc()) {
      Value *Recip = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: Den);
      return Builder.CreateFMul(L: Num, R: Recip);
    }

    // Otherwise use the denormal-safe 1ulp rcp expansion for the reciprocal.
    Value *Recip = emitRcpIEEE1ULP(Builder, Src: Den, IsNegative: false);
    return Builder.CreateFMul(L: Num, R: Recip);
  }

  return nullptr;
}
822
823// optimize with fdiv.fast:
824//
825// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
826//
827// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
828//
829// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
830Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
831 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
832 // fdiv.fast can achieve 2.5 ULP accuracy.
833 if (ReqdAccuracy < 2.5f)
834 return nullptr;
835
836 // Only have fdiv.fast for f32.
837 assert(Den->getType()->isFloatTy());
838
839 bool NumIsOne = false;
840 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Val: Num)) {
841 if (CNum->isExactlyValue(V: +1.0) || CNum->isExactlyValue(V: -1.0))
842 NumIsOne = true;
843 }
844
845 // fdiv does not support denormals. But 1.0/x is always fine to use it.
846 //
847 // TODO: This works for any value with a specific known exponent range, don't
848 // just limit to constant 1.
849 if (!HasFP32DenormalFlush && !NumIsOne)
850 return nullptr;
851
852 return Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_fdiv_fast, Args: {Num, Den});
853}
854
855Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
856 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
857 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
858 float ReqdDivAccuracy) const {
859 if (RsqOp) {
860 Value *Rsq =
861 optimizeWithRsq(Builder, Num, Den: RsqOp, DivFMF, SqrtFMF, CtxI: FDivInst);
862 if (Rsq)
863 return Rsq;
864 }
865
866 if (!Num->getType()->isFloatTy())
867 return nullptr;
868
869 Value *Rcp = optimizeWithRcp(Builder, Num, Den, FMF: DivFMF, CtxI: FDivInst);
870 if (Rcp)
871 return Rcp;
872
873 // In the basic case fdiv_fast has the same instruction count as the frexp div
874 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
875 // potentially be fused into a user. Also, materialization of the constants
876 // can be reused for multiple instances.
877 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdAccuracy: ReqdDivAccuracy);
878 if (FDivFast)
879 return FDivFast;
880
881 return emitFrexpDiv(Builder, LHS: Num, RHS: Den, FMF: DivFMF);
882}
883
// Optimizations is performed based on fpmath, fast math flags as well as
// denormals to optimize fdiv with either rcp or fdiv.fast.
//
// With rcp:
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
// allowed with afn.
//
// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
//
// With fdiv.fast:
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
//
// NOTE: rcp is the preference in cases that both are legal.
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
  if (DisableFDivExpand)
    return false;

  // Only f32 and f64 scalars (or vector elements) are handled here.
  Type *Ty = FDiv.getType()->getScalarType();
  const bool IsFloat = Ty->isFloatTy();
  if (!IsFloat && !Ty->isDoubleTy())
    return false;

  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
  // expansion around them in codegen. f16 is good enough to always use.

  const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &FDiv);
  const FastMathFlags DivFMF = FPOp->getFastMathFlags();
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  FastMathFlags SqrtFMF;

  Value *Num = FDiv.getOperand(i_nocapture: 0);
  Value *Den = FDiv.getOperand(i_nocapture: 1);

  // Recognize a one-use sqrt in the denominator so the rsq path can be used.
  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Val: Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(Val: DenII);
    SqrtFMF = SqrtOp->getFastMathFlags();
    if (canOptimizeWithRsq(DivFMF, SqrtFMF))
      RsqOp = SqrtOp->getOperand(i: 0);
  }

  // rcp path not yet implemented for f64.
  if (!IsFloat && !RsqOp)
    return false;

  // Inaccurate rcp is allowed with afn.
  //
  // Defer to codegen to handle this.
  //
  // TODO: Decide on an interpretation for interactions between afn + arcp +
  // !fpmath, and make it consistent between here and codegen. For now, defer
  // expansion of afn to codegen. The current interpretation is so aggressive we
  // don't need any pre-consideration here when we have better information. A
  // more conservative interpretation could use handling here.
  const bool AllowInaccurateRcp = DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)
    return false;

  // Defer the correct implementations to codegen.
  if (IsFloat && ReqdAccuracy < 1.0f)
    return false;

  // Insert after the fdiv so the original operands stay visible for matching.
  IRBuilder<> Builder(FDiv.getParent(), std::next(x: FDiv.getIterator()));
  Builder.setFastMathFlags(DivFMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  // Scalarize: expand each vector element independently.
  SmallVector<Value *, 4> NumVals;
  SmallVector<Value *, 4> DenVals;
  SmallVector<Value *, 4> RsqDenVals;
  extractValues(Builder, Values&: NumVals, V: Num);
  extractValues(Builder, Values&: DenVals, V: Den);

  if (RsqOp)
    extractValues(Builder, Values&: RsqDenVals, V: RsqOp);

  SmallVector<Value *, 4> ResultVals(NumVals.size());
  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

    Value *NewElt =
        visitFDivElement(Builder, Num: NumElt, Den: DenElt, DivFMF, SqrtFMF, RsqOp: RsqDenElt,
                         FDivInst: cast<Instruction>(Val: FPOp), ReqdDivAccuracy: ReqdAccuracy);
    if (!NewElt) {
      // Keep the original, but scalarized.

      // This has the unfortunate side effect of sometimes scalarizing when
      // we're not going to do anything.
      NewElt = Builder.CreateFDiv(L: NumElt, R: DenElt);
      if (auto *NewEltInst = dyn_cast<Instruction>(Val: NewElt))
        NewEltInst->copyMetadata(SrcInst: FDiv);
    }

    ResultVals[I] = NewElt;
  }

  // Reassemble the (possibly vector) result and retire the original fdiv.
  Value *NewVal = insertValues(Builder, Ty: FDiv.getType(), Values&: ResultVals);

  if (NewVal) {
    FDiv.replaceAllUsesWith(V: NewVal);
    NewVal->takeName(V: &FDiv);
    DeadVals.push_back(Elt: &FDiv);
  }

  return true;
}
996
997static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
998 Value *LHS, Value *RHS) {
999 Type *I32Ty = Builder.getInt32Ty();
1000 Type *I64Ty = Builder.getInt64Ty();
1001
1002 Value *LHS_EXT64 = Builder.CreateZExt(V: LHS, DestTy: I64Ty);
1003 Value *RHS_EXT64 = Builder.CreateZExt(V: RHS, DestTy: I64Ty);
1004 Value *MUL64 = Builder.CreateMul(LHS: LHS_EXT64, RHS: RHS_EXT64);
1005 Value *Lo = Builder.CreateTrunc(V: MUL64, DestTy: I32Ty);
1006 Value *Hi = Builder.CreateLShr(LHS: MUL64, RHS: Builder.getInt64(C: 32));
1007 Hi = Builder.CreateTrunc(V: Hi, DestTy: I32Ty);
1008 return std::pair(Lo, Hi);
1009}
1010
1011static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1012 return getMul64(Builder, LHS, RHS).second;
1013}
1014
/// Figure out how many bits are really needed for this division.
/// \p MaxDivBits is an optimization hint to bypass the second
/// ComputeNumSignBits/computeKnownBits call if the first one is
/// insufficient.
/// \returns the number of low bits that participate in the division, or the
/// full scalar width when no narrowing can be proven.
unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
                                                 Value *Den,
                                                 unsigned MaxDivBits,
                                                 bool IsSigned) const {
  assert(Num->getType()->getScalarSizeInBits() ==
         Den->getType()->getScalarSizeInBits());
  unsigned SSBits = Num->getType()->getScalarSizeInBits();
  if (IsSigned) {
    // Query the divisor first: if it alone already exceeds MaxDivBits, the
    // numerator query can be skipped entirely.
    unsigned RHSSignBits = ComputeNumSignBits(Op: Den, DL: SQ.DL, AC: SQ.AC, CxtI: &I, DT: SQ.DT);
    // A sign bit needs to be reserved for shrinking.
    unsigned DivBits = SSBits - RHSSignBits + 1;
    if (DivBits > MaxDivBits)
      return SSBits;

    unsigned LHSSignBits = ComputeNumSignBits(Op: Num, DL: SQ.DL, AC: SQ.AC, CxtI: &I);

    // The narrower of the two operands bounds the whole division.
    unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
    DivBits = SSBits - SignBits + 1;
    return DivBits;
  }

  // All bits are used for unsigned division for Num or Den in range
  // (SignedMax, UnsignedMax].
  KnownBits Known = computeKnownBits(V: Den, Q: SQ.getWithInstruction(I: &I));
  if (Known.isNegative() || !Known.isNonNegative())
    return SSBits;
  unsigned RHSSignBits = Known.countMinLeadingZeros();
  unsigned DivBits = SSBits - RHSSignBits;
  if (DivBits > MaxDivBits)
    return SSBits;

  Known = computeKnownBits(V: Num, Q: SQ.getWithInstruction(I: &I));
  if (Known.isNegative() || !Known.isNonNegative())
    return SSBits;
  unsigned LHSSignBits = Known.countMinLeadingZeros();

  unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
  DivBits = SSBits - SignBits;
  return DivBits;
}
1059
1060// The fractional part of a float is enough to accurately represent up to
1061// a 24-bit signed integer.
1062Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1063 BinaryOperator &I, Value *Num,
1064 Value *Den, bool IsDiv,
1065 bool IsSigned) const {
1066 unsigned DivBits = getDivNumBits(I, Num, Den, MaxDivBits: 24, IsSigned);
1067 if (DivBits > 24)
1068 return nullptr;
1069 return expandDivRem24Impl(Builder, I, Num, Den, NumBits: DivBits, IsDiv, IsSigned);
1070}
1071
/// Emit the f32-based expansion of a narrow (proven <= 24-bit) integer
/// division or remainder. \p DivBits is the proven operand width; when
/// DivBits < 32 the result is sign- or zero-extended in-register back to the
/// full 32 bits.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
    unsigned DivBits, bool IsDiv, bool IsSigned) const {
  // Work in i32 regardless of the original (narrower) type.
  Type *I32Ty = Builder.getInt32Ty();
  Num = Builder.CreateTrunc(V: Num, DestTy: I32Ty);
  Den = Builder.CreateTrunc(V: Den, DestTy: I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(C: 1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(LHS: Num, RHS: Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(LHS: JQ, RHS: Builder.getInt32(C: 30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(LHS: JQ, RHS: One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib, (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(V: IA, DestTy: F32Ty)
                       : Builder.CreateUIToFP(V: IA, DestTy: F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(V: IB,DestTy: F32Ty)
                       : Builder.CreateUIToFP(V: IB,DestTy: F32Ty);

  // Approximate quotient: fa * rcp(fb).
  Value *RCP = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_rcp,
                                       Types: Builder.getFloatTy(), Args: {FB});
  Value *FQM = Builder.CreateFMul(L: FA, R: RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(ID: Intrinsic::trunc, V: FQM);
  FQ->copyFastMathFlags(FMF: Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(V: FQ);

  // float fr = mad(fqneg, fb, fa);
  auto FMAD = !ST.hasMadMacF32Insts()
                  ? Intrinsic::fma
                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
  Value *FR = Builder.CreateIntrinsic(ID: FMAD,
                                      Types: {FQNeg->getType()}, Args: {FQNeg, FB, FA}, FMFSource: FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(V: FQ, DestTy: I32Ty)
                       : Builder.CreateFPToUI(V: FQ, DestTy: I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: FR, FMFSource: FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: FB, FMFSource: FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(LHS: FR, RHS: FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(C: CV, True: JQ, False: Builder.getInt32(C: 0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(LHS: IQ, RHS: JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(LHS: Div, RHS: Den);
    Res = Builder.CreateSub(LHS: Num, RHS: Rem);
  }

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(LHS: Res, RHS: InRegBits);
      Res = Builder.CreateAShr(LHS: Res, RHS: InRegBits);
    } else {
      ConstantInt *TruncMask
        = Builder.getInt32(C: (UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(LHS: Res, RHS: TruncMask);
    }
  }

  return Res;
}
1168
1169// Try to recognize special cases the DAG will emit special, better expansions
1170// than the general expansion we do here.
1171
1172// TODO: It would be better to just directly handle those optimizations here.
1173bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1174 Value *Num,
1175 Value *Den) const {
1176 if (Constant *C = dyn_cast<Constant>(Val: Den)) {
1177 // Arbitrary constants get a better expansion as long as a wider mulhi is
1178 // legal.
1179 if (C->getType()->getScalarSizeInBits() <= 32)
1180 return true;
1181
1182 // TODO: Sdiv check for not exact for some reason.
1183
1184 // If there's no wider mulhi, there's only a better expansion for powers of
1185 // two.
1186 // TODO: Should really know for each vector element.
1187 if (isKnownToBeAPowerOfTwo(V: C, OrZero: true, Q: SQ.getWithInstruction(I: &I)))
1188 return true;
1189
1190 return false;
1191 }
1192
1193 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Val: Den)) {
1194 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1195 if (BinOpDen->getOpcode() == Instruction::Shl &&
1196 isa<Constant>(Val: BinOpDen->getOperand(i_nocapture: 0)) &&
1197 isKnownToBeAPowerOfTwo(V: BinOpDen->getOperand(i_nocapture: 0), OrZero: true,
1198 Q: SQ.getWithInstruction(I: &I))) {
1199 return true;
1200 }
1201 }
1202
1203 return false;
1204}
1205
1206static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1207 // Check whether the sign can be determined statically.
1208 KnownBits Known = computeKnownBits(V, DL);
1209 if (Known.isNegative())
1210 return Constant::getAllOnesValue(Ty: V->getType());
1211 if (Known.isNonNegative())
1212 return Constant::getNullValue(Ty: V->getType());
1213 return Builder.CreateAShr(LHS: V, RHS: Builder.getInt32(C: 31));
1214}
1215
/// Expand a 32-bit-or-narrower integer udiv/sdiv/urem/srem using v_rcp_f32
/// plus one Newton-Raphson step and two quotient/remainder refinements.
/// Returns nullptr when the DAG is expected to produce a better special-case
/// expansion (see divHasSpecialOptimization).
Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *X,
                                                Value *Y) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  // The internal f32 helper sequence below is emitted with fast-math flags.
  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (divHasSpecialOptimization(I, Num: X, Den: Y))
    return nullptr; // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  // Normalize narrower operands to i32 first.
  if (Ty->getScalarSizeInBits() != 32) {
    if (IsSigned) {
      X = Builder.CreateSExtOrTrunc(V: X, DestTy: I32Ty);
      Y = Builder.CreateSExtOrTrunc(V: Y, DestTy: I32Ty);
    } else {
      X = Builder.CreateZExtOrTrunc(V: X, DestTy: I32Ty);
      Y = Builder.CreateZExtOrTrunc(V: Y, DestTy: I32Ty);
    }
  }

  // Prefer the cheaper 24-bit expansion when the operands are narrow enough.
  if (Value *Res = expandDivRem24(Builder, I, Num: X, Den: Y, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty) :
                      Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
  }

  ConstantInt *Zero = Builder.getInt32(C: 0);
  ConstantInt *One = Builder.getInt32(C: 1);

  Value *Sign = nullptr;
  if (IsSigned) {
    // Reduce signed to unsigned: take absolute values and remember the sign
    // to re-apply at the end.
    Value *SignX = getSign32(V: X, Builder, DL);
    Value *SignY = getSign32(V: Y, Builder, DL);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHS: SignX, RHS: SignY) : SignX;

    // abs(v) computed as (v + sign) ^ sign.
    X = Builder.CreateAdd(LHS: X, RHS: SignX);
    Y = Builder.CreateAdd(LHS: Y, RHS: SignY);

    X = Builder.CreateXor(LHS: X, RHS: SignX);
    Y = Builder.CreateXor(LHS: Y, RHS: SignY);
  }

  // The algorithm here is based on ideas from "Software Integer Division", Tom
  // Rodeheffer, August 2008.
  //
  // unsigned udiv(unsigned x, unsigned y) {
  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
  //   // that this is a lower bound on inv(y), even if some of the calculations
  //   // round up.
  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
  //
  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
  //   // Empirically this is guaranteed to give a "two-y" lower bound on
  //   // inv(y).
  //   z += umulh(z, -y * z);
  //
  //   // Quotient/remainder estimate.
  //   unsigned q = umulh(x, z);
  //   unsigned r = x - q * y;
  //
  //   // Two rounds of quotient/remainder refinement.
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //
  //   return q;
  // }

  // Initial estimate of inv(y).
  Value *FloatY = Builder.CreateUIToFP(V: Y, DestTy: F32Ty);
  Value *RcpY = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_rcp, Types: F32Ty, Args: {FloatY});
  // 0x4F7FFFFE is (4294967296.0 - 512.0) as an f32 bit pattern.
  Constant *Scale = ConstantFP::get(Ty: F32Ty, V: llvm::bit_cast<float>(from: 0x4F7FFFFE));
  Value *ScaledY = Builder.CreateFMul(L: RcpY, R: Scale);
  Value *Z = Builder.CreateFPToUI(V: ScaledY, DestTy: I32Ty);

  // One round of UNR.
  Value *NegY = Builder.CreateSub(LHS: Zero, RHS: Y);
  Value *NegYZ = Builder.CreateMul(LHS: NegY, RHS: Z);
  Z = Builder.CreateAdd(LHS: Z, RHS: getMulHu(Builder, LHS: Z, RHS: NegYZ));

  // Quotient/remainder estimate.
  Value *Q = getMulHu(Builder, LHS: X, RHS: Z);
  Value *R = Builder.CreateSub(LHS: X, RHS: Builder.CreateMul(LHS: Q, RHS: Y));

  // First quotient/remainder refinement.
  Value *Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
  if (IsDiv)
    Q = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
  R = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);

  // Second quotient/remainder refinement.
  Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
  Value *Res;
  if (IsDiv)
    Res = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
  else
    Res = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);

  if (IsSigned) {
    // Re-apply the saved sign: (res ^ sign) - sign.
    Res = Builder.CreateXor(LHS: Res, RHS: Sign);
    Res = Builder.CreateSub(LHS: Res, RHS: Sign);
    Res = Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty);
  } else {
    Res = Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
  }
  return Res;
}
1339
1340Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1341 BinaryOperator &I, Value *Num,
1342 Value *Den) const {
1343 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1344 return nullptr; // Keep it for later optimization.
1345
1346 Instruction::BinaryOps Opc = I.getOpcode();
1347
1348 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1349 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1350
1351 unsigned NumDivBits = getDivNumBits(I, Num, Den, MaxDivBits: 32, IsSigned);
1352 if (NumDivBits > 32)
1353 return nullptr;
1354
1355 Value *Narrowed = nullptr;
1356 if (NumDivBits <= 24) {
1357 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, DivBits: NumDivBits,
1358 IsDiv, IsSigned);
1359 } else if (NumDivBits <= 32) {
1360 Narrowed = expandDivRem32(Builder, I, X: Num, Y: Den);
1361 }
1362
1363 if (Narrowed) {
1364 return IsSigned ? Builder.CreateSExt(V: Narrowed, DestTy: Num->getType()) :
1365 Builder.CreateZExt(V: Narrowed, DestTy: Num->getType());
1366 }
1367
1368 return nullptr;
1369}
1370
1371void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1372 Instruction::BinaryOps Opc = I.getOpcode();
1373 // Do the general expansion.
1374 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1375 expandDivisionUpTo64Bits(Div: &I);
1376 return;
1377 }
1378
1379 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1380 expandRemainderUpTo64Bits(Rem: &I);
1381 return;
1382 }
1383
1384 llvm_unreachable("not a division");
1385}
1386
1387/*
1388This will cause non-byte load in consistency, for example:
1389```
1390 %load = load i1, ptr addrspace(4) %arg, align 4
1391 %zext = zext i1 %load to
1392 i64 %add = add i64 %zext
1393```
1394Instead of creating `s_and_b32 s0, s0, 1`,
1395it will create `s_and_b32 s0, s0, 0xff`.
1396We accept this change since the non-byte load assumes the upper bits
1397within the byte are all 0.
1398*/
1399bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1400 unsigned Opc = I->getOpcode();
1401 Type *OldType = I->getType();
1402
1403 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1404 return false;
1405
1406 unsigned OrigBit = OldType->getScalarSizeInBits();
1407
1408 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1409 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1410 "Instruction::Mul.");
1411
1412 unsigned MaxBitsNeeded = computeKnownBits(V: I, DL).countMaxActiveBits();
1413
1414 MaxBitsNeeded = std::max<unsigned>(a: bit_ceil(Value: MaxBitsNeeded), b: 8);
1415 Type *NewType = DL.getSmallestLegalIntType(C&: I->getContext(), Width: MaxBitsNeeded);
1416 if (!NewType)
1417 return false;
1418 unsigned NewBit = NewType->getIntegerBitWidth();
1419 if (NewBit >= OrigBit)
1420 return false;
1421 NewType = I->getType()->getWithNewBitWidth(NewBitWidth: NewBit);
1422
1423 // Old cost
1424 const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
1425 InstructionCost OldCost =
1426 TTI.getArithmeticInstrCost(Opcode: Opc, Ty: OldType, CostKind: TTI::TCK_RecipThroughput);
1427 // New cost of new op
1428 InstructionCost NewCost =
1429 TTI.getArithmeticInstrCost(Opcode: Opc, Ty: NewType, CostKind: TTI::TCK_RecipThroughput);
1430 // New cost of narrowing 2 operands (use trunc)
1431 int NumOfNonConstOps = 2;
1432 if (isa<Constant>(Val: I->getOperand(i: 0)) || isa<Constant>(Val: I->getOperand(i: 1))) {
1433 // Cannot be both constant, should be propagated
1434 NumOfNonConstOps = 1;
1435 }
1436 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Opcode: Instruction::Trunc,
1437 Dst: NewType, Src: OldType,
1438 CCH: TTI.getCastContextHint(I),
1439 CostKind: TTI::TCK_RecipThroughput);
1440 // New cost of zext narrowed result to original type
1441 NewCost +=
1442 TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: OldType, Src: NewType,
1443 CCH: TTI.getCastContextHint(I), CostKind: TTI::TCK_RecipThroughput);
1444 if (NewCost >= OldCost)
1445 return false;
1446
1447 IRBuilder<> Builder(I);
1448 Value *Trunc0 = Builder.CreateTrunc(V: I->getOperand(i: 0), DestTy: NewType);
1449 Value *Trunc1 = Builder.CreateTrunc(V: I->getOperand(i: 1), DestTy: NewType);
1450 Value *Arith =
1451 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opc, LHS: Trunc0, RHS: Trunc1);
1452
1453 Value *Zext = Builder.CreateZExt(V: Arith, DestTy: OldType);
1454 I->replaceAllUsesWith(V: Zext);
1455 DeadVals.push_back(Elt: I);
1456 return true;
1457}
1458
// Visit integer binary operators: fold binops into selects, form 24-bit
// multiplies, narrow provably-overflow-free add/mul, and expand integer
// division/remainder of up to 64 bits.
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(BO&: I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;
  if (tryNarrowMathIfNoOverflow(I: &I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  unsigned ScalarSize = Ty->getScalarSizeInBits();

  // 64-bit divisions needing the general (control-flow) expansion are
  // collected here and expanded after the rewrite below.
  SmallVector<BinaryOperator *, 8> Div64ToExpand;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      ScalarSize <= 64 &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(i_nocapture: 0);
    Value *Den = I.getOperand(i_nocapture: 1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
      // Scalarize vector divisions element by element.
      NewDiv = PoisonValue::get(T: VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Vec: Num, Idx: N);
        Value *DenEltN = Builder.CreateExtractElement(Vec: Den, Idx: N);

        Value *NewElt;
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, X: NumEltN, Y: DenEltN);
          if (!NewElt)
            NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
        } else {
          // See if this 64-bit division can be shrunk to 32/24-bits before
          // producing the general expansion.
          NewElt = shrinkDivRem64(Builder, I, Num: NumEltN, Den: DenEltN);
          if (!NewElt) {
            // The general 64-bit expansion introduces control flow and doesn't
            // return the new value. Just insert a scalar copy and defer
            // expanding it.
            NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
            // CreateBinOp does constant folding. If the operands are constant,
            // it will return a Constant instead of a BinaryOperator.
            if (auto *NewEltBO = dyn_cast<BinaryOperator>(Val: NewElt))
              Div64ToExpand.push_back(Elt: NewEltBO);
          }
        }

        if (auto *NewEltI = dyn_cast<Instruction>(Val: NewElt))
          NewEltI->copyIRFlags(V: &I);

        NewDiv = Builder.CreateInsertElement(Vec: NewDiv, NewElt, Idx: N);
      }
    } else {
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, X: Num, Y: Den);
      else {
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
        if (!NewDiv)
          Div64ToExpand.push_back(Elt: &I);
      }
    }

    if (NewDiv) {
      I.replaceAllUsesWith(V: NewDiv);
      DeadVals.push_back(Elt: &I);
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    // TODO: We get much worse code in specially handled constant cases.
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(I&: *Div);
      FlowChanged = true;
      Changed = true;
    }
  }

  return Changed;
}
1546
// Widen sub-dword constant-address-space scalar loads to i32, preserving
// range metadata conservatively. Gated behind the WidenLoads cl::opt.
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    // Replace the narrow load with a full i32 load from the same pointer.
    Type *I32Ty = Builder.getInt32Ty();
    LoadInst *WidenLoad = Builder.CreateLoad(Ty: I32Ty, Ptr: I.getPointerOperand());
    WidenLoad->copyMetadata(SrcInst: I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(KindID: LLVMContext::MD_range)) {
      ConstantInt *Lower =
          mdconst::extract<ConstantInt>(MD: Range->getOperand(I: 0));

      if (Lower->isNullValue()) {
        // A lower bound of zero: drop the range rather than widening it.
        WidenLoad->setMetadata(KindID: LLVMContext::MD_range, Node: nullptr);
      } else {
        // Keep the (zero-extended) lower bound, wrap the upper bound to 0.
        Metadata *LowAndHigh[] = {
            ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: Lower->getValue().zext(width: 32))),
            // Don't make assumptions about the high bits.
            ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: 0))
        };

        WidenLoad->setMetadata(KindID: LLVMContext::MD_range,
                               Node: MDNode::get(Context&: F.getContext(), MDs: LowAndHigh));
      }
    }

    // Truncate the widened value back to the original bit width and bitcast
    // to the original (possibly non-integer) type.
    int TySize = DL.getTypeSizeInBits(Ty: I.getType());
    Type *IntNTy = Builder.getIntNTy(N: TySize);
    Value *ValTrunc = Builder.CreateTrunc(V: WidenLoad, DestTy: IntNTy);
    Value *ValOrig = Builder.CreateBitCast(V: ValTrunc, DestTy: I.getType());
    I.replaceAllUsesWith(V: ValOrig);
    DeadVals.push_back(Elt: &I);
    return true;
  }

  return false;
}
1592
1593bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1594 Value *Cond = I.getCondition();
1595 Value *TrueVal = I.getTrueValue();
1596 Value *FalseVal = I.getFalseValue();
1597 Value *CmpVal;
1598 CmpPredicate Pred;
1599
1600 // Match fract pattern with nan check.
1601 if (!match(V: Cond, P: m_FCmp(Pred, L: m_Value(V&: CmpVal), R: m_NonNaN())))
1602 return false;
1603
1604 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: &I);
1605 if (!FPOp)
1606 return false;
1607
1608 IRBuilder<> Builder(&I);
1609 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1610
1611 auto *IITrue = dyn_cast<IntrinsicInst>(Val: TrueVal);
1612 auto *IIFalse = dyn_cast<IntrinsicInst>(Val: FalseVal);
1613
1614 Value *Fract = nullptr;
1615 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1616 CmpVal == matchFractPat(I&: *IIFalse)) {
1617 // isnan(x) ? x : fract(x)
1618 Fract = applyFractPat(Builder, FractArg: CmpVal);
1619 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1620 CmpVal == matchFractPat(I&: *IITrue)) {
1621 // !isnan(x) ? fract(x) : x
1622 Fract = applyFractPat(Builder, FractArg: CmpVal);
1623 } else
1624 return false;
1625
1626 Fract->takeName(V: &I);
1627 I.replaceAllUsesWith(V: Fract);
1628 DeadVals.push_back(Elt: &I);
1629 return true;
1630}
1631
1632static bool areInSameBB(const Value *A, const Value *B) {
1633 const auto *IA = dyn_cast<Instruction>(Val: A);
1634 const auto *IB = dyn_cast<Instruction>(Val: B);
1635 return IA && IB && IA->getParent() == IB->getParent();
1636}
1637
// Helper for breaking large PHIs that returns true when an extractelement on V
// is likely to be folded away by the DAG combiner.
static bool isInterestingPHIIncomingValue(const Value *V) {
  // Only fixed-width vector values are candidates.
  const auto *FVT = dyn_cast<FixedVectorType>(Val: V->getType());
  if (!FVT)
    return false;

  const Value *CurVal = V;

  // Check for insertelements, keeping track of the elements covered.
  BitVector EltsCovered(FVT->getNumElements());
  while (const auto *IE = dyn_cast<InsertElementInst>(Val: CurVal)) {
    const auto *Idx = dyn_cast<ConstantInt>(Val: IE->getOperand(i_nocapture: 2));

    // Non constant index/out of bounds index -> folding is unlikely.
    // The latter is more of a sanity check because canonical IR should just
    // have replaced those with poison.
    if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
      return false;

    const auto *VecSrc = IE->getOperand(i_nocapture: 0);

    // If the vector source is another instruction, it must be in the same basic
    // block. Otherwise, the DAGCombiner won't see the whole thing and is
    // unlikely to be able to do anything interesting here.
    if (isa<Instruction>(Val: VecSrc) && !areInSameBB(A: VecSrc, B: IE))
      return false;

    // Walk up the insertelement chain toward the base vector.
    CurVal = VecSrc;
    EltsCovered.set(Idx->getZExtValue());

    // All elements covered.
    if (EltsCovered.all())
      return true;
  }

  // We either didn't find a single insertelement, or the insertelement chain
  // ended before all elements were covered. Check for other interesting values.

  // Constants are always interesting because we can just constant fold the
  // extractelements.
  if (isa<Constant>(Val: CurVal))
    return true;

  // shufflevector is likely to be profitable if either operand is a constant,
  // or if either source is in the same block.
  // This is because shufflevector is most often lowered as a series of
  // insert/extract elements anyway.
  if (const auto *SV = dyn_cast<ShuffleVectorInst>(Val: CurVal)) {
    return isa<Constant>(Val: SV->getOperand(i_nocapture: 1)) ||
           areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 0)) ||
           areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 1));
  }

  return false;
}
1694
1695static void collectPHINodes(const PHINode &I,
1696 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1697 const auto [It, Inserted] = SeenPHIs.insert(Ptr: &I);
1698 if (!Inserted)
1699 return;
1700
1701 for (const Value *Inc : I.incoming_values()) {
1702 if (const auto *PhiInc = dyn_cast<PHINode>(Val: Inc))
1703 collectPHINodes(I: *PhiInc, SeenPHIs);
1704 }
1705
1706 for (const User *U : I.users()) {
1707 if (const auto *PhiU = dyn_cast<PHINode>(Val: U))
1708 collectPHINodes(I: *PhiU, SeenPHIs);
1709 }
1710}
1711
1712bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1713 // Check in the cache first.
1714 if (const auto It = BreakPhiNodesCache.find(Val: &I);
1715 It != BreakPhiNodesCache.end())
1716 return It->second;
1717
1718 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1719 // recursively consider all its users and incoming values that are also PHI
1720 // nodes. We then make a decision about all of those PHIs at once. Either they
1721 // all get broken up, or none of them do. That way, we avoid cases where a
1722 // single PHI is/is not broken and we end up reforming/exploding a vector
1723 // multiple times, or even worse, doing it in a loop.
1724 SmallPtrSet<const PHINode *, 8> WorkList;
1725 collectPHINodes(I, SeenPHIs&: WorkList);
1726
1727#ifndef NDEBUG
1728 // Check that none of the PHI nodes in the worklist are in the map. If some of
1729 // them are, it means we're not good enough at collecting related PHIs.
1730 for (const PHINode *WLP : WorkList) {
1731 assert(BreakPhiNodesCache.count(WLP) == 0);
1732 }
1733#endif
1734
1735 // To consider a PHI profitable to break, we need to see some interesting
1736 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1737 // must have one to consider all PHIs breakable.
1738 //
1739 // This threshold has been determined through performance testing.
1740 //
1741 // Note that the computation below is equivalent to
1742 //
1743 // (unsigned)ceil((K / 3.0) * 2)
1744 //
1745 // It's simply written this way to avoid mixing integral/FP arithmetic.
1746 const auto Threshold = (alignTo(Value: WorkList.size() * 2, Align: 3) / 3);
1747 unsigned NumBreakablePHIs = 0;
1748 bool CanBreak = false;
1749 for (const PHINode *Cur : WorkList) {
1750 // Don't break PHIs that have no interesting incoming values. That is, where
1751 // there is no clear opportunity to fold the "extractelement" instructions
1752 // we would add.
1753 //
1754 // Note: IC does not run after this pass, so we're only interested in the
1755 // foldings that the DAG combiner can do.
1756 if (any_of(Range: Cur->incoming_values(), P: isInterestingPHIIncomingValue)) {
1757 if (++NumBreakablePHIs >= Threshold) {
1758 CanBreak = true;
1759 break;
1760 }
1761 }
1762 }
1763
1764 for (const PHINode *Cur : WorkList)
1765 BreakPhiNodesCache[Cur] = CanBreak;
1766
1767 return CanBreak;
1768}
1769
1770/// Helper class for "break large PHIs" (visitPHINode).
1771///
1772/// This represents a slice of a PHI's incoming value, which is made up of:
1773/// - The type of the slice (Ty)
1774/// - The index in the incoming value's vector where the slice starts (Idx)
1775/// - The number of elements in the slice (NumElts).
1776/// It also keeps track of the NewPHI node inserted for this particular slice.
1777///
1778/// Slice examples:
1779/// <4 x i64> -> Split into four i64 slices.
1780/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1781/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1782/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1783class VectorSlice {
1784public:
1785 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1786 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1787
1788 Type *Ty = nullptr;
1789 unsigned Idx = 0;
1790 unsigned NumElts = 0;
1791 PHINode *NewPHI = nullptr;
1792
1793 /// Slice \p Inc according to the information contained within this slice.
1794 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1795 /// pair, it returns the same Sliced value as well.
1796 ///
1797 /// Note this *intentionally* does not return the same value for, say,
1798 /// [%bb.0, %0] & [%bb.1, %0] as:
1799 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1800 /// the value in bb.1 may not be reachable from bb.0 if it's its
1801 /// predecessor.)
1802 /// - We also want to make our extract instructions as local as possible so
1803 /// the DAG has better chances of folding them out. Duplicating them like
1804 /// that is beneficial in that regard.
1805 ///
1806 /// This is both a minor optimization to avoid creating duplicate
1807 /// instructions, but also a requirement for correctness. It is not forbidden
1808 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1809 /// returned a new value each time, those previously identical pairs would all
1810 /// have different incoming values (from the same block) and it'd cause a "PHI
1811 /// node has multiple entries for the same basic block with different incoming
1812 /// values!" verifier error.
1813 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1814 Value *&Res = SlicedVals[{BB, Inc}];
1815 if (Res)
1816 return Res;
1817
1818 IRBuilder<> B(BB->getTerminator());
1819 if (Instruction *IncInst = dyn_cast<Instruction>(Val: Inc))
1820 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1821
1822 if (NumElts > 1) {
1823 SmallVector<int, 4> Mask;
1824 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1825 Mask.push_back(Elt: K);
1826 Res = B.CreateShuffleVector(V: Inc, Mask, Name: NewValName);
1827 } else
1828 Res = B.CreateExtractElement(Vec: Inc, Idx, Name: NewValName);
1829
1830 return Res;
1831 }
1832
1833private:
1834 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1835};
1836
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
  // Break-up fixed-vector PHIs into smaller pieces.
  // Default threshold is 32, so it breaks up any vector that's >32 bits into
  // its elements, or into 32-bit pieces (for 8/16 bit elts).
  //
  // This is only helpful for DAGISel because it doesn't handle large PHIs as
  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
  // With large, odd-sized PHIs we may end up needing many `build_vector`
  // operations with most elements being "undef". This inhibits a lot of
  // optimization opportunities and can result in unreasonably high register
  // pressure and the inevitable stack spilling.
  if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
    return false;

  // Only fixed vectors larger than the size threshold are worth splitting.
  FixedVectorType *FVT = dyn_cast<FixedVectorType>(Val: I.getType());
  if (!FVT || FVT->getNumElements() == 1 ||
      DL.getTypeSizeInBits(Ty: FVT) <= BreakLargePHIsThreshold)
    return false;

  // Profitability check; the decision is cached for the whole chain of
  // related PHIs (see canBreakPHINode).
  if (!ForceBreakLargePHIs && !canBreakPHINode(I))
    return false;

  std::vector<VectorSlice> Slices;

  Type *EltTy = FVT->getElementType();
  {
    unsigned Idx = 0;
    // For 8/16 bits type, don't scalarize fully but break it up into as many
    // 32-bit slices as we can, and scalarize the tail.
    const unsigned EltSize = DL.getTypeSizeInBits(Ty: EltTy);
    const unsigned NumElts = FVT->getNumElements();
    if (EltSize == 8 || EltSize == 16) {
      const unsigned SubVecSize = (32 / EltSize);
      Type *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: SubVecSize);
      for (unsigned End = alignDown(Value: NumElts, Align: SubVecSize); Idx < End;
           Idx += SubVecSize)
        Slices.emplace_back(args&: SubVecTy, args&: Idx, args: SubVecSize);
    }

    // Scalarize all remaining elements.
    for (; Idx < NumElts; ++Idx)
      Slices.emplace_back(args&: EltTy, args&: Idx, args: 1);
  }

  // The threshold check above guarantees the vector actually gets split into
  // more than one piece.
  assert(Slices.size() > 1);

  // Create one PHI per vector piece. The "VectorSlice" class takes care of
  // creating the necessary instruction to extract the relevant slices of each
  // incoming value.
  IRBuilder<> B(I.getParent());
  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;
  for (VectorSlice &S : Slices) {
    // We need to reset the builder on each iteration, because getSlicedVal may
    // have inserted something into I's BB.
    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(Ty: S.Ty, NumReservedValues: I.getNumIncomingValues());

    for (const auto &[Idx, BB] : enumerate(First: I.blocks())) {
      S.NewPHI->addIncoming(V: S.getSlicedVal(BB, Inc: I.getIncomingValue(i: Idx),
                                            NewValName: "largephi.extractslice" +
                                                std::to_string(val: IncNameSuffix++)),
                            BB);
    }
  }

  // And replace this PHI with a vector of all the previous PHI values.
  Value *Vec = PoisonValue::get(T: FVT);
  unsigned NameSuffix = 0;
  for (VectorSlice &S : Slices) {
    const auto ValName = "largephi.insertslice" + std::to_string(val: NameSuffix++);
    if (S.NumElts > 1)
      Vec = B.CreateInsertVector(DstType: FVT, SrcVec: Vec, SubVec: S.NewPHI, Idx: S.Idx, Name: ValName);
    else
      Vec = B.CreateInsertElement(Vec, NewElt: S.NewPHI, Idx: S.Idx, Name: ValName);
  }

  I.replaceAllUsesWith(V: Vec);
  DeadVals.push_back(Elt: &I);
  return true;
}
1919
1920/// \param V Value to check
1921/// \param DL DataLayout
1922/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1923/// \param AS Target Address Space
1924/// \return true if \p V cannot be the null value of \p AS, false otherwise.
1925static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1926 const AMDGPUTargetMachine &TM, unsigned AS) {
1927 // Pointer cannot be null if it's a block address, GV or alloca.
1928 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1929 // it as the symbol could be null in such cases.
1930 if (isa<BlockAddress, GlobalValue, AllocaInst>(Val: V))
1931 return true;
1932
1933 // Check nonnull arguments.
1934 if (const auto *Arg = dyn_cast<Argument>(Val: V); Arg && Arg->hasNonNullAttr())
1935 return true;
1936
1937 // Check nonnull loads.
1938 if (const auto *Load = dyn_cast<LoadInst>(Val: V);
1939 Load && Load->hasMetadata(KindID: LLVMContext::MD_nonnull))
1940 return true;
1941
1942 // getUnderlyingObject may have looked through another addrspacecast, although
1943 // the optimizable situations most likely folded out by now.
1944 if (AS != cast<PointerType>(Val: V->getType())->getAddressSpace())
1945 return false;
1946
1947 // TODO: Calls that return nonnull?
1948
1949 // For all other things, use KnownBits.
1950 // We either use 0 or all bits set to indicate null, so check whether the
1951 // value can be zero or all ones.
1952 //
1953 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
1954 // address spaces have non-zero null values.
1955 auto SrcPtrKB = computeKnownBits(V, DL);
1956 const auto NullVal = TM.getNullPointerValue(AddrSpace: AS);
1957
1958 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
1959 assert((NullVal == 0 || NullVal == -1) &&
1960 "don't know how to check for this null value!");
1961 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
1962}
1963
1964bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
1965 // Intrinsic doesn't support vectors, also it seems that it's often difficult
1966 // to prove that a vector cannot have any nulls in it so it's unclear if it's
1967 // worth supporting.
1968 if (I.getType()->isVectorTy())
1969 return false;
1970
1971 // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
1972 // This is only worthwhile for casts from/to priv/local to flat.
1973 const unsigned SrcAS = I.getSrcAddressSpace();
1974 const unsigned DstAS = I.getDestAddressSpace();
1975
1976 bool CanLower = false;
1977 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1978 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
1979 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
1980 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
1981 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1982 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
1983 if (!CanLower)
1984 return false;
1985
1986 SmallVector<const Value *, 4> WorkList;
1987 getUnderlyingObjects(V: I.getOperand(i_nocapture: 0), Objects&: WorkList);
1988 if (!all_of(Range&: WorkList, P: [&](const Value *V) {
1989 return isPtrKnownNeverNull(V, DL, TM, AS: SrcAS);
1990 }))
1991 return false;
1992
1993 IRBuilder<> B(&I);
1994 auto *Intrin = B.CreateIntrinsic(
1995 RetTy: I.getType(), ID: Intrinsic::amdgcn_addrspacecast_nonnull, Args: {I.getOperand(i_nocapture: 0)});
1996 I.replaceAllUsesWith(V: Intrin);
1997 DeadVals.push_back(Elt: &I);
1998 return true;
1999}
2000
/// Dispatch the intrinsic calls this pass knows how to optimize to their
/// specific visitor. \return true if \p I was rewritten/replaced.
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  Intrinsic::ID IID = I.getIntrinsicID();
  switch (IID) {
  case Intrinsic::minnum:
  case Intrinsic::minimumnum:
  case Intrinsic::minimum:
    // All fmin variants may form the non-nan fract pattern; see matchFractPat.
    return visitFMinLike(I);
  case Intrinsic::sqrt:
    return visitSqrt(I);
  case Intrinsic::log:
  case Intrinsic::log10:
    return visitLog(Log&: cast<FPMathOperator>(Val&: I), IID);
  case Intrinsic::log2:
    // No reason to handle log2.
    return false;
  case Intrinsic::amdgcn_mbcnt_lo:
    return visitMbcntLo(I);
  case Intrinsic::amdgcn_mbcnt_hi:
    return visitMbcntHi(I);
  default:
    return false;
  }
}
2024
/// Match non-nan fract pattern.
/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
/// minimum(fsub(x, floor(x)), nextafter(1.0, -1.0))
///
/// If fract is a useful instruction for the subtarget. Does not account for the
/// nan handling; the instruction has a nan check on the input value.
///
/// \return the fract source x on a successful match, nullptr otherwise.
Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
  // Don't form fract on subtargets where the instruction is broken.
  if (ST.hasFractBug())
    return nullptr;

  Intrinsic::ID IID = I.getIntrinsicID();

  // The value is only used in contexts where we know the input isn't a nan, so
  // any of the fmin variants are fine.
  if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
      IID != Intrinsic::minimumnum)
    return nullptr;

  Type *Ty = I.getType();
  if (!isLegalFloatingTy(Ty: Ty->getScalarType()))
    return nullptr;

  Value *Arg0 = I.getArgOperand(i: 0);
  Value *Arg1 = I.getArgOperand(i: 1);

  // The second operand must be a floating-point constant...
  const APFloat *C;
  if (!match(V: Arg1, P: m_APFloat(Res&: C)))
    return nullptr;

  // ...equal to nextafter(1.0, -1.0). Convert 1.0 into the constant's own
  // semantics first so the comparison below is exact.
  APFloat One(1.0);
  bool LosesInfo;
  One.convert(ToSemantics: C->getSemantics(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);

  // Match nextafter(1.0, -1): next(true) steps One down to the largest value
  // strictly below 1.0.
  One.next(nextDown: true);
  if (One != *C)
    return nullptr;

  // Finally, the first operand must be x - floor(x); return x on success.
  Value *FloorSrc;
  if (match(V: Arg0, P: m_FSub(L: m_Value(V&: FloorSrc),
                     R: m_Intrinsic<Intrinsic::floor>(Op0: m_Deferred(V: FloorSrc)))))
    return FloorSrc;
  return nullptr;
}
2070
2071Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2072 Value *FractArg) {
2073 SmallVector<Value *, 4> FractVals;
2074 extractValues(Builder, Values&: FractVals, V: FractArg);
2075
2076 SmallVector<Value *, 4> ResultVals(FractVals.size());
2077
2078 Type *Ty = FractArg->getType()->getScalarType();
2079 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2080 ResultVals[I] =
2081 Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_fract, Types: {Ty}, Args: {FractVals[I]});
2082 }
2083
2084 return insertValues(Builder, Ty: FractArg->getType(), Values&: ResultVals);
2085}
2086
2087bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2088 Value *FractArg = matchFractPat(I);
2089 if (!FractArg)
2090 return false;
2091
2092 // Match pattern for fract intrinsic in contexts where the nan check has been
2093 // optimized out (and hope the knowledge the source can't be nan wasn't lost).
2094 if (!I.hasNoNaNs() && !isKnownNeverNaN(V: FractArg, SQ: SQ.getWithInstruction(I: &I)))
2095 return false;
2096
2097 IRBuilder<> Builder(&I);
2098 FastMathFlags FMF = I.getFastMathFlags();
2099 FMF.setNoNaNs();
2100 Builder.setFastMathFlags(FMF);
2101
2102 Value *Fract = applyFractPat(Builder, FractArg);
2103 Fract->takeName(V: &I);
2104 I.replaceAllUsesWith(V: Fract);
2105 DeadVals.push_back(Elt: &I);
2106 return true;
2107}
2108
2109// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2110bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2111 Type *Ty = Sqrt.getType()->getScalarType();
2112 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2113 return false;
2114
2115 const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &Sqrt);
2116 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2117
2118 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2119 // of fast llvm.sqrt will give the raw instruction anyway.
2120 if (SqrtFMF.approxFunc())
2121 return false;
2122
2123 const float ReqdAccuracy = FPOp->getFPAccuracy();
2124
2125 // Defer correctly rounded expansion to codegen.
2126 if (ReqdAccuracy < 1.0f)
2127 return false;
2128
2129 Value *SrcVal = Sqrt.getOperand(i_nocapture: 0);
2130 bool CanTreatAsDAZ = canIgnoreDenormalInput(V: SrcVal, CtxI: &Sqrt);
2131
2132 // The raw instruction is 1 ulp, but the correction for denormal handling
2133 // brings it to 2.
2134 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2135 return false;
2136
2137 IRBuilder<> Builder(&Sqrt);
2138 SmallVector<Value *, 4> SrcVals;
2139 extractValues(Builder, Values&: SrcVals, V: SrcVal);
2140
2141 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2142 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2143 if (CanTreatAsDAZ)
2144 ResultVals[I] = Builder.CreateCall(Callee: getSqrtF32(), Args: SrcVals[I]);
2145 else
2146 ResultVals[I] = emitSqrtIEEE2ULP(Builder, Src: SrcVals[I], FMF: SqrtFMF);
2147 }
2148
2149 Value *NewSqrt = insertValues(Builder, Ty: Sqrt.getType(), Values&: ResultVals);
2150 NewSqrt->takeName(V: &Sqrt);
2151 Sqrt.replaceAllUsesWith(V: NewSqrt);
2152 DeadVals.push_back(Elt: &Sqrt);
2153 return true;
2154}
2155
2156/// Replace log and log10 intrinsic calls based on fpmath metadata.
2157bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
2158 Intrinsic::ID IID) {
2159 Type *Ty = Log.getType();
2160 if (!Ty->getScalarType()->isHalfTy() || !ST.has16BitInsts())
2161 return false;
2162
2163 FastMathFlags FMF = Log.getFastMathFlags();
2164
2165 // Defer fast math cases to codegen.
2166 if (FMF.approxFunc())
2167 return false;
2168
2169 // Limit experimentally determined from OpenCL conformance test (1.79)
2170 if (Log.getFPAccuracy() < 1.80f)
2171 return false;
2172
2173 IRBuilder<> Builder(&cast<CallInst>(Val&: Log));
2174
2175 // Use the generic intrinsic for convenience in the vector case. Codegen will
2176 // recognize the denormal handling is not necessary from the fpext.
2177 // TODO: Move to generic code
2178 Value *Log2 =
2179 Builder.CreateUnaryIntrinsic(ID: Intrinsic::log2, V: Log.getOperand(i: 0), FMFSource: FMF);
2180
2181 double Log2BaseInverted =
2182 IID == Intrinsic::log10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2183 Value *Mul =
2184 Builder.CreateFMulFMF(L: Log2, R: ConstantFP::get(Ty, V: Log2BaseInverted), FMFSource: FMF);
2185
2186 Mul->takeName(V: &Log);
2187
2188 Log.replaceAllUsesWith(V: Mul);
2189 DeadVals.push_back(Elt: &Log);
2190 return true;
2191}
2192
2193bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2194 if (skipFunction(F))
2195 return false;
2196
2197 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2198 if (!TPC)
2199 return false;
2200
2201 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2202 const TargetLibraryInfo *TLI =
2203 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2204 AssumptionCache *AC =
2205 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2206 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2207 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2208 const UniformityInfo &UA =
2209 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2210 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2211}
2212
2213PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2214 FunctionAnalysisManager &FAM) {
2215 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2216 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(IR&: F);
2217 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(IR&: F);
2218 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F);
2219 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(IR&: F);
2220 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2221 if (!Impl.run())
2222 return PreservedAnalyses::all();
2223 PreservedAnalyses PA = PreservedAnalyses::none();
2224 if (!Impl.FlowChanged)
2225 PA.preserveSet<CFGAnalyses>();
2226 return PA;
2227}
2228
// Legacy pass-manager registration, including this pass's required analyses.
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)
2236
2237/// Create a workitem.id.x intrinsic call with range metadata.
2238CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
2239 CallInst *Tid = B.CreateIntrinsic(ID: Intrinsic::amdgcn_workitem_id_x, Args: {});
2240 ST.makeLIDRangeMetadata(I: Tid);
2241 return Tid;
2242}
2243
2244/// Replace the instruction with a direct workitem.id.x call.
2245void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
2246 IRBuilder<> B(&I);
2247 CallInst *Tid = createWorkitemIdX(B);
2248 BasicBlock::iterator BI(&I);
2249 ReplaceInstWithValue(BI, V: Tid);
2250}
2251
2252/// Replace the instruction with (workitem.id.x & mask).
2253void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2254 Instruction &I, unsigned WaveSize) const {
2255 IRBuilder<> B(&I);
2256 CallInst *Tid = createWorkitemIdX(B);
2257 Constant *Mask = ConstantInt::get(Ty: Tid->getType(), V: WaveSize - 1);
2258 Value *AndInst = B.CreateAnd(LHS: Tid, RHS: Mask);
2259 BasicBlock::iterator BI(&I);
2260 ReplaceInstWithValue(BI, V: AndInst);
2261}
2262
2263/// Try to optimize mbcnt instruction by replacing with workitem.id.x when
2264/// work group size allows direct computation of lane ID.
2265/// Returns true if optimization was applied, false otherwise.
2266bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
2267 unsigned Wave) const {
2268 std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, Dim: 0);
2269 if (!MaybeX)
2270 return false;
2271
2272 // When work group size == wave_size, each work group contains exactly one
2273 // wave, so the instruction can be replaced with workitem.id.x directly.
2274 if (*MaybeX == Wave) {
2275 replaceWithWorkitemIdX(I);
2276 return true;
2277 }
2278
2279 // When work group evenly splits into waves, compute lane ID within wave
2280 // using bit masking: lane_id = workitem.id.x & (wave_size - 1).
2281 if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/REquiresUniformYZ: true)) {
2282 replaceWithMaskedWorkitemIdX(I, WaveSize: Wave);
2283 return true;
2284 }
2285
2286 return false;
2287}
2288
2289/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.
2290bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
2291 // This optimization only applies to wave32 targets where mbcnt.lo operates on
2292 // the full execution mask.
2293 if (!ST.isWave32())
2294 return false;
2295
2296 // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with
2297 // lower IDs.
2298 if (!match(V: &I,
2299 P: m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(Op0: m_AllOnes(), Op1: m_Zero())))
2300 return false;
2301
2302 return tryReplaceWithWorkitemId(I, Wave: ST.getWavefrontSize());
2303}
2304
2305/// Optimize mbcnt.hi calls for lane ID computation.
2306bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
2307 // Abort if wave size is not known at compile time.
2308 if (!ST.isWaveSizeKnown())
2309 return false;
2310
2311 unsigned Wave = ST.getWavefrontSize();
2312
2313 // On wave32, the upper 32 bits of execution mask are always 0, so
2314 // mbcnt.hi(mask, val) always returns val unchanged.
2315 if (ST.isWave32()) {
2316 if (auto MaybeX = ST.getReqdWorkGroupSize(F, Dim: 0)) {
2317 // Replace mbcnt.hi(mask, val) with val only when work group size matches
2318 // wave size (single wave per work group).
2319 if (*MaybeX == Wave) {
2320 BasicBlock::iterator BI(&I);
2321 ReplaceInstWithValue(BI, V: I.getArgOperand(i: 1));
2322 return true;
2323 }
2324 }
2325 }
2326
2327 // Optimize the complete lane ID computation pattern:
2328 // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs
2329 // across the full execution mask.
2330 using namespace PatternMatch;
2331
2332 // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
2333 if (!match(V: &I, P: m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
2334 Op0: m_AllOnes(), Op1: m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(
2335 Op0: m_AllOnes(), Op1: m_Zero()))))
2336 return false;
2337
2338 return tryReplaceWithWorkitemId(I, Wave);
2339}
2340
// Unique address used by the legacy pass manager to identify this pass.
char AMDGPUCodeGenPrepare::ID = 0;

/// Factory for the legacy PM wrapper of this pass.
FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}
2346