1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
17#include "SIModeRegisterDefaults.h"
18#include "llvm/Analysis/AssumptionCache.h"
19#include "llvm/Analysis/ConstantFolding.h"
20#include "llvm/Analysis/TargetLibraryInfo.h"
21#include "llvm/Analysis/UniformityAnalysis.h"
22#include "llvm/Analysis/ValueTracking.h"
23#include "llvm/CodeGen/TargetPassConfig.h"
24#include "llvm/IR/Dominators.h"
25#include "llvm/IR/IRBuilder.h"
26#include "llvm/IR/InstVisitor.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
28#include "llvm/IR/PatternMatch.h"
29#include "llvm/Pass.h"
30#include "llvm/Support/KnownBits.h"
31#include "llvm/Support/KnownFPClass.h"
32#include "llvm/Transforms/Utils/IntegerDivision.h"
33#include "llvm/Transforms/Utils/Local.h"
34
35#define DEBUG_TYPE "amdgpu-codegenprepare"
36
37using namespace llvm;
38using namespace llvm::PatternMatch;
39
40namespace {
41
42static cl::opt<bool> WidenLoads(
43 "amdgpu-codegenprepare-widen-constant-loads",
44 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
45 cl::ReallyHidden,
46 cl::init(Val: false));
47
48static cl::opt<bool> Widen16BitOps(
49 "amdgpu-codegenprepare-widen-16-bit-ops",
50 cl::desc(
51 "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
52 cl::ReallyHidden, cl::init(Val: false));
53
54static cl::opt<bool>
55 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
56 cl::desc("Break large PHI nodes for DAGISel"),
57 cl::ReallyHidden, cl::init(Val: true));
58
59static cl::opt<bool>
60 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc("For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
63 cl::ReallyHidden, cl::init(Val: false));
64
65static cl::opt<unsigned> BreakLargePHIsThreshold(
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
68 cl::ReallyHidden, cl::init(Val: 32));
69
70static cl::opt<bool> UseMul24Intrin(
71 "amdgpu-codegenprepare-mul24",
72 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
73 cl::ReallyHidden,
74 cl::init(Val: true));
75
76// Legalize 64-bit division by using the generic IR expansion.
77static cl::opt<bool> ExpandDiv64InIR(
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80 cl::ReallyHidden,
81 cl::init(Val: false));
82
83// Leave all division operations as they are. This supersedes ExpandDiv64InIR
84// and is used for testing the legalizer.
85static cl::opt<bool> DisableIDivExpand(
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88 cl::ReallyHidden,
89 cl::init(Val: false));
90
91// Disable processing of fdiv so we can better test the backend implementations.
92static cl::opt<bool> DisableFDivExpand(
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
95 cl::ReallyHidden,
96 cl::init(Val: false));
97
98static bool hasUnsafeFPMath(const Function &F) {
99 return F.getFnAttribute(Kind: "unsafe-fp-math").getValueAsBool();
100}
101
102class AMDGPUCodeGenPrepareImpl
103 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
104public:
105 Function &F;
106 const GCNSubtarget &ST;
107 const AMDGPUTargetMachine &TM;
108 const TargetLibraryInfo *TLI;
109 AssumptionCache *AC;
110 const DominatorTree *DT;
111 const UniformityInfo &UA;
112 const DataLayout &DL;
113 const bool HasUnsafeFPMath;
114 const bool HasFP32DenormalFlush;
115 bool FlowChanged = false;
116 mutable Function *SqrtF32 = nullptr;
117 mutable Function *LdexpF32 = nullptr;
118
119 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
120
121 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
122 const TargetLibraryInfo *TLI, AssumptionCache *AC,
123 const DominatorTree *DT, const UniformityInfo &UA)
124 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
125 DT(DT), UA(UA), DL(F.getDataLayout()),
126 HasUnsafeFPMath(hasUnsafeFPMath(F)),
127 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
128 DenormalMode::getPreserveSign()) {}
129
130 Function *getSqrtF32() const {
131 if (SqrtF32)
132 return SqrtF32;
133
134 LLVMContext &Ctx = F.getContext();
135 SqrtF32 = Intrinsic::getOrInsertDeclaration(
136 M: F.getParent(), id: Intrinsic::amdgcn_sqrt, Tys: {Type::getFloatTy(C&: Ctx)});
137 return SqrtF32;
138 }
139
140 Function *getLdexpF32() const {
141 if (LdexpF32)
142 return LdexpF32;
143
144 LLVMContext &Ctx = F.getContext();
145 LdexpF32 = Intrinsic::getOrInsertDeclaration(
146 M: F.getParent(), id: Intrinsic::ldexp,
147 Tys: {Type::getFloatTy(C&: Ctx), Type::getInt32Ty(C&: Ctx)});
148 return LdexpF32;
149 }
150
151 bool canBreakPHINode(const PHINode &I);
152
153 /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
154 /// binary operation \p V.
155 ///
156 /// \returns Binary operation \p V.
157 /// \returns \p T's base element bit width.
158 unsigned getBaseElementBitWidth(const Type *T) const;
159
160 /// \returns Equivalent 32 bit integer type for given type \p T. For example,
161 /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
162 /// is returned.
163 Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
164
165 /// \returns True if binary operation \p I is a signed binary operation, false
166 /// otherwise.
167 bool isSigned(const BinaryOperator &I) const;
168
169 /// \returns True if the condition of 'select' operation \p I comes from a
170 /// signed 'icmp' operation, false otherwise.
171 bool isSigned(const SelectInst &I) const;
172
173 /// \returns True if type \p T needs to be promoted to 32 bit integer type,
174 /// false otherwise.
175 bool needsPromotionToI32(const Type *T) const;
176
177 /// Return true if \p T is a legal scalar floating point type.
178 bool isLegalFloatingTy(const Type *T) const;
179
180 /// Wrapper to pass all the arguments to computeKnownFPClass
181 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
182 const Instruction *CtxI) const {
183 return llvm::computeKnownFPClass(V, DL, InterestedClasses: Interested, TLI, AC, CxtI: CtxI, DT);
184 }
185
186 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
187 return HasFP32DenormalFlush ||
188 computeKnownFPClass(V, Interested: fcSubnormal, CtxI).isKnownNeverSubnormal();
189 }
190
191 /// Promotes uniform binary operation \p I to equivalent 32 bit binary
192 /// operation.
193 ///
194 /// \details \p I's base element bit width must be greater than 1 and less
195 /// than or equal 16. Promotion is done by sign or zero extending operands to
196 /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
197 /// truncating the result of 32 bit binary operation back to \p I's original
198 /// type. Division operation is not promoted.
199 ///
200 /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
201 /// false otherwise.
202 bool promoteUniformOpToI32(BinaryOperator &I) const;
203
204 /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
205 ///
206 /// \details \p I's base element bit width must be greater than 1 and less
207 /// than or equal 16. Promotion is done by sign or zero extending operands to
208 /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
209 ///
210 /// \returns True.
211 bool promoteUniformOpToI32(ICmpInst &I) const;
212
213 /// Promotes uniform 'select' operation \p I to 32 bit 'select'
214 /// operation.
215 ///
216 /// \details \p I's base element bit width must be greater than 1 and less
217 /// than or equal 16. Promotion is done by sign or zero extending operands to
218 /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
219 /// result of 32 bit 'select' operation back to \p I's original type.
220 ///
221 /// \returns True.
222 bool promoteUniformOpToI32(SelectInst &I) const;
223
224 /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
225 /// intrinsic.
226 ///
227 /// \details \p I's base element bit width must be greater than 1 and less
228 /// than or equal 16. Promotion is done by zero extending the operand to 32
229 /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
230 /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
231 /// shift amount is 32 minus \p I's base element bit width), and truncating
232 /// the result of the shift operation back to \p I's original type.
233 ///
234 /// \returns True.
235 bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
236
237 /// \returns The minimum number of bits needed to store the value of \Op as an
238 /// unsigned integer. Truncating to this size and then zero-extending to
239 /// the original will not change the value.
240 unsigned numBitsUnsigned(Value *Op) const;
241
242 /// \returns The minimum number of bits needed to store the value of \Op as a
243 /// signed integer. Truncating to this size and then sign-extending to
244 /// the original size will not change the value.
245 unsigned numBitsSigned(Value *Op) const;
246
247 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
248 /// SelectionDAG has an issue where an and asserting the bits are known
249 bool replaceMulWithMul24(BinaryOperator &I) const;
250
251 /// Perform same function as equivalently named function in DAGCombiner. Since
252 /// we expand some divisions here, we need to perform this before obscuring.
253 bool foldBinOpIntoSelect(BinaryOperator &I) const;
254
255 bool divHasSpecialOptimization(BinaryOperator &I,
256 Value *Num, Value *Den) const;
257 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
258 unsigned MaxDivBits, bool Signed) const;
259
260 /// Expands 24 bit div or rem.
261 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
262 Value *Num, Value *Den,
263 bool IsDiv, bool IsSigned) const;
264
265 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
266 Value *Num, Value *Den, unsigned NumBits,
267 bool IsDiv, bool IsSigned) const;
268
269 /// Expands 32 bit div or rem.
270 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
271 Value *Num, Value *Den) const;
272
273 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
274 Value *Num, Value *Den) const;
275 void expandDivRem64(BinaryOperator &I) const;
276
277 /// Widen a scalar load.
278 ///
279 /// \details \p Widen scalar load for uniform, small type loads from constant
280 // memory / to a full 32-bits and then truncate the input to allow a scalar
281 // load instead of a vector load.
282 //
283 /// \returns True.
284
285 bool canWidenScalarExtLoad(LoadInst &I) const;
286
287 Value *matchFractPat(IntrinsicInst &I);
288 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
289
290 bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
291 FastMathFlags SqrtFMF) const;
292
293 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
294 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
295 const Instruction *CtxI) const;
296
297 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
298 FastMathFlags FMF, const Instruction *CtxI) const;
299 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
300 float ReqdAccuracy) const;
301
302 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
303 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
304 Value *RsqOp, const Instruction *FDiv,
305 float ReqdAccuracy) const;
306
307 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
308 Value *Src) const;
309
310 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
311 bool IsNegative) const;
312 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
313 FastMathFlags FMF) const;
314 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
315 FastMathFlags FMF) const;
316
317public:
318 bool visitFDiv(BinaryOperator &I);
319
320 bool visitInstruction(Instruction &I) { return false; }
321 bool visitBinaryOperator(BinaryOperator &I);
322 bool visitLoadInst(LoadInst &I);
323 bool visitICmpInst(ICmpInst &I);
324 bool visitSelectInst(SelectInst &I);
325 bool visitPHINode(PHINode &I);
326 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
327
328 bool visitIntrinsicInst(IntrinsicInst &I);
329 bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
330 bool visitFMinLike(IntrinsicInst &I);
331 bool visitSqrt(IntrinsicInst &I);
332 bool run();
333};
334
335class AMDGPUCodeGenPrepare : public FunctionPass {
336public:
337 static char ID;
338 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
339 void getAnalysisUsage(AnalysisUsage &AU) const override {
340 AU.addRequired<AssumptionCacheTracker>();
341 AU.addRequired<UniformityInfoWrapperPass>();
342 AU.addRequired<TargetLibraryInfoWrapperPass>();
343
344 // FIXME: Division expansion needs to preserve the dominator tree.
345 if (!ExpandDiv64InIR)
346 AU.setPreservesAll();
347 }
348 bool runOnFunction(Function &F) override;
349 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
350};
351
352} // end anonymous namespace
353
354bool AMDGPUCodeGenPrepareImpl::run() {
355 BreakPhiNodesCache.clear();
356 bool MadeChange = false;
357
358 Function::iterator NextBB;
359 for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
360 BasicBlock *BB = &*FI;
361 NextBB = std::next(x: FI);
362
363 BasicBlock::iterator Next;
364 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
365 I = Next) {
366 Next = std::next(x: I);
367
368 MadeChange |= visit(I&: *I);
369
370 if (Next != E) { // Control flow changed
371 BasicBlock *NextInstBB = Next->getParent();
372 if (NextInstBB != BB) {
373 BB = NextInstBB;
374 E = BB->end();
375 FE = F.end();
376 }
377 }
378 }
379 }
380 return MadeChange;
381}
382
383unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
384 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
385
386 if (T->isIntegerTy())
387 return T->getIntegerBitWidth();
388 return cast<VectorType>(Val: T)->getElementType()->getIntegerBitWidth();
389}
390
391Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
392 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
393
394 if (T->isIntegerTy())
395 return B.getInt32Ty();
396 return FixedVectorType::get(ElementType: B.getInt32Ty(), FVTy: cast<FixedVectorType>(Val: T));
397}
398
399bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
400 return I.getOpcode() == Instruction::AShr ||
401 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
402}
403
404bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
405 return isa<ICmpInst>(Val: I.getOperand(i_nocapture: 0)) &&
406 cast<ICmpInst>(Val: I.getOperand(i_nocapture: 0))->isSigned();
407}
408
409bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
410 if (!Widen16BitOps)
411 return false;
412
413 const IntegerType *IntTy = dyn_cast<IntegerType>(Val: T);
414 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
415 return true;
416
417 if (const VectorType *VT = dyn_cast<VectorType>(Val: T)) {
418 // TODO: The set of packed operations is more limited, so may want to
419 // promote some anyway.
420 if (ST.hasVOP3PInsts())
421 return false;
422
423 return needsPromotionToI32(T: VT->getElementType());
424 }
425
426 return false;
427}
428
429bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
430 return Ty->isFloatTy() || Ty->isDoubleTy() ||
431 (Ty->isHalfTy() && ST.has16BitInsts());
432}
433
434// Return true if the op promoted to i32 should have nsw set.
435static bool promotedOpIsNSW(const Instruction &I) {
436 switch (I.getOpcode()) {
437 case Instruction::Shl:
438 case Instruction::Add:
439 case Instruction::Sub:
440 return true;
441 case Instruction::Mul:
442 return I.hasNoUnsignedWrap();
443 default:
444 return false;
445 }
446}
447
448// Return true if the op promoted to i32 should have nuw set.
449static bool promotedOpIsNUW(const Instruction &I) {
450 switch (I.getOpcode()) {
451 case Instruction::Shl:
452 case Instruction::Add:
453 case Instruction::Mul:
454 return true;
455 case Instruction::Sub:
456 return I.hasNoUnsignedWrap();
457 default:
458 return false;
459 }
460}
461
462bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
463 Type *Ty = I.getType();
464 int TySize = DL.getTypeSizeInBits(Ty);
465 Align Alignment = DL.getValueOrABITypeAlignment(Alignment: I.getAlign(), Ty);
466
467 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(I: &I);
468}
469
470bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
471 assert(needsPromotionToI32(I.getType()) &&
472 "I does not need promotion to i32");
473
474 if (I.getOpcode() == Instruction::SDiv ||
475 I.getOpcode() == Instruction::UDiv ||
476 I.getOpcode() == Instruction::SRem ||
477 I.getOpcode() == Instruction::URem)
478 return false;
479
480 IRBuilder<> Builder(&I);
481 Builder.SetCurrentDebugLocation(I.getDebugLoc());
482
483 Type *I32Ty = getI32Ty(B&: Builder, T: I.getType());
484 Value *ExtOp0 = nullptr;
485 Value *ExtOp1 = nullptr;
486 Value *ExtRes = nullptr;
487 Value *TruncRes = nullptr;
488
489 if (isSigned(I)) {
490 ExtOp0 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
491 ExtOp1 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
492 } else {
493 ExtOp0 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
494 ExtOp1 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
495 }
496
497 ExtRes = Builder.CreateBinOp(Opc: I.getOpcode(), LHS: ExtOp0, RHS: ExtOp1);
498 if (Instruction *Inst = dyn_cast<Instruction>(Val: ExtRes)) {
499 if (promotedOpIsNSW(I: cast<Instruction>(Val&: I)))
500 Inst->setHasNoSignedWrap();
501
502 if (promotedOpIsNUW(I: cast<Instruction>(Val&: I)))
503 Inst->setHasNoUnsignedWrap();
504
505 if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(Val: &I))
506 Inst->setIsExact(ExactOp->isExact());
507 }
508
509 TruncRes = Builder.CreateTrunc(V: ExtRes, DestTy: I.getType());
510
511 I.replaceAllUsesWith(V: TruncRes);
512 I.eraseFromParent();
513
514 return true;
515}
516
517bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
518 assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
519 "I does not need promotion to i32");
520
521 IRBuilder<> Builder(&I);
522 Builder.SetCurrentDebugLocation(I.getDebugLoc());
523
524 Type *I32Ty = getI32Ty(B&: Builder, T: I.getOperand(i_nocapture: 0)->getType());
525 Value *ExtOp0 = nullptr;
526 Value *ExtOp1 = nullptr;
527 Value *NewICmp = nullptr;
528
529 if (I.isSigned()) {
530 ExtOp0 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
531 ExtOp1 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
532 } else {
533 ExtOp0 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
534 ExtOp1 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
535 }
536 NewICmp = Builder.CreateICmp(P: I.getPredicate(), LHS: ExtOp0, RHS: ExtOp1);
537
538 I.replaceAllUsesWith(V: NewICmp);
539 I.eraseFromParent();
540
541 return true;
542}
543
544bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
545 assert(needsPromotionToI32(I.getType()) &&
546 "I does not need promotion to i32");
547
548 IRBuilder<> Builder(&I);
549 Builder.SetCurrentDebugLocation(I.getDebugLoc());
550
551 Type *I32Ty = getI32Ty(B&: Builder, T: I.getType());
552 Value *ExtOp1 = nullptr;
553 Value *ExtOp2 = nullptr;
554 Value *ExtRes = nullptr;
555 Value *TruncRes = nullptr;
556
557 if (isSigned(I)) {
558 ExtOp1 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
559 ExtOp2 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 2), DestTy: I32Ty);
560 } else {
561 ExtOp1 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
562 ExtOp2 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 2), DestTy: I32Ty);
563 }
564 ExtRes = Builder.CreateSelect(C: I.getOperand(i_nocapture: 0), True: ExtOp1, False: ExtOp2);
565 TruncRes = Builder.CreateTrunc(V: ExtRes, DestTy: I.getType());
566
567 I.replaceAllUsesWith(V: TruncRes);
568 I.eraseFromParent();
569
570 return true;
571}
572
573bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
574 IntrinsicInst &I) const {
575 assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
576 "I must be bitreverse intrinsic");
577 assert(needsPromotionToI32(I.getType()) &&
578 "I does not need promotion to i32");
579
580 IRBuilder<> Builder(&I);
581 Builder.SetCurrentDebugLocation(I.getDebugLoc());
582
583 Type *I32Ty = getI32Ty(B&: Builder, T: I.getType());
584 Value *ExtOp = Builder.CreateZExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
585 Value *ExtRes =
586 Builder.CreateIntrinsic(ID: Intrinsic::bitreverse, Types: {I32Ty}, Args: {ExtOp});
587 Value *LShrOp =
588 Builder.CreateLShr(LHS: ExtRes, RHS: 32 - getBaseElementBitWidth(T: I.getType()));
589 Value *TruncRes =
590 Builder.CreateTrunc(V: LShrOp, DestTy: I.getType());
591
592 I.replaceAllUsesWith(V: TruncRes);
593 I.eraseFromParent();
594
595 return true;
596}
597
598unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
599 return computeKnownBits(V: Op, DL, AC).countMaxActiveBits();
600}
601
602unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
603 return ComputeMaxSignificantBits(Op, DL, AC);
604}
605
606static void extractValues(IRBuilder<> &Builder,
607 SmallVectorImpl<Value *> &Values, Value *V) {
608 auto *VT = dyn_cast<FixedVectorType>(Val: V->getType());
609 if (!VT) {
610 Values.push_back(Elt: V);
611 return;
612 }
613
614 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
615 Values.push_back(Elt: Builder.CreateExtractElement(Vec: V, Idx: I));
616}
617
618static Value *insertValues(IRBuilder<> &Builder,
619 Type *Ty,
620 SmallVectorImpl<Value *> &Values) {
621 if (!Ty->isVectorTy()) {
622 assert(Values.size() == 1);
623 return Values[0];
624 }
625
626 Value *NewVal = PoisonValue::get(T: Ty);
627 for (int I = 0, E = Values.size(); I != E; ++I)
628 NewVal = Builder.CreateInsertElement(Vec: NewVal, NewElt: Values[I], Idx: I);
629
630 return NewVal;
631}
632
633bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
634 if (I.getOpcode() != Instruction::Mul)
635 return false;
636
637 Type *Ty = I.getType();
638 unsigned Size = Ty->getScalarSizeInBits();
639 if (Size <= 16 && ST.has16BitInsts())
640 return false;
641
642 // Prefer scalar if this could be s_mul_i32
643 if (UA.isUniform(I: &I))
644 return false;
645
646 Value *LHS = I.getOperand(i_nocapture: 0);
647 Value *RHS = I.getOperand(i_nocapture: 1);
648 IRBuilder<> Builder(&I);
649 Builder.SetCurrentDebugLocation(I.getDebugLoc());
650
651 unsigned LHSBits = 0, RHSBits = 0;
652 bool IsSigned = false;
653
654 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(Op: LHS)) <= 24 &&
655 (RHSBits = numBitsUnsigned(Op: RHS)) <= 24) {
656 IsSigned = false;
657
658 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(Op: LHS)) <= 24 &&
659 (RHSBits = numBitsSigned(Op: RHS)) <= 24) {
660 IsSigned = true;
661
662 } else
663 return false;
664
665 SmallVector<Value *, 4> LHSVals;
666 SmallVector<Value *, 4> RHSVals;
667 SmallVector<Value *, 4> ResultVals;
668 extractValues(Builder, Values&: LHSVals, V: LHS);
669 extractValues(Builder, Values&: RHSVals, V: RHS);
670
671 IntegerType *I32Ty = Builder.getInt32Ty();
672 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
673 Type *DstTy = LHSVals[0]->getType();
674
675 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
676 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(V: LHSVals[I], DestTy: I32Ty)
677 : Builder.CreateZExtOrTrunc(V: LHSVals[I], DestTy: I32Ty);
678 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(V: RHSVals[I], DestTy: I32Ty)
679 : Builder.CreateZExtOrTrunc(V: RHSVals[I], DestTy: I32Ty);
680 Intrinsic::ID ID =
681 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
682 Value *Result = Builder.CreateIntrinsic(ID, Types: {IntrinTy}, Args: {LHS, RHS});
683 Result = IsSigned ? Builder.CreateSExtOrTrunc(V: Result, DestTy: DstTy)
684 : Builder.CreateZExtOrTrunc(V: Result, DestTy: DstTy);
685 ResultVals.push_back(Elt: Result);
686 }
687
688 Value *NewVal = insertValues(Builder, Ty, Values&: ResultVals);
689 NewVal->takeName(V: &I);
690 I.replaceAllUsesWith(V: NewVal);
691 I.eraseFromParent();
692
693 return true;
694}
695
696// Find a select instruction, which may have been casted. This is mostly to deal
697// with cases where i16 selects were promoted here to i32.
698static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
699 Cast = nullptr;
700 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: V))
701 return Sel;
702
703 if ((Cast = dyn_cast<CastInst>(Val: V))) {
704 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: Cast->getOperand(i_nocapture: 0)))
705 return Sel;
706 }
707
708 return nullptr;
709}
710
711bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
712 // Don't do this unless the old select is going away. We want to eliminate the
713 // binary operator, not replace a binop with a select.
714 int SelOpNo = 0;
715
716 CastInst *CastOp;
717
718 // TODO: Should probably try to handle some cases with multiple
719 // users. Duplicating the select may be profitable for division.
720 SelectInst *Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 0), Cast&: CastOp);
721 if (!Sel || !Sel->hasOneUse()) {
722 SelOpNo = 1;
723 Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 1), Cast&: CastOp);
724 }
725
726 if (!Sel || !Sel->hasOneUse())
727 return false;
728
729 Constant *CT = dyn_cast<Constant>(Val: Sel->getTrueValue());
730 Constant *CF = dyn_cast<Constant>(Val: Sel->getFalseValue());
731 Constant *CBO = dyn_cast<Constant>(Val: BO.getOperand(i_nocapture: SelOpNo ^ 1));
732 if (!CBO || !CT || !CF)
733 return false;
734
735 if (CastOp) {
736 if (!CastOp->hasOneUse())
737 return false;
738 CT = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CT, DestTy: BO.getType(), DL);
739 CF = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CF, DestTy: BO.getType(), DL);
740 }
741
742 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
743 // need to handle divisions here.
744 Constant *FoldedT =
745 SelOpNo ? ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CT, DL)
746 : ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CT, RHS: CBO, DL);
747 if (!FoldedT || isa<ConstantExpr>(Val: FoldedT))
748 return false;
749
750 Constant *FoldedF =
751 SelOpNo ? ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CF, DL)
752 : ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CF, RHS: CBO, DL);
753 if (!FoldedF || isa<ConstantExpr>(Val: FoldedF))
754 return false;
755
756 IRBuilder<> Builder(&BO);
757 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
758 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Val: &BO))
759 Builder.setFastMathFlags(FPOp->getFastMathFlags());
760
761 Value *NewSelect = Builder.CreateSelect(C: Sel->getCondition(),
762 True: FoldedT, False: FoldedF);
763 NewSelect->takeName(V: &BO);
764 BO.replaceAllUsesWith(V: NewSelect);
765 BO.eraseFromParent();
766 if (CastOp)
767 CastOp->eraseFromParent();
768 Sel->eraseFromParent();
769 return true;
770}
771
772std::pair<Value *, Value *>
773AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
774 Value *Src) const {
775 Type *Ty = Src->getType();
776 Value *Frexp = Builder.CreateIntrinsic(ID: Intrinsic::frexp,
777 Types: {Ty, Builder.getInt32Ty()}, Args: Src);
778 Value *FrexpMant = Builder.CreateExtractValue(Agg: Frexp, Idxs: {0});
779
780 // Bypass the bug workaround for the exponent result since it doesn't matter.
781 // TODO: Does the bug workaround even really need to consider the exponent
782 // result? It's unspecified by the spec.
783
784 Value *FrexpExp =
785 ST.hasFractBug()
786 ? Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_frexp_exp,
787 Types: {Builder.getInt32Ty(), Ty}, Args: Src)
788 : Builder.CreateExtractValue(Agg: Frexp, Idxs: {1});
789 return {FrexpMant, FrexpExp};
790}
791
792/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
793Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
794 Value *Src,
795 bool IsNegative) const {
796 // Same as for 1.0, but expand the sign out of the constant.
797 // -1.0 / x -> rcp (fneg x)
798 if (IsNegative)
799 Src = Builder.CreateFNeg(V: Src);
800
801 // The rcp instruction doesn't support denormals, so scale the input
802 // out of the denormal range and convert at the end.
803 //
804 // Expand as 2^-n * (1.0 / (x * 2^n))
805
806 // TODO: Skip scaling if input is known never denormal and the input
807 // range won't underflow to denormal. The hard part is knowing the
808 // result. We need a range check, the result could be denormal for
809 // 0x1p+126 < den <= 0x1p+127.
810 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
811 Value *ScaleFactor = Builder.CreateNeg(V: FrexpExp);
812 Value *Rcp = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: FrexpMant);
813 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Rcp, ScaleFactor});
814}
815
816/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
817Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
818 Value *RHS,
819 FastMathFlags FMF) const {
820 // If we have have to work around the fract/frexp bug, we're worse off than
821 // using the fdiv.fast expansion. The full safe expansion is faster if we have
822 // fast FMA.
823 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
824 (!FMF.noNaNs() || !FMF.noInfs()))
825 return nullptr;
826
827 // We're scaling the LHS to avoid a denormal input, and scale the denominator
828 // to avoid large values underflowing the result.
829 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, Src: RHS);
830
831 Value *Rcp =
832 Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: FrexpMantRHS);
833
834 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, Src: LHS);
835 Value *Mul = Builder.CreateFMul(L: FrexpMantLHS, R: Rcp);
836
837 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
838 // result.
839 Value *ExpDiff = Builder.CreateSub(LHS: FrexpExpLHS, RHS: FrexpExpRHS);
840 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Mul, ExpDiff});
841}
842
843/// Emit a sqrt that handles denormals and is accurate to 2ulp.
844Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
845 Value *Src,
846 FastMathFlags FMF) const {
847 Type *Ty = Src->getType();
848 APFloat SmallestNormal =
849 APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
850 Value *NeedScale =
851 Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
852
853 ConstantInt *Zero = Builder.getInt32(C: 0);
854 Value *InputScaleFactor =
855 Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: 32), False: Zero);
856
857 Value *Scaled = Builder.CreateCall(Callee: getLdexpF32(), Args: {Src, InputScaleFactor});
858
859 Value *Sqrt = Builder.CreateCall(Callee: getSqrtF32(), Args: Scaled);
860
861 Value *OutputScaleFactor =
862 Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: -16), False: Zero);
863 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Sqrt, OutputScaleFactor});
864}
865
866/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
867static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
868 bool IsNegative) {
869 // bool need_scale = x < 0x1p-126f;
870 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
871 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
872 // rsq(x * input_scale) * output_scale;
873
874 Type *Ty = Src->getType();
875 APFloat SmallestNormal =
876 APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
877 Value *NeedScale =
878 Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
879 Constant *One = ConstantFP::get(Ty, V: 1.0);
880 Constant *InputScale = ConstantFP::get(Ty, V: 0x1.0p+24);
881 Constant *OutputScale =
882 ConstantFP::get(Ty, V: IsNegative ? -0x1.0p+12 : 0x1.0p+12);
883
884 Value *InputScaleFactor = Builder.CreateSelect(C: NeedScale, True: InputScale, False: One);
885
886 Value *ScaledInput = Builder.CreateFMul(L: Src, R: InputScaleFactor);
887 Value *Rsq = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: ScaledInput);
888 Value *OutputScaleFactor = Builder.CreateSelect(
889 C: NeedScale, True: OutputScale, False: IsNegative ? ConstantFP::get(Ty, V: -1.0) : One);
890
891 return Builder.CreateFMul(L: Rsq, R: OutputScaleFactor);
892}
893
894bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
895 FastMathFlags DivFMF,
896 FastMathFlags SqrtFMF) const {
897 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
898 if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
899 return false;
900
901 // v_rsq_f32 gives 1ulp
902 return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
903 SqrtOp->getFPAccuracy() >= 1.0f;
904}
905
906Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
907 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
908 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
909 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
910 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
911
912 // rsq_f16 is accurate to 0.51 ulp.
913 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
914 // rsq_f64 is never accurate.
915 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num);
916 if (!CLHS)
917 return nullptr;
918
919 assert(Den->getType()->isFloatTy());
920
921 bool IsNegative = false;
922
923 // TODO: Handle other numerator values with arcp.
924 if (CLHS->isExactlyValue(V: 1.0) || (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
925 // Add in the sqrt flags.
926 IRBuilder<>::FastMathFlagGuard Guard(Builder);
927 Builder.setFastMathFlags(DivFMF | SqrtFMF);
928
929 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
930 canIgnoreDenormalInput(V: Den, CtxI)) {
931 Value *Result = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rsq, V: Den);
932 // -1.0 / sqrt(x) -> fneg(rsq(x))
933 return IsNegative ? Builder.CreateFNeg(V: Result) : Result;
934 }
935
936 return emitRsqIEEE1ULP(Builder, Src: Den, IsNegative);
937 }
938
939 return nullptr;
940}
941
942// Optimize fdiv with rcp:
943//
944// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
945// allowed with unsafe-fp-math or afn.
946//
947// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
948Value *
949AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
950 Value *Den, FastMathFlags FMF,
951 const Instruction *CtxI) const {
952 // rcp_f16 is accurate to 0.51 ulp.
953 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
954 // rcp_f64 is never accurate.
955 assert(Den->getType()->isFloatTy());
956
957 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num)) {
958 bool IsNegative = false;
959 if (CLHS->isExactlyValue(V: 1.0) ||
960 (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
961 Value *Src = Den;
962
963 if (HasFP32DenormalFlush || FMF.approxFunc()) {
964 // -1.0 / x -> 1.0 / fneg(x)
965 if (IsNegative)
966 Src = Builder.CreateFNeg(V: Src);
967
968 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
969 // the CI documentation has a worst case error of 1 ulp.
970 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
971 // to use it as long as we aren't trying to use denormals.
972 //
973 // v_rcp_f16 and v_rsq_f16 DO support denormals.
974
975 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
976 // insert rsq intrinsic here.
977
978 // 1.0 / x -> rcp(x)
979 return Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: Src);
980 }
981
982 // TODO: If the input isn't denormal, and we know the input exponent isn't
983 // big enough to introduce a denormal we can avoid the scaling.
984 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
985 }
986 }
987
988 if (FMF.allowReciprocal()) {
989 // x / y -> x * (1.0 / y)
990
991 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
992 // will never underflow.
993 if (HasFP32DenormalFlush || FMF.approxFunc()) {
994 Value *Recip = Builder.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: Den);
995 return Builder.CreateFMul(L: Num, R: Recip);
996 }
997
998 Value *Recip = emitRcpIEEE1ULP(Builder, Src: Den, IsNegative: false);
999 return Builder.CreateFMul(L: Num, R: Recip);
1000 }
1001
1002 return nullptr;
1003}
1004
1005// optimize with fdiv.fast:
1006//
1007// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1008//
1009// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1010//
1011// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
1012Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
1013 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
1014 // fdiv.fast can achieve 2.5 ULP accuracy.
1015 if (ReqdAccuracy < 2.5f)
1016 return nullptr;
1017
1018 // Only have fdiv.fast for f32.
1019 assert(Den->getType()->isFloatTy());
1020
1021 bool NumIsOne = false;
1022 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Val: Num)) {
1023 if (CNum->isExactlyValue(V: +1.0) || CNum->isExactlyValue(V: -1.0))
1024 NumIsOne = true;
1025 }
1026
1027 // fdiv does not support denormals. But 1.0/x is always fine to use it.
1028 //
1029 // TODO: This works for any value with a specific known exponent range, don't
1030 // just limit to constant 1.
1031 if (!HasFP32DenormalFlush && !NumIsOne)
1032 return nullptr;
1033
1034 return Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_fdiv_fast, Args: {Num, Den});
1035}
1036
1037Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
1038 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
1039 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
1040 float ReqdDivAccuracy) const {
1041 if (RsqOp) {
1042 Value *Rsq =
1043 optimizeWithRsq(Builder, Num, Den: RsqOp, DivFMF, SqrtFMF, CtxI: FDivInst);
1044 if (Rsq)
1045 return Rsq;
1046 }
1047
1048 Value *Rcp = optimizeWithRcp(Builder, Num, Den, FMF: DivFMF, CtxI: FDivInst);
1049 if (Rcp)
1050 return Rcp;
1051
1052 // In the basic case fdiv_fast has the same instruction count as the frexp div
1053 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
1054 // potentially be fused into a user. Also, materialization of the constants
1055 // can be reused for multiple instances.
1056 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdAccuracy: ReqdDivAccuracy);
1057 if (FDivFast)
1058 return FDivFast;
1059
1060 return emitFrexpDiv(Builder, LHS: Num, RHS: Den, FMF: DivFMF);
1061}
1062
1063// Optimizations is performed based on fpmath, fast math flags as well as
1064// denormals to optimize fdiv with either rcp or fdiv.fast.
1065//
1066// With rcp:
1067// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
1068// allowed with unsafe-fp-math or afn.
1069//
1070// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
1071//
1072// With fdiv.fast:
1073// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1074//
1075// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1076//
1077// NOTE: rcp is the preference in cases that both are legal.
1078bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
1079 if (DisableFDivExpand)
1080 return false;
1081
1082 Type *Ty = FDiv.getType()->getScalarType();
1083 if (!Ty->isFloatTy())
1084 return false;
1085
1086 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
1087 // expansion around them in codegen. f16 is good enough to always use.
1088
1089 const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &FDiv);
1090 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
1091 const float ReqdAccuracy = FPOp->getFPAccuracy();
1092
1093 FastMathFlags SqrtFMF;
1094
1095 Value *Num = FDiv.getOperand(i_nocapture: 0);
1096 Value *Den = FDiv.getOperand(i_nocapture: 1);
1097
1098 Value *RsqOp = nullptr;
1099 auto *DenII = dyn_cast<IntrinsicInst>(Val: Den);
1100 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
1101 DenII->hasOneUse()) {
1102 const auto *SqrtOp = cast<FPMathOperator>(Val: DenII);
1103 SqrtFMF = SqrtOp->getFastMathFlags();
1104 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
1105 RsqOp = SqrtOp->getOperand(i: 0);
1106 }
1107
1108 // Inaccurate rcp is allowed with unsafe-fp-math or afn.
1109 //
1110 // Defer to codegen to handle this.
1111 //
1112 // TODO: Decide on an interpretation for interactions between afn + arcp +
1113 // !fpmath, and make it consistent between here and codegen. For now, defer
1114 // expansion of afn to codegen. The current interpretation is so aggressive we
1115 // don't need any pre-consideration here when we have better information. A
1116 // more conservative interpretation could use handling here.
1117 const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
1118 if (!RsqOp && AllowInaccurateRcp)
1119 return false;
1120
1121 // Defer the correct implementations to codegen.
1122 if (ReqdAccuracy < 1.0f)
1123 return false;
1124
1125 IRBuilder<> Builder(FDiv.getParent(), std::next(x: FDiv.getIterator()));
1126 Builder.setFastMathFlags(DivFMF);
1127 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
1128
1129 SmallVector<Value *, 4> NumVals;
1130 SmallVector<Value *, 4> DenVals;
1131 SmallVector<Value *, 4> RsqDenVals;
1132 extractValues(Builder, Values&: NumVals, V: Num);
1133 extractValues(Builder, Values&: DenVals, V: Den);
1134
1135 if (RsqOp)
1136 extractValues(Builder, Values&: RsqDenVals, V: RsqOp);
1137
1138 SmallVector<Value *, 4> ResultVals(NumVals.size());
1139 for (int I = 0, E = NumVals.size(); I != E; ++I) {
1140 Value *NumElt = NumVals[I];
1141 Value *DenElt = DenVals[I];
1142 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
1143
1144 Value *NewElt =
1145 visitFDivElement(Builder, Num: NumElt, Den: DenElt, DivFMF, SqrtFMF, RsqOp: RsqDenElt,
1146 FDivInst: cast<Instruction>(Val: FPOp), ReqdDivAccuracy: ReqdAccuracy);
1147 if (!NewElt) {
1148 // Keep the original, but scalarized.
1149
1150 // This has the unfortunate side effect of sometimes scalarizing when
1151 // we're not going to do anything.
1152 NewElt = Builder.CreateFDiv(L: NumElt, R: DenElt);
1153 if (auto *NewEltInst = dyn_cast<Instruction>(Val: NewElt))
1154 NewEltInst->copyMetadata(SrcInst: FDiv);
1155 }
1156
1157 ResultVals[I] = NewElt;
1158 }
1159
1160 Value *NewVal = insertValues(Builder, Ty: FDiv.getType(), Values&: ResultVals);
1161
1162 if (NewVal) {
1163 FDiv.replaceAllUsesWith(V: NewVal);
1164 NewVal->takeName(V: &FDiv);
1165 RecursivelyDeleteTriviallyDeadInstructions(V: &FDiv, TLI);
1166 }
1167
1168 return true;
1169}
1170
1171static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1172 Value *LHS, Value *RHS) {
1173 Type *I32Ty = Builder.getInt32Ty();
1174 Type *I64Ty = Builder.getInt64Ty();
1175
1176 Value *LHS_EXT64 = Builder.CreateZExt(V: LHS, DestTy: I64Ty);
1177 Value *RHS_EXT64 = Builder.CreateZExt(V: RHS, DestTy: I64Ty);
1178 Value *MUL64 = Builder.CreateMul(LHS: LHS_EXT64, RHS: RHS_EXT64);
1179 Value *Lo = Builder.CreateTrunc(V: MUL64, DestTy: I32Ty);
1180 Value *Hi = Builder.CreateLShr(LHS: MUL64, RHS: Builder.getInt64(C: 32));
1181 Hi = Builder.CreateTrunc(V: Hi, DestTy: I32Ty);
1182 return std::pair(Lo, Hi);
1183}
1184
1185static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1186 return getMul64(Builder, LHS, RHS).second;
1187}
1188
1189/// Figure out how many bits are really needed for this division.
1190/// \p MaxDivBits is an optimization hint to bypass the second
1191/// ComputeNumSignBits/computeKnownBits call if the first one is
1192/// insufficient.
1193unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1194 Value *Den,
1195 unsigned MaxDivBits,
1196 bool IsSigned) const {
1197 assert(Num->getType()->getScalarSizeInBits() ==
1198 Den->getType()->getScalarSizeInBits());
1199 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1200 if (IsSigned) {
1201 unsigned RHSSignBits = ComputeNumSignBits(Op: Den, DL, AC, CxtI: &I);
1202 // A sign bit needs to be reserved for shrinking.
1203 unsigned DivBits = SSBits - RHSSignBits + 1;
1204 if (DivBits > MaxDivBits)
1205 return SSBits;
1206
1207 unsigned LHSSignBits = ComputeNumSignBits(Op: Num, DL, AC, CxtI: &I);
1208
1209 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
1210 DivBits = SSBits - SignBits + 1;
1211 return DivBits;
1212 }
1213
1214 // All bits are used for unsigned division for Num or Den in range
1215 // (SignedMax, UnsignedMax].
1216 KnownBits Known = computeKnownBits(V: Den, DL, AC, CxtI: &I);
1217 if (Known.isNegative() || !Known.isNonNegative())
1218 return SSBits;
1219 unsigned RHSSignBits = Known.countMinLeadingZeros();
1220 unsigned DivBits = SSBits - RHSSignBits;
1221 if (DivBits > MaxDivBits)
1222 return SSBits;
1223
1224 Known = computeKnownBits(V: Num, DL, AC, CxtI: &I);
1225 if (Known.isNegative() || !Known.isNonNegative())
1226 return SSBits;
1227 unsigned LHSSignBits = Known.countMinLeadingZeros();
1228
1229 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
1230 DivBits = SSBits - SignBits;
1231 return DivBits;
1232}
1233
1234// The fractional part of a float is enough to accurately represent up to
1235// a 24-bit signed integer.
1236Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1237 BinaryOperator &I, Value *Num,
1238 Value *Den, bool IsDiv,
1239 bool IsSigned) const {
1240 unsigned DivBits = getDivNumBits(I, Num, Den, MaxDivBits: 24, IsSigned);
1241 if (DivBits > 24)
1242 return nullptr;
1243 return expandDivRem24Impl(Builder, I, Num, Den, NumBits: DivBits, IsDiv, IsSigned);
1244}
1245
1246Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1247 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1248 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1249 Type *I32Ty = Builder.getInt32Ty();
1250 Num = Builder.CreateTrunc(V: Num, DestTy: I32Ty);
1251 Den = Builder.CreateTrunc(V: Den, DestTy: I32Ty);
1252
1253 Type *F32Ty = Builder.getFloatTy();
1254 ConstantInt *One = Builder.getInt32(C: 1);
1255 Value *JQ = One;
1256
1257 if (IsSigned) {
1258 // char|short jq = ia ^ ib;
1259 JQ = Builder.CreateXor(LHS: Num, RHS: Den);
1260
1261 // jq = jq >> (bitsize - 2)
1262 JQ = Builder.CreateAShr(LHS: JQ, RHS: Builder.getInt32(C: 30));
1263
1264 // jq = jq | 0x1
1265 JQ = Builder.CreateOr(LHS: JQ, RHS: One);
1266 }
1267
1268 // int ia = (int)LHS;
1269 Value *IA = Num;
1270
1271 // int ib, (int)RHS;
1272 Value *IB = Den;
1273
1274 // float fa = (float)ia;
1275 Value *FA = IsSigned ? Builder.CreateSIToFP(V: IA, DestTy: F32Ty)
1276 : Builder.CreateUIToFP(V: IA, DestTy: F32Ty);
1277
1278 // float fb = (float)ib;
1279 Value *FB = IsSigned ? Builder.CreateSIToFP(V: IB,DestTy: F32Ty)
1280 : Builder.CreateUIToFP(V: IB,DestTy: F32Ty);
1281
1282 Value *RCP = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_rcp,
1283 Types: Builder.getFloatTy(), Args: {FB});
1284 Value *FQM = Builder.CreateFMul(L: FA, R: RCP);
1285
1286 // fq = trunc(fqm);
1287 CallInst *FQ = Builder.CreateUnaryIntrinsic(ID: Intrinsic::trunc, V: FQM);
1288 FQ->copyFastMathFlags(FMF: Builder.getFastMathFlags());
1289
1290 // float fqneg = -fq;
1291 Value *FQNeg = Builder.CreateFNeg(V: FQ);
1292
1293 // float fr = mad(fqneg, fb, fa);
1294 auto FMAD = !ST.hasMadMacF32Insts()
1295 ? Intrinsic::fma
1296 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1297 Value *FR = Builder.CreateIntrinsic(ID: FMAD,
1298 Types: {FQNeg->getType()}, Args: {FQNeg, FB, FA}, FMFSource: FQ);
1299
1300 // int iq = (int)fq;
1301 Value *IQ = IsSigned ? Builder.CreateFPToSI(V: FQ, DestTy: I32Ty)
1302 : Builder.CreateFPToUI(V: FQ, DestTy: I32Ty);
1303
1304 // fr = fabs(fr);
1305 FR = Builder.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: FR, FMFSource: FQ);
1306
1307 // fb = fabs(fb);
1308 FB = Builder.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: FB, FMFSource: FQ);
1309
1310 // int cv = fr >= fb;
1311 Value *CV = Builder.CreateFCmpOGE(LHS: FR, RHS: FB);
1312
1313 // jq = (cv ? jq : 0);
1314 JQ = Builder.CreateSelect(C: CV, True: JQ, False: Builder.getInt32(C: 0));
1315
1316 // dst = iq + jq;
1317 Value *Div = Builder.CreateAdd(LHS: IQ, RHS: JQ);
1318
1319 Value *Res = Div;
1320 if (!IsDiv) {
1321 // Rem needs compensation, it's easier to recompute it
1322 Value *Rem = Builder.CreateMul(LHS: Div, RHS: Den);
1323 Res = Builder.CreateSub(LHS: Num, RHS: Rem);
1324 }
1325
1326 if (DivBits != 0 && DivBits < 32) {
1327 // Extend in register from the number of bits this divide really is.
1328 if (IsSigned) {
1329 int InRegBits = 32 - DivBits;
1330
1331 Res = Builder.CreateShl(LHS: Res, RHS: InRegBits);
1332 Res = Builder.CreateAShr(LHS: Res, RHS: InRegBits);
1333 } else {
1334 ConstantInt *TruncMask
1335 = Builder.getInt32(C: (UINT64_C(1) << DivBits) - 1);
1336 Res = Builder.CreateAnd(LHS: Res, RHS: TruncMask);
1337 }
1338 }
1339
1340 return Res;
1341}
1342
1343// Try to recognize special cases the DAG will emit special, better expansions
1344// than the general expansion we do here.
1345
1346// TODO: It would be better to just directly handle those optimizations here.
1347bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1348 Value *Num,
1349 Value *Den) const {
1350 if (Constant *C = dyn_cast<Constant>(Val: Den)) {
1351 // Arbitrary constants get a better expansion as long as a wider mulhi is
1352 // legal.
1353 if (C->getType()->getScalarSizeInBits() <= 32)
1354 return true;
1355
1356 // TODO: Sdiv check for not exact for some reason.
1357
1358 // If there's no wider mulhi, there's only a better expansion for powers of
1359 // two.
1360 // TODO: Should really know for each vector element.
1361 if (isKnownToBeAPowerOfTwo(V: C, DL, OrZero: true, AC, CxtI: &I, DT))
1362 return true;
1363
1364 return false;
1365 }
1366
1367 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Val: Den)) {
1368 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1369 if (BinOpDen->getOpcode() == Instruction::Shl &&
1370 isa<Constant>(Val: BinOpDen->getOperand(i_nocapture: 0)) &&
1371 isKnownToBeAPowerOfTwo(V: BinOpDen->getOperand(i_nocapture: 0), DL, OrZero: true, AC, CxtI: &I, DT)) {
1372 return true;
1373 }
1374 }
1375
1376 return false;
1377}
1378
1379static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1380 // Check whether the sign can be determined statically.
1381 KnownBits Known = computeKnownBits(V, DL);
1382 if (Known.isNegative())
1383 return Constant::getAllOnesValue(Ty: V->getType());
1384 if (Known.isNonNegative())
1385 return Constant::getNullValue(Ty: V->getType());
1386 return Builder.CreateAShr(LHS: V, RHS: Builder.getInt32(C: 31));
1387}
1388
1389Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1390 BinaryOperator &I, Value *X,
1391 Value *Y) const {
1392 Instruction::BinaryOps Opc = I.getOpcode();
1393 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1394 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1395
1396 FastMathFlags FMF;
1397 FMF.setFast();
1398 Builder.setFastMathFlags(FMF);
1399
1400 if (divHasSpecialOptimization(I, Num: X, Den: Y))
1401 return nullptr; // Keep it for later optimization.
1402
1403 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1404 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1405
1406 Type *Ty = X->getType();
1407 Type *I32Ty = Builder.getInt32Ty();
1408 Type *F32Ty = Builder.getFloatTy();
1409
1410 if (Ty->getScalarSizeInBits() != 32) {
1411 if (IsSigned) {
1412 X = Builder.CreateSExtOrTrunc(V: X, DestTy: I32Ty);
1413 Y = Builder.CreateSExtOrTrunc(V: Y, DestTy: I32Ty);
1414 } else {
1415 X = Builder.CreateZExtOrTrunc(V: X, DestTy: I32Ty);
1416 Y = Builder.CreateZExtOrTrunc(V: Y, DestTy: I32Ty);
1417 }
1418 }
1419
1420 if (Value *Res = expandDivRem24(Builder, I, Num: X, Den: Y, IsDiv, IsSigned)) {
1421 return IsSigned ? Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty) :
1422 Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
1423 }
1424
1425 ConstantInt *Zero = Builder.getInt32(C: 0);
1426 ConstantInt *One = Builder.getInt32(C: 1);
1427
1428 Value *Sign = nullptr;
1429 if (IsSigned) {
1430 Value *SignX = getSign32(V: X, Builder, DL);
1431 Value *SignY = getSign32(V: Y, Builder, DL);
1432 // Remainder sign is the same as LHS
1433 Sign = IsDiv ? Builder.CreateXor(LHS: SignX, RHS: SignY) : SignX;
1434
1435 X = Builder.CreateAdd(LHS: X, RHS: SignX);
1436 Y = Builder.CreateAdd(LHS: Y, RHS: SignY);
1437
1438 X = Builder.CreateXor(LHS: X, RHS: SignX);
1439 Y = Builder.CreateXor(LHS: Y, RHS: SignY);
1440 }
1441
1442 // The algorithm here is based on ideas from "Software Integer Division", Tom
1443 // Rodeheffer, August 2008.
1444 //
1445 // unsigned udiv(unsigned x, unsigned y) {
1446 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1447 // // that this is a lower bound on inv(y), even if some of the calculations
1448 // // round up.
1449 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1450 //
1451 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1452 // // Empirically this is guaranteed to give a "two-y" lower bound on
1453 // // inv(y).
1454 // z += umulh(z, -y * z);
1455 //
1456 // // Quotient/remainder estimate.
1457 // unsigned q = umulh(x, z);
1458 // unsigned r = x - q * y;
1459 //
1460 // // Two rounds of quotient/remainder refinement.
1461 // if (r >= y) {
1462 // ++q;
1463 // r -= y;
1464 // }
1465 // if (r >= y) {
1466 // ++q;
1467 // r -= y;
1468 // }
1469 //
1470 // return q;
1471 // }
1472
1473 // Initial estimate of inv(y).
1474 Value *FloatY = Builder.CreateUIToFP(V: Y, DestTy: F32Ty);
1475 Value *RcpY = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_rcp, Types: F32Ty, Args: {FloatY});
1476 Constant *Scale = ConstantFP::get(Ty: F32Ty, V: llvm::bit_cast<float>(from: 0x4F7FFFFE));
1477 Value *ScaledY = Builder.CreateFMul(L: RcpY, R: Scale);
1478 Value *Z = Builder.CreateFPToUI(V: ScaledY, DestTy: I32Ty);
1479
1480 // One round of UNR.
1481 Value *NegY = Builder.CreateSub(LHS: Zero, RHS: Y);
1482 Value *NegYZ = Builder.CreateMul(LHS: NegY, RHS: Z);
1483 Z = Builder.CreateAdd(LHS: Z, RHS: getMulHu(Builder, LHS: Z, RHS: NegYZ));
1484
1485 // Quotient/remainder estimate.
1486 Value *Q = getMulHu(Builder, LHS: X, RHS: Z);
1487 Value *R = Builder.CreateSub(LHS: X, RHS: Builder.CreateMul(LHS: Q, RHS: Y));
1488
1489 // First quotient/remainder refinement.
1490 Value *Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
1491 if (IsDiv)
1492 Q = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
1493 R = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);
1494
1495 // Second quotient/remainder refinement.
1496 Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
1497 Value *Res;
1498 if (IsDiv)
1499 Res = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
1500 else
1501 Res = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);
1502
1503 if (IsSigned) {
1504 Res = Builder.CreateXor(LHS: Res, RHS: Sign);
1505 Res = Builder.CreateSub(LHS: Res, RHS: Sign);
1506 Res = Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty);
1507 } else {
1508 Res = Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
1509 }
1510 return Res;
1511}
1512
1513Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1514 BinaryOperator &I, Value *Num,
1515 Value *Den) const {
1516 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1517 return nullptr; // Keep it for later optimization.
1518
1519 Instruction::BinaryOps Opc = I.getOpcode();
1520
1521 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1522 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1523
1524 unsigned NumDivBits = getDivNumBits(I, Num, Den, MaxDivBits: 32, IsSigned);
1525 if (NumDivBits > 32)
1526 return nullptr;
1527
1528 Value *Narrowed = nullptr;
1529 if (NumDivBits <= 24) {
1530 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, DivBits: NumDivBits,
1531 IsDiv, IsSigned);
1532 } else if (NumDivBits <= 32) {
1533 Narrowed = expandDivRem32(Builder, I, X: Num, Y: Den);
1534 }
1535
1536 if (Narrowed) {
1537 return IsSigned ? Builder.CreateSExt(V: Narrowed, DestTy: Num->getType()) :
1538 Builder.CreateZExt(V: Narrowed, DestTy: Num->getType());
1539 }
1540
1541 return nullptr;
1542}
1543
1544void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1545 Instruction::BinaryOps Opc = I.getOpcode();
1546 // Do the general expansion.
1547 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1548 expandDivisionUpTo64Bits(Div: &I);
1549 return;
1550 }
1551
1552 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1553 expandRemainderUpTo64Bits(Rem: &I);
1554 return;
1555 }
1556
1557 llvm_unreachable("not a division");
1558}
1559
1560/*
1561 This can cause an inconsistency for non-byte loads, for example:
1562```
1563 %load = load i1, ptr addrspace(4) %arg, align 4
1564 %zext = zext i1 %load to i64
1565 %add = add i64 %zext, ...
1566```
1567Instead of creating `s_and_b32 s0, s0, 1`,
1568it will create `s_and_b32 s0, s0, 0xff`.
1569We accept this change since the non-byte load assumes the upper bits
1570within the byte are all 0.
1571*/
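// A minimal sketch (value names invented, not taken from an actual test) of
// the rewrite performed below for an i64 add whose known bits fit in 32 bits:
//
//   %r = add i64 %a, %b
//     -->
//   %a.tr = trunc i64 %a to i32
//   %b.tr = trunc i64 %b to i32
//   %r.tr = add i32 %a.tr, %b.tr
//   %r = zext i32 %r.tr to i64
//
// The narrowing is only done when the cost model says the narrow op plus the
// trunc/zext casts are cheaper than the original wide op.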
1572static bool tryNarrowMathIfNoOverflow(Instruction *I,
1573 const SITargetLowering *TLI,
1574 const TargetTransformInfo &TTI,
1575 const DataLayout &DL) {
1576 unsigned Opc = I->getOpcode();
1577 Type *OldType = I->getType();
1578
1579 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1580 return false;
1581
1582 unsigned OrigBit = OldType->getScalarSizeInBits();
1583
1584 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1585 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1586 "Instruction::Mul.");
1587
1588 unsigned MaxBitsNeeded = computeKnownBits(V: I, DL).countMaxActiveBits();
1589
1590 MaxBitsNeeded = std::max<unsigned>(a: bit_ceil(Value: MaxBitsNeeded), b: 8);
1591 Type *NewType = DL.getSmallestLegalIntType(C&: I->getContext(), Width: MaxBitsNeeded);
1592 if (!NewType)
1593 return false;
1594 unsigned NewBit = NewType->getIntegerBitWidth();
1595 if (NewBit >= OrigBit)
1596 return false;
1597 NewType = I->getType()->getWithNewBitWidth(NewBitWidth: NewBit);
1598
1599 // Old cost
1600 InstructionCost OldCost =
1601 TTI.getArithmeticInstrCost(Opcode: Opc, Ty: OldType, CostKind: TTI::TCK_RecipThroughput);
1602 // New cost of new op
1603 InstructionCost NewCost =
1604 TTI.getArithmeticInstrCost(Opcode: Opc, Ty: NewType, CostKind: TTI::TCK_RecipThroughput);
1605 // New cost of narrowing 2 operands (use trunc)
1606 int NumOfNonConstOps = 2;
1607 if (isa<Constant>(Val: I->getOperand(i: 0)) || isa<Constant>(Val: I->getOperand(i: 1))) {
1608 // They cannot both be constant; that case would already have been folded.
1609 NumOfNonConstOps = 1;
1610 }
1611 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Opcode: Instruction::Trunc,
1612 Dst: NewType, Src: OldType,
1613 CCH: TTI.getCastContextHint(I),
1614 CostKind: TTI::TCK_RecipThroughput);
1615 // New cost of zext narrowed result to original type
1616 NewCost +=
1617 TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: OldType, Src: NewType,
1618 CCH: TTI.getCastContextHint(I), CostKind: TTI::TCK_RecipThroughput);
1619 if (NewCost >= OldCost)
1620 return false;
1621
1622 IRBuilder<> Builder(I);
1623 Value *Trunc0 = Builder.CreateTrunc(V: I->getOperand(i: 0), DestTy: NewType);
1624 Value *Trunc1 = Builder.CreateTrunc(V: I->getOperand(i: 1), DestTy: NewType);
1625 Value *Arith =
1626 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opc, LHS: Trunc0, RHS: Trunc1);
1627
1628 Value *Zext = Builder.CreateZExt(V: Arith, DestTy: OldType);
1629 I->replaceAllUsesWith(V: Zext);
1630 I->eraseFromParent();
1631 return true;
1632}
1633
1634bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1635 if (foldBinOpIntoSelect(BO&: I))
1636 return true;
1637
1638 if (ST.has16BitInsts() && needsPromotionToI32(T: I.getType()) &&
1639 UA.isUniform(I: &I) && promoteUniformOpToI32(I))
1640 return true;
1641
1642 if (UseMul24Intrin && replaceMulWithMul24(I))
1643 return true;
1644 if (tryNarrowMathIfNoOverflow(I: &I, TLI: ST.getTargetLowering(),
1645 TTI: TM.getTargetTransformInfo(F), DL))
1646 return true;
1647
1648 bool Changed = false;
1649 Instruction::BinaryOps Opc = I.getOpcode();
1650 Type *Ty = I.getType();
1651 Value *NewDiv = nullptr;
1652 unsigned ScalarSize = Ty->getScalarSizeInBits();
1653
1654 SmallVector<BinaryOperator *, 8> Div64ToExpand;
1655
1656 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1657 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1658 ScalarSize <= 64 &&
1659 !DisableIDivExpand) {
1660 Value *Num = I.getOperand(i_nocapture: 0);
1661 Value *Den = I.getOperand(i_nocapture: 1);
1662 IRBuilder<> Builder(&I);
1663 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1664
1665 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1666 NewDiv = PoisonValue::get(T: VT);
1667
1668 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1669 Value *NumEltN = Builder.CreateExtractElement(Vec: Num, Idx: N);
1670 Value *DenEltN = Builder.CreateExtractElement(Vec: Den, Idx: N);
1671
1672 Value *NewElt;
1673 if (ScalarSize <= 32) {
1674 NewElt = expandDivRem32(Builder, I, X: NumEltN, Y: DenEltN);
1675 if (!NewElt)
1676 NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
1677 } else {
1678 // See if this 64-bit division can be shrunk to 32/24-bits before
1679 // producing the general expansion.
1680 NewElt = shrinkDivRem64(Builder, I, Num: NumEltN, Den: DenEltN);
1681 if (!NewElt) {
1682 // The general 64-bit expansion introduces control flow and doesn't
1683 // return the new value. Just insert a scalar copy and defer
1684 // expanding it.
1685 NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
1686 // CreateBinOp does constant folding. If the operands are constant,
1687 // it will return a Constant instead of a BinaryOperator.
1688 if (auto *NewEltBO = dyn_cast<BinaryOperator>(Val: NewElt))
1689 Div64ToExpand.push_back(Elt: NewEltBO);
1690 }
1691 }
1692
1693 if (auto *NewEltI = dyn_cast<Instruction>(Val: NewElt))
1694 NewEltI->copyIRFlags(V: &I);
1695
1696 NewDiv = Builder.CreateInsertElement(Vec: NewDiv, NewElt, Idx: N);
1697 }
1698 } else {
1699 if (ScalarSize <= 32)
1700 NewDiv = expandDivRem32(Builder, I, X: Num, Y: Den);
1701 else {
1702 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1703 if (!NewDiv)
1704 Div64ToExpand.push_back(Elt: &I);
1705 }
1706 }
1707
1708 if (NewDiv) {
1709 I.replaceAllUsesWith(V: NewDiv);
1710 I.eraseFromParent();
1711 Changed = true;
1712 }
1713 }
1714
1715 if (ExpandDiv64InIR) {
1716 // TODO: We get much worse code in specially handled constant cases.
1717 for (BinaryOperator *Div : Div64ToExpand) {
1718 expandDivRem64(I&: *Div);
1719 FlowChanged = true;
1720 Changed = true;
1721 }
1722 }
1723
1724 return Changed;
1725}
1726
1727bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1728 if (!WidenLoads)
1729 return false;
1730
1731 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1732 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1733 canWidenScalarExtLoad(I)) {
1734 IRBuilder<> Builder(&I);
1735 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1736
1737 Type *I32Ty = Builder.getInt32Ty();
1738 LoadInst *WidenLoad = Builder.CreateLoad(Ty: I32Ty, Ptr: I.getPointerOperand());
1739 WidenLoad->copyMetadata(SrcInst: I);
1740
1741 // If we have range metadata, we need to convert the type, and not make
1742 // assumptions about the high bits.
1743 if (auto *Range = WidenLoad->getMetadata(KindID: LLVMContext::MD_range)) {
1744 ConstantInt *Lower =
1745 mdconst::extract<ConstantInt>(MD: Range->getOperand(I: 0));
1746
1747 if (Lower->isNullValue()) {
1748 WidenLoad->setMetadata(KindID: LLVMContext::MD_range, Node: nullptr);
1749 } else {
1750 Metadata *LowAndHigh[] = {
1751 ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: Lower->getValue().zext(width: 32))),
1752 // Don't make assumptions about the high bits.
1753 ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: 0))
1754 };
1755
1756 WidenLoad->setMetadata(KindID: LLVMContext::MD_range,
1757 Node: MDNode::get(Context&: F.getContext(), MDs: LowAndHigh));
1758 }
1759 }
1760
1761 int TySize = DL.getTypeSizeInBits(Ty: I.getType());
1762 Type *IntNTy = Builder.getIntNTy(N: TySize);
1763 Value *ValTrunc = Builder.CreateTrunc(V: WidenLoad, DestTy: IntNTy);
1764 Value *ValOrig = Builder.CreateBitCast(V: ValTrunc, DestTy: I.getType());
1765 I.replaceAllUsesWith(V: ValOrig);
1766 I.eraseFromParent();
1767 return true;
1768 }
1769
1770 return false;
1771}
1772
1773bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
1774 bool Changed = false;
1775
1776 if (ST.has16BitInsts() && needsPromotionToI32(T: I.getOperand(i_nocapture: 0)->getType()) &&
1777 UA.isUniform(I: &I))
1778 Changed |= promoteUniformOpToI32(I);
1779
1780 return Changed;
1781}
1782
1783bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1784 Value *Cond = I.getCondition();
1785 Value *TrueVal = I.getTrueValue();
1786 Value *FalseVal = I.getFalseValue();
1787 Value *CmpVal;
1788 CmpPredicate Pred;
1789
1790 if (ST.has16BitInsts() && needsPromotionToI32(T: I.getType())) {
1791 if (UA.isUniform(I: &I))
1792 return promoteUniformOpToI32(I);
1793 return false;
1794 }
1795
1796 // Match fract pattern with nan check.
1797 if (!match(V: Cond, P: m_FCmp(Pred, L: m_Value(V&: CmpVal), R: m_NonNaN())))
1798 return false;
1799
1800 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: &I);
1801 if (!FPOp)
1802 return false;
1803
1804 IRBuilder<> Builder(&I);
1805 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1806
1807 auto *IITrue = dyn_cast<IntrinsicInst>(Val: TrueVal);
1808 auto *IIFalse = dyn_cast<IntrinsicInst>(Val: FalseVal);
1809
1810 Value *Fract = nullptr;
1811 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1812 CmpVal == matchFractPat(I&: *IIFalse)) {
1813 // isnan(x) ? x : fract(x)
1814 Fract = applyFractPat(Builder, FractArg: CmpVal);
1815 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1816 CmpVal == matchFractPat(I&: *IITrue)) {
1817 // !isnan(x) ? fract(x) : x
1818 Fract = applyFractPat(Builder, FractArg: CmpVal);
1819 } else
1820 return false;
1821
1822 Fract->takeName(V: &I);
1823 I.replaceAllUsesWith(V: Fract);
1824 RecursivelyDeleteTriviallyDeadInstructions(V: &I, TLI);
1825 return true;
1826}
1827
1828static bool areInSameBB(const Value *A, const Value *B) {
1829 const auto *IA = dyn_cast<Instruction>(Val: A);
1830 const auto *IB = dyn_cast<Instruction>(Val: B);
1831 return IA && IB && IA->getParent() == IB->getParent();
1832}
1833
1834// Helper for breaking large PHIs that returns true when an extractelement on V
1835// is likely to be folded away by the DAG combiner.
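// For illustration (value names invented), an incoming value defined in the
// predecessor block as
//   %v0 = insertelement <4 x float> poison, float %a, i64 0
//   %v1 = insertelement <4 x float> %v0, float %b, i64 1
//   %v2 = insertelement <4 x float> %v1, float %c, i64 2
//   %v3 = insertelement <4 x float> %v2, float %d, i64 3
// is considered interesting: every lane is written by a constant-index
// insertelement, so the extractelements added when breaking the PHI are
// expected to fold away.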
1836static bool isInterestingPHIIncomingValue(const Value *V) {
1837 const auto *FVT = dyn_cast<FixedVectorType>(Val: V->getType());
1838 if (!FVT)
1839 return false;
1840
1841 const Value *CurVal = V;
1842
1843 // Check for insertelements, keeping track of the elements covered.
1844 BitVector EltsCovered(FVT->getNumElements());
1845 while (const auto *IE = dyn_cast<InsertElementInst>(Val: CurVal)) {
1846 const auto *Idx = dyn_cast<ConstantInt>(Val: IE->getOperand(i_nocapture: 2));
1847
1848 // Non-constant or out-of-bounds index -> folding is unlikely.
1849 // The latter is more of a sanity check because canonical IR should just
1850 // have replaced those with poison.
1851 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1852 return false;
1853
1854 const auto *VecSrc = IE->getOperand(i_nocapture: 0);
1855
1856 // If the vector source is another instruction, it must be in the same basic
1857 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1858 // unlikely to be able to do anything interesting here.
1859 if (isa<Instruction>(Val: VecSrc) && !areInSameBB(A: VecSrc, B: IE))
1860 return false;
1861
1862 CurVal = VecSrc;
1863 EltsCovered.set(Idx->getZExtValue());
1864
1865 // All elements covered.
1866 if (EltsCovered.all())
1867 return true;
1868 }
1869
1870 // We either didn't find a single insertelement, or the insertelement chain
1871 // ended before all elements were covered. Check for other interesting values.
1872
1873 // Constants are always interesting because we can just constant fold the
1874 // extractelements.
1875 if (isa<Constant>(Val: CurVal))
1876 return true;
1877
1878 // shufflevector is likely to be profitable if either operand is a constant,
1879 // or if either source is in the same block.
1880 // This is because shufflevector is most often lowered as a series of
1881 // insert/extract elements anyway.
1882 if (const auto *SV = dyn_cast<ShuffleVectorInst>(Val: CurVal)) {
1883 return isa<Constant>(Val: SV->getOperand(i_nocapture: 1)) ||
1884 areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 0)) ||
1885 areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 1));
1886 }
1887
1888 return false;
1889}
1890
1891static void collectPHINodes(const PHINode &I,
1892 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1893 const auto [It, Inserted] = SeenPHIs.insert(Ptr: &I);
1894 if (!Inserted)
1895 return;
1896
1897 for (const Value *Inc : I.incoming_values()) {
1898 if (const auto *PhiInc = dyn_cast<PHINode>(Val: Inc))
1899 collectPHINodes(I: *PhiInc, SeenPHIs);
1900 }
1901
1902 for (const User *U : I.users()) {
1903 if (const auto *PhiU = dyn_cast<PHINode>(Val: U))
1904 collectPHINodes(I: *PhiU, SeenPHIs);
1905 }
1906}
1907
1908bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1909 // Check in the cache first.
1910 if (const auto It = BreakPhiNodesCache.find(Val: &I);
1911 It != BreakPhiNodesCache.end())
1912 return It->second;
1913
1914 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1915 // recursively consider all its users and incoming values that are also PHI
1916 // nodes. We then make a decision about all of those PHIs at once. Either they
1917 // all get broken up, or none of them do. That way, we avoid cases where a
1918 // single PHI is/is not broken and we end up reforming/exploding a vector
1919 // multiple times, or even worse, doing it in a loop.
1920 SmallPtrSet<const PHINode *, 8> WorkList;
1921 collectPHINodes(I, SeenPHIs&: WorkList);
1922
1923#ifndef NDEBUG
1924 // Check that none of the PHI nodes in the worklist are in the map. If some of
1925 // them are, it means we're not good enough at collecting related PHIs.
1926 for (const PHINode *WLP : WorkList) {
1927 assert(BreakPhiNodesCache.count(WLP) == 0);
1928 }
1929#endif
1930
1931 // To consider a PHI profitable to break, we need to see some interesting
1932 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1933 // must have one to consider all PHIs breakable.
1934 //
1935 // This threshold has been determined through performance testing.
1936 //
1937 // Note that the computation below is equivalent to
1938 //
1939 // (unsigned)ceil((K / 3.0) * 2)
1940 //
1941 // It's simply written this way to avoid mixing integral/FP arithmetic.
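// For example, with a worklist of 5 PHIs: alignTo(5 * 2, 3) / 3 == 12 / 3 == 4,
// which matches ceil((5 / 3.0) * 2) == 4.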
1942 const auto Threshold = (alignTo(Value: WorkList.size() * 2, Align: 3) / 3);
1943 unsigned NumBreakablePHIs = 0;
1944 bool CanBreak = false;
1945 for (const PHINode *Cur : WorkList) {
1946 // Don't break PHIs that have no interesting incoming values. That is, where
1947 // there is no clear opportunity to fold the "extractelement" instructions
1948 // we would add.
1949 //
1950 // Note: InstCombine does not run after this pass, so we're only interested in the
1951 // foldings that the DAG combiner can do.
1952 if (any_of(Range: Cur->incoming_values(), P: isInterestingPHIIncomingValue)) {
1953 if (++NumBreakablePHIs >= Threshold) {
1954 CanBreak = true;
1955 break;
1956 }
1957 }
1958 }
1959
1960 for (const PHINode *Cur : WorkList)
1961 BreakPhiNodesCache[Cur] = CanBreak;
1962
1963 return CanBreak;
1964}
1965
1966/// Helper class for "break large PHIs" (visitPHINode).
1967///
1968/// This represents a slice of a PHI's incoming value, which is made up of:
1969/// - The type of the slice (Ty)
1970/// - The index in the incoming value's vector where the slice starts (Idx)
1971/// - The number of elements in the slice (NumElts).
1972/// It also keeps track of the NewPHI node inserted for this particular slice.
1973///
1974/// Slice examples:
1975/// <4 x i64> -> Split into four i64 slices.
1976/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1977/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1978/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1979class VectorSlice {
1980public:
1981 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1982 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1983
1984 Type *Ty = nullptr;
1985 unsigned Idx = 0;
1986 unsigned NumElts = 0;
1987 PHINode *NewPHI = nullptr;
1988
1989 /// Slice \p Inc according to the information contained within this slice.
1990 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1991 /// pair, it returns the same Sliced value as well.
1992 ///
1993 /// Note this *intentionally* does not return the same value for, say,
1994 /// [%bb.0, %0] & [%bb.1, %0] as:
1995 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1996 /// the value in bb.1 may not be reachable from bb.0 if it's its
1997 /// predecessor.)
1998 /// - We also want to make our extract instructions as local as possible so
1999 /// the DAG has better chances of folding them out. Duplicating them like
2000 /// that is beneficial in that regard.
2001 ///
2002 /// This is not only a minor optimization to avoid creating duplicate
2003 /// instructions, but also a requirement for correctness. It is not forbidden
2004 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
2005 /// returned a new value each time, those previously identical pairs would all
2006 /// have different incoming values (from the same block) and it'd cause a "PHI
2007 /// node has multiple entries for the same basic block with different incoming
2008 /// values!" verifier error.
2009 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
2010 Value *&Res = SlicedVals[{BB, Inc}];
2011 if (Res)
2012 return Res;
2013
2014 IRBuilder<> B(BB->getTerminator());
2015 if (Instruction *IncInst = dyn_cast<Instruction>(Val: Inc))
2016 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
2017
2018 if (NumElts > 1) {
2019 SmallVector<int, 4> Mask;
2020 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
2021 Mask.push_back(Elt: K);
2022 Res = B.CreateShuffleVector(V: Inc, Mask, Name: NewValName);
2023 } else
2024 Res = B.CreateExtractElement(Vec: Inc, Idx, Name: NewValName);
2025
2026 return Res;
2027 }
2028
2029private:
2030 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
2031};
2032
2033bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
2034 // Break-up fixed-vector PHIs into smaller pieces.
2035 // Default threshold is 32, so it breaks up any vector that's >32 bits into
2036 // its elements, or into 32-bit pieces (for 8/16 bit elts).
2037 //
2038 // This is only helpful for DAGISel because it doesn't handle large PHIs as
2039 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
2040 // With large, odd-sized PHIs we may end up needing many `build_vector`
2041 // operations with most elements being "undef". This inhibits a lot of
2042 // optimization opportunities and can result in unreasonably high register
2043 // pressure and the inevitable stack spilling.
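// As a sketch (block/value names invented), a PHI such as
//   %phi = phi <8 x i16> [ %a, %bb0 ], [ %b, %bb1 ]
// becomes four <2 x i16> PHIs like
//   %phi.s0 = phi <2 x i16> [ %a.s0, %bb0 ], [ %b.s0, %bb1 ]
// where %a.s0/%b.s0 are shufflevector slices emitted in the predecessors, and
// the slice PHIs are then reassembled into an <8 x i16> value that replaces
// the original PHI.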
2044 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
2045 return false;
2046
2047 FixedVectorType *FVT = dyn_cast<FixedVectorType>(Val: I.getType());
2048 if (!FVT || FVT->getNumElements() == 1 ||
2049 DL.getTypeSizeInBits(Ty: FVT) <= BreakLargePHIsThreshold)
2050 return false;
2051
2052 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
2053 return false;
2054
2055 std::vector<VectorSlice> Slices;
2056
2057 Type *EltTy = FVT->getElementType();
2058 {
2059 unsigned Idx = 0;
2060 // For 8/16-bit element types, don't scalarize fully but break it up into as many
2061 // 32-bit slices as we can, and scalarize the tail.
2062 const unsigned EltSize = DL.getTypeSizeInBits(Ty: EltTy);
2063 const unsigned NumElts = FVT->getNumElements();
2064 if (EltSize == 8 || EltSize == 16) {
2065 const unsigned SubVecSize = (32 / EltSize);
2066 Type *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: SubVecSize);
2067 for (unsigned End = alignDown(Value: NumElts, Align: SubVecSize); Idx < End;
2068 Idx += SubVecSize)
2069 Slices.emplace_back(args&: SubVecTy, args&: Idx, args: SubVecSize);
2070 }
2071
2072 // Scalarize all remaining elements.
2073 for (; Idx < NumElts; ++Idx)
2074 Slices.emplace_back(args&: EltTy, args&: Idx, args: 1);
2075 }
2076
2077 assert(Slices.size() > 1);
2078
2079 // Create one PHI per vector piece. The "VectorSlice" class takes care of
2080 // creating the necessary instruction to extract the relevant slices of each
2081 // incoming value.
2082 IRBuilder<> B(I.getParent());
2083 B.SetCurrentDebugLocation(I.getDebugLoc());
2084
2085 unsigned IncNameSuffix = 0;
2086 for (VectorSlice &S : Slices) {
2087 // We need to reset the builder's insertion point on each iteration, because
2088 // getSlicedVal may have inserted something into I's BB.
2089 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
2090 S.NewPHI = B.CreatePHI(Ty: S.Ty, NumReservedValues: I.getNumIncomingValues());
2091
2092 for (const auto &[Idx, BB] : enumerate(First: I.blocks())) {
2093 S.NewPHI->addIncoming(V: S.getSlicedVal(BB, Inc: I.getIncomingValue(i: Idx),
2094 NewValName: "largephi.extractslice" +
2095 std::to_string(val: IncNameSuffix++)),
2096 BB);
2097 }
2098 }
2099
2100 // And replace this PHI with a vector of all the previous PHI values.
2101 Value *Vec = PoisonValue::get(T: FVT);
2102 unsigned NameSuffix = 0;
2103 for (VectorSlice &S : Slices) {
2104 const auto ValName = "largephi.insertslice" + std::to_string(val: NameSuffix++);
2105 if (S.NumElts > 1)
2106 Vec = B.CreateInsertVector(DstType: FVT, SrcVec: Vec, SubVec: S.NewPHI, Idx: S.Idx, Name: ValName);
2107 else
2108 Vec = B.CreateInsertElement(Vec, NewElt: S.NewPHI, Idx: S.Idx, Name: ValName);
2109 }
2110
2111 I.replaceAllUsesWith(V: Vec);
2112 I.eraseFromParent();
2113 return true;
2114}
2115
2116/// \param V Value to check
2117/// \param DL DataLayout
2118/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
2119/// \param AS Target Address Space
2120/// \return true if \p V cannot be the null value of \p AS, false otherwise.
2121static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
2122 const AMDGPUTargetMachine &TM, unsigned AS) {
2123 // Pointer cannot be null if it's a block address, GV or alloca.
2124 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
2125 // it as the symbol could be null in such cases.
2126 if (isa<BlockAddress, GlobalValue, AllocaInst>(Val: V))
2127 return true;
2128
2129 // Check nonnull arguments.
2130 if (const auto *Arg = dyn_cast<Argument>(Val: V); Arg && Arg->hasNonNullAttr())
2131 return true;
2132
2133 // Check nonnull loads.
2134 if (const auto *Load = dyn_cast<LoadInst>(Val: V);
2135 Load && Load->hasMetadata(KindID: LLVMContext::MD_nonnull))
2136 return true;
2137
2138 // getUnderlyingObject may have looked through another addrspacecast, although
2139 // the optimizable situations have most likely been folded out by now.
2140 if (AS != cast<PointerType>(Val: V->getType())->getAddressSpace())
2141 return false;
2142
2143 // TODO: Calls that return nonnull?
2144
2145 // For all other things, use KnownBits.
2146 // We either use 0 or all bits set to indicate null, so check whether the
2147 // value can be zero or all ones.
2148 //
2149 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2150 // address spaces have non-zero null values.
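// For AMDGPU, the local and private address spaces use an all-ones (-1) null
// value, while flat and global use zero.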
2151 auto SrcPtrKB = computeKnownBits(V, DL);
2152 const auto NullVal = TM.getNullPointerValue(AddrSpace: AS);
2153
2154 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2155 assert((NullVal == 0 || NullVal == -1) &&
2156 "don't know how to check for this null value!");
2157 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2158}
2159
2160bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2161 // The intrinsic doesn't support vectors. It also seems difficult to prove that
2162 // a vector cannot contain any null pointers, so it's unclear whether this is
2163 // worth supporting.
2164 if (I.getType()->isVectorTy())
2165 return false;
2166
2167 // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
2168 // This is only worthwhile for casts from/to priv/local to flat.
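// For illustration (modulo the exact overloaded-type suffixes), a cast like
//   %flat = addrspacecast ptr addrspace(5) %p to ptr
// becomes
//   %flat = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %p)
// once every underlying object of %p is known to be non-null.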
2169 const unsigned SrcAS = I.getSrcAddressSpace();
2170 const unsigned DstAS = I.getDestAddressSpace();
2171
2172 bool CanLower = false;
2173 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2174 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2175 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2176 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2177 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2178 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2179 if (!CanLower)
2180 return false;
2181
2182 SmallVector<const Value *, 4> WorkList;
2183 getUnderlyingObjects(V: I.getOperand(i_nocapture: 0), Objects&: WorkList);
2184 if (!all_of(Range&: WorkList, P: [&](const Value *V) {
2185 return isPtrKnownNeverNull(V, DL, TM, AS: SrcAS);
2186 }))
2187 return false;
2188
2189 IRBuilder<> B(&I);
2190 auto *Intrin = B.CreateIntrinsic(
2191 RetTy: I.getType(), ID: Intrinsic::amdgcn_addrspacecast_nonnull, Args: {I.getOperand(i_nocapture: 0)});
2192 I.replaceAllUsesWith(V: Intrin);
2193 I.eraseFromParent();
2194 return true;
2195}
2196
2197bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2198 switch (I.getIntrinsicID()) {
2199 case Intrinsic::bitreverse:
2200 return visitBitreverseIntrinsicInst(I);
2201 case Intrinsic::minnum:
2202 case Intrinsic::minimumnum:
2203 case Intrinsic::minimum:
2204 return visitFMinLike(I);
2205 case Intrinsic::sqrt:
2206 return visitSqrt(I);
2207 default:
2208 return false;
2209 }
2210}
2211
2212bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
2213 bool Changed = false;
2214
2215 if (ST.has16BitInsts() && needsPromotionToI32(T: I.getType()) &&
2216 UA.isUniform(I: &I))
2217 Changed |= promoteUniformBitreverseToI32(I);
2218
2219 return Changed;
2220}
2221
2222/// Match non-nan fract pattern.
2223/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2224/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2225/// minimum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2226///
2227 /// Only matches if fract is a useful instruction for the subtarget. Does not
2228 /// account for nan handling; the instruction has a nan check on the input value.
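///
/// An illustrative match for f32 (value names invented):
///   %floor = call float @llvm.floor.f32(float %x)
///   %sub = fsub float %x, %floor
///   %min = call float @llvm.minnum.f32(float %sub, float 0x3FEFFFFFE0000000)
/// where 0x3FEFFFFFE0000000 is 0x1.fffffep-1, the largest float below 1.0.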
2229Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
2230 if (ST.hasFractBug())
2231 return nullptr;
2232
2233 Intrinsic::ID IID = I.getIntrinsicID();
2234
2235 // The value is only used in contexts where we know the input isn't a nan, so
2236 // any of the fmin variants are fine.
2237 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
2238 IID != Intrinsic::minimumnum)
2239 return nullptr;
2240
2241 Type *Ty = I.getType();
2242 if (!isLegalFloatingTy(Ty: Ty->getScalarType()))
2243 return nullptr;
2244
2245 Value *Arg0 = I.getArgOperand(i: 0);
2246 Value *Arg1 = I.getArgOperand(i: 1);
2247
2248 const APFloat *C;
2249 if (!match(V: Arg1, P: m_APFloat(Res&: C)))
2250 return nullptr;
2251
2252 APFloat One(1.0);
2253 bool LosesInfo;
2254 One.convert(ToSemantics: C->getSemantics(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
2255
2256 // Match nextafter(1.0, -1.0)
2257 One.next(nextDown: true);
2258 if (One != *C)
2259 return nullptr;
2260
2261 Value *FloorSrc;
2262 if (match(V: Arg0, P: m_FSub(L: m_Value(V&: FloorSrc),
2263 R: m_Intrinsic<Intrinsic::floor>(Op0: m_Deferred(V: FloorSrc)))))
2264 return FloorSrc;
2265 return nullptr;
2266}
2267
2268Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2269 Value *FractArg) {
2270 SmallVector<Value *, 4> FractVals;
2271 extractValues(Builder, Values&: FractVals, V: FractArg);
2272
2273 SmallVector<Value *, 4> ResultVals(FractVals.size());
2274
2275 Type *Ty = FractArg->getType()->getScalarType();
2276 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2277 ResultVals[I] =
2278 Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_fract, Types: {Ty}, Args: {FractVals[I]});
2279 }
2280
2281 return insertValues(Builder, Ty: FractArg->getType(), Values&: ResultVals);
2282}
2283
2284bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2285 Value *FractArg = matchFractPat(I);
2286 if (!FractArg)
2287 return false;
2288
2289 // Match pattern for fract intrinsic in contexts where the nan check has been
2290 // optimized out (and hope the knowledge that the source can't be nan wasn't lost).
2291 if (!I.hasNoNaNs() && !isKnownNeverNaN(V: FractArg, SQ: SimplifyQuery(DL, TLI)))
2292 return false;
2293
2294 IRBuilder<> Builder(&I);
2295 FastMathFlags FMF = I.getFastMathFlags();
2296 FMF.setNoNaNs();
2297 Builder.setFastMathFlags(FMF);
2298
2299 Value *Fract = applyFractPat(Builder, FractArg);
2300 Fract->takeName(V: &I);
2301 I.replaceAllUsesWith(V: Fract);
2302
2303 RecursivelyDeleteTriviallyDeadInstructions(V: &I, TLI);
2304 return true;
2305}
2306
2307static bool isOneOrNegOne(const Value *Val) {
2308 const APFloat *C;
2309 return match(V: Val, P: m_APFloat(Res&: C)) && C->getExactLog2Abs() == 0;
2310}
2311
2312// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
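// For example (metadata id invented), a call such as
//   %r = call float @llvm.sqrt.f32(float %x), !fpmath !0
//   !0 = !{float 2.500000e+00}
// requests 2.5 ulp of accuracy, which is loose enough for the expansion below
// rather than the correctly rounded lowering.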
2313bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2314 Type *Ty = Sqrt.getType()->getScalarType();
2315 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2316 return false;
2317
2318 const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &Sqrt);
2319 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2320
2321 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2322 // of fast llvm.sqrt will give the raw instruction anyway.
2323 if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
2324 return false;
2325
2326 const float ReqdAccuracy = FPOp->getFPAccuracy();
2327
2328 // Defer correctly rounded expansion to codegen.
2329 if (ReqdAccuracy < 1.0f)
2330 return false;
2331
2332 // FIXME: This is an ugly hack for this pass using forward iteration instead
2333 // of reverse. If it worked like a normal combiner, the rsq would form before
2334 // we saw a sqrt call.
2335 auto *FDiv =
2336 dyn_cast_or_null<FPMathOperator>(Val: Sqrt.getUniqueUndroppableUser());
2337 if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
2338 FDiv->getFPAccuracy() >= 1.0f &&
2339 canOptimizeWithRsq(SqrtOp: FPOp, DivFMF: FDiv->getFastMathFlags(), SqrtFMF) &&
2340 // TODO: We should also handle the arcp case for the fdiv with non-1 value
2341 isOneOrNegOne(Val: FDiv->getOperand(i: 0)))
2342 return false;
2343
2344 Value *SrcVal = Sqrt.getOperand(i_nocapture: 0);
2345 bool CanTreatAsDAZ = canIgnoreDenormalInput(V: SrcVal, CtxI: &Sqrt);
2346
2347 // The raw instruction is 1 ulp, but the correction for denormal handling
2348 // brings it to 2.
2349 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2350 return false;
2351
2352 IRBuilder<> Builder(&Sqrt);
2353 SmallVector<Value *, 4> SrcVals;
2354 extractValues(Builder, Values&: SrcVals, V: SrcVal);
2355
2356 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2357 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2358 if (CanTreatAsDAZ)
2359 ResultVals[I] = Builder.CreateCall(Callee: getSqrtF32(), Args: SrcVals[I]);
2360 else
2361 ResultVals[I] = emitSqrtIEEE2ULP(Builder, Src: SrcVals[I], FMF: SqrtFMF);
2362 }
2363
2364 Value *NewSqrt = insertValues(Builder, Ty: Sqrt.getType(), Values&: ResultVals);
2365 NewSqrt->takeName(V: &Sqrt);
2366 Sqrt.replaceAllUsesWith(V: NewSqrt);
2367 Sqrt.eraseFromParent();
2368 return true;
2369}
2370
2371bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2372 if (skipFunction(F))
2373 return false;
2374
2375 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2376 if (!TPC)
2377 return false;
2378
2379 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2380 const TargetLibraryInfo *TLI =
2381 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2382 AssumptionCache *AC =
2383 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2384 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2385 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2386 const UniformityInfo &UA =
2387 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2388 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2389}
2390
2391PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2392 FunctionAnalysisManager &FAM) {
2393 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2394 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(IR&: F);
2395 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(IR&: F);
2396 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F);
2397 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(IR&: F);
2398 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2399 if (!Impl.run())
2400 return PreservedAnalyses::all();
2401 PreservedAnalyses PA = PreservedAnalyses::none();
2402 if (!Impl.FlowChanged)
2403 PA.preserveSet<CFGAnalyses>();
2404 return PA;
2405}
2406
2407INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2408 "AMDGPU IR optimizations", false, false)
2409INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2410INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2411INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2412INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2413 false, false)
2414
2415char AMDGPUCodeGenPrepare::ID = 0;
2416
2417FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2418 return new AMDGPUCodeGenPrepare();
2419}
2420