AMDGPULibCalls.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp]

1	//===- AMDGPULibCalls.cpp -------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// This file does AMD library function optimizations.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "AMDGPU.h"
15	#include "AMDGPULibFunc.h"
16	#include "llvm/Analysis/AssumptionCache.h"
17	#include "llvm/Analysis/TargetLibraryInfo.h"
18	#include "llvm/Analysis/ValueTracking.h"
19	#include "llvm/IR/AttributeMask.h"
20	#include "llvm/IR/Dominators.h"
21	#include "llvm/IR/IRBuilder.h"
22	#include "llvm/IR/IntrinsicsAMDGPU.h"
23	#include "llvm/IR/MDBuilder.h"
24	#include "llvm/IR/PatternMatch.h"
25	#include <cmath>
26
27	#define DEBUG_TYPE "amdgpu-simplifylib"
28
29	using namespace llvm;
30	using namespace llvm::PatternMatch;
31
32	static cl::opt<bool> EnablePreLink("amdgpu-prelink",
33	cl::desc ("Enable pre-link mode optimizations"),
34	cl::init(Val: false),
35	cl::Hidden);
36
37	static cl::list<std::string> UseNative("amdgpu-use-native",
38	cl::desc ("Comma separated list of functions to replace with native, or all"),
39	cl::CommaSeparated, cl::ValueOptional,
40	cl::Hidden);
41
// Short aliases for the numeric constants used by the lookup tables below.
#define MATH_PI      numbers::pi
#define MATH_E       numbers::e
#define MATH_SQRT2   numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2
46
/// Selects which member of the pow family a pow-style expansion handles.
enum class PowKind {
  Pow,
  PowR,
  PowN,
  RootN
};
48
49	namespace llvm {
50
51	class AMDGPULibCalls {
52	private:
53	SimplifyQuery SQ;
54
55	using FuncInfo = llvm::AMDGPULibFunc;
56
57	// -fuse-native.
58	bool AllNative = false;
59
60	bool useNativeFunc(const StringRef F) const;
61
62	// Return a pointer (pointer expr) to the function if function definition with
63	// "FuncName" exists. It may create a new function prototype in pre-link mode.
64	FunctionCallee getFunction(Module M, const* FuncInfo &fInfo);
65
66	/// Wrapper around getFunction which tries to use a faster variant if
67	/// available, and falls back to a less fast option.
68	///
69	/// Return a replacement function for \p fInfo that has float-typed fast
70	/// variants. \p NewFunc is a base replacement function to use. \p
71	/// NewFuncFastVariant is a faster version to use if the calling context knows
72	/// it's legal. If there is no fast variant to use, \p NewFuncFastVariant
73	/// should be EI_NONE.
74	FunctionCallee getFloatFastVariant(Module M, const* FuncInfo &fInfo,
75	FuncInfo &newInfo,
76	AMDGPULibFunc::EFuncId NewFunc,
77	AMDGPULibFunc::EFuncId NewFuncFastVariant);
78
79	bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);
80
81	bool TDOFold(CallInst CI, const* FuncInfo &FInfo);
82
83	/ Specialized optimizations /
84
85	// pow/powr/pown
86	bool fold_pow(FPMathOperator FPOp, IRBuilder<> &B, const* FuncInfo &FInfo);
87
88	/// Peform a fast math expansion of pow, powr, pown or rootn.
89	bool expandFastPow(FPMathOperator *FPOp, IRBuilder<> &B, PowKind Kind);
90
91	bool tryOptimizePow(FPMathOperator *FPOp, IRBuilder<> &B,
92	const FuncInfo &FInfo);
93
94	// rootn
95	bool fold_rootn(FPMathOperator FPOp, IRBuilder<> &B, const* FuncInfo &FInfo);
96
97	// -fuse-native for sincos
98	bool sincosUseNative(CallInst aCI, const* FuncInfo &FInfo);
99
100	// evaluate calls if calls' arguments are constants.
101	bool evaluateScalarMathFunc(const FuncInfo &FInfo, APFloat &Res0,
102	APFloat &Res1, Constant copr0, Constant copr1);
103	bool evaluateCall(CallInst aCI, const* FuncInfo &FInfo);
104
105	/// Insert a value to sincos function \p Fsincos. Returns (value of sin, value
106	/// of cos, sincos call).
107	std::tuple<Value , Value , Value > insertSinCos(Value Arg,
108	FastMathFlags FMF,
109	IRBuilder<> &B,
110	FunctionCallee Fsincos);
111
112	// sin/cos
113	bool fold_sincos(FPMathOperator FPOp, IRBuilder<> &B, const* FuncInfo &FInfo);
114
115	// __read_pipe/__write_pipe
116	bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
117	const FuncInfo &FInfo);
118
119	// Get a scalar native builtin single argument FP function
120	FunctionCallee getNativeFunction(Module M, const* FuncInfo &FInfo);
121
122	/// Substitute a call to a known libcall with an intrinsic call. If \p
123	/// AllowMinSize is true, allow the replacement in a minsize function.
124	bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
125	bool AllowMinSizeF32 = false,
126	bool AllowF64 = false,
127	bool AllowStrictFP = false);
128	void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
129	Intrinsic::ID IntrID);
130
131	bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
132	Intrinsic::ID IntrID,
133	bool AllowMinSizeF32 = false,
134	bool AllowF64 = false,
135	bool AllowStrictFP = false);
136
137	protected:
138	bool isUnsafeFiniteOnlyMath(const FPMathOperator FPOp) const*;
139
140	bool canIncreasePrecisionOfConstantFold(const FPMathOperator FPOp) const*;
141
142	static void replaceCall(Instruction I, Value With) {
143	I->replaceAllUsesWith(V: With);
144	I->eraseFromParent();
145	}
146
147	static void replaceCall(FPMathOperator I, Value With) {
148	replaceCall(I: cast<Instruction>(Val: I), With);
149	}
150
151	public:
152	AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM);
153
154	bool fold(CallInst *CI);
155
156	void initNativeFuncs();
157
158	// Replace a normal math function call with that native version
159	bool useNative(CallInst *CI);
160	};
161
162	} // end namespace llvm
163
164	template <typename IRB>
165	static CallInst CreateCallEx(IRB &B, FunctionCallee Callee, Value Arg,
166	const Twine &Name = "") {
167	CallInst *R = B.CreateCall(Callee, Arg, Name);
168	if (Function *F = dyn_cast<Function>(Val: Callee.getCallee()))
169	R->setCallingConv(F->getCallingConv());
170	return R;
171	}
172
173	template <typename IRB>
174	static CallInst CreateCallEx2(IRB &B, FunctionCallee Callee, Value Arg1,
175	Value Arg2, const* Twine &Name = "") {
176	CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
177	if (Function *F = dyn_cast<Function>(Val: Callee.getCallee()))
178	R->setCallingConv(F->getCallingConv());
179	return R;
180	}
181
182	static FunctionType getPownType(FunctionType FT) {
183	Type *PowNExpTy = Type::getInt32Ty(C&: FT->getContext());
184	if (VectorType *VecTy = dyn_cast<VectorType>(Val: FT->getReturnType()))
185	PowNExpTy = VectorType::get(ElementType: PowNExpTy, EC: VecTy->getElementCount());
186
187	return FunctionType::get(Result: FT->getReturnType(),
188	Params: {FT->getParamType(i: `0`), PowNExpTy}, isVarArg: false);
189	}
190
// Data structures for table-driven optimizations.
// FuncTbl works for both f32 and f64 functions with 1 input argument

/// One row of a table-driven fold: calling the function on `input` yields
/// `result`. Both members default to 0.0 so a default-initialized entry never
/// holds indeterminate values; the type remains an aggregate, so the
/// `{result, input}` initializers keep working.
struct TableEntry {
  double result = 0.0;
  double input = 0.0;
};
198
199	/ a list of {result, input} /
200	static const TableEntry tbl_acos[] = {
201	{MATH_PI / `2.0`, .input: `0.0`},
202	{MATH_PI / `2.0`, .input: -`0.0`},
203	{.result: `0.0`, .input: `1.0`},
204	{MATH_PI, .input: -`1.0`}
205	};
206	static const TableEntry tbl_acosh[] = {
207	{.result: `0.0`, .input: `1.0`}
208	};
209	static const TableEntry tbl_acospi[] = {
210	{.result: `0.5`, .input: `0.0`},
211	{.result: `0.5`, .input: -`0.0`},
212	{.result: `0.0`, .input: `1.0`},
213	{.result: `1.0`, .input: -`1.0`}
214	};
215	static const TableEntry tbl_asin[] = {
216	{.result: `0.0`, .input: `0.0`},
217	{.result: -`0.0`, .input: -`0.0`},
218	{MATH_PI / `2.0`, .input: `1.0`},
219	{.result: -MATH_PI / `2.0`, .input: -`1.0`}
220	};
221	static const TableEntry tbl_asinh[] = {
222	{.result: `0.0`, .input: `0.0`},
223	{.result: -`0.0`, .input: -`0.0`}
224	};
225	static const TableEntry tbl_asinpi[] = {
226	{.result: `0.0`, .input: `0.0`},
227	{.result: -`0.0`, .input: -`0.0`},
228	{.result: `0.5`, .input: `1.0`},
229	{.result: -`0.5`, .input: -`1.0`}
230	};
231	static const TableEntry tbl_atan[] = {
232	{.result: `0.0`, .input: `0.0`},
233	{.result: -`0.0`, .input: -`0.0`},
234	{MATH_PI / `4.0`, .input: `1.0`},
235	{.result: -MATH_PI / `4.0`, .input: -`1.0`}
236	};
237	static const TableEntry tbl_atanh[] = {
238	{.result: `0.0`, .input: `0.0`},
239	{.result: -`0.0`, .input: -`0.0`}
240	};
241	static const TableEntry tbl_atanpi[] = {
242	{.result: `0.0`, .input: `0.0`},
243	{.result: -`0.0`, .input: -`0.0`},
244	{.result: `0.25`, .input: `1.0`},
245	{.result: -`0.25`, .input: -`1.0`}
246	};
247	static const TableEntry tbl_cbrt[] = {
248	{.result: `0.0`, .input: `0.0`},
249	{.result: -`0.0`, .input: -`0.0`},
250	{.result: `1.0`, .input: `1.0`},
251	{.result: -`1.0`, .input: -`1.0`},
252	};
253	static const TableEntry tbl_cos[] = {
254	{.result: `1.0`, .input: `0.0`},
255	{.result: `1.0`, .input: -`0.0`}
256	};
257	static const TableEntry tbl_cosh[] = {
258	{.result: `1.0`, .input: `0.0`},
259	{.result: `1.0`, .input: -`0.0`}
260	};
261	static const TableEntry tbl_cospi[] = {
262	{.result: `1.0`, .input: `0.0`},
263	{.result: `1.0`, .input: -`0.0`}
264	};
265	static const TableEntry tbl_erfc[] = {
266	{.result: `1.0`, .input: `0.0`},
267	{.result: `1.0`, .input: -`0.0`}
268	};
269	static const TableEntry tbl_erf[] = {
270	{.result: `0.0`, .input: `0.0`},
271	{.result: -`0.0`, .input: -`0.0`}
272	};
273	static const TableEntry tbl_exp[] = {
274	{.result: `1.0`, .input: `0.0`},
275	{.result: `1.0`, .input: -`0.0`},
276	{MATH_E, .input: `1.0`}
277	};
278	static const TableEntry tbl_exp2[] = {
279	{.result: `1.0`, .input: `0.0`},
280	{.result: `1.0`, .input: -`0.0`},
281	{.result: `2.0`, .input: `1.0`}
282	};
283	static const TableEntry tbl_exp10[] = {
284	{.result: `1.0`, .input: `0.0`},
285	{.result: `1.0`, .input: -`0.0`},
286	{.result: `10.0`, .input: `1.0`}
287	};
288	static const TableEntry tbl_expm1[] = {
289	{.result: `0.0`, .input: `0.0`},
290	{.result: -`0.0`, .input: -`0.0`}
291	};
292	static const TableEntry tbl_log[] = {
293	{.result: `0.0`, .input: `1.0`},
294	{.result: `1.0`, MATH_E}
295	};
296	static const TableEntry tbl_log2[] = {
297	{.result: `0.0`, .input: `1.0`},
298	{.result: `1.0`, .input: `2.0`}
299	};
300	static const TableEntry tbl_log10[] = {
301	{.result: `0.0`, .input: `1.0`},
302	{.result: `1.0`, .input: `10.0`}
303	};
304	static const TableEntry tbl_rsqrt[] = {
305	{.result: `1.0`, .input: `1.0`},
306	{MATH_SQRT1_2, .input: `2.0`}
307	};
308	static const TableEntry tbl_sin[] = {
309	{.result: `0.0`, .input: `0.0`},
310	{.result: -`0.0`, .input: -`0.0`}
311	};
312	static const TableEntry tbl_sinh[] = {
313	{.result: `0.0`, .input: `0.0`},
314	{.result: -`0.0`, .input: -`0.0`}
315	};
316	static const TableEntry tbl_sinpi[] = {
317	{.result: `0.0`, .input: `0.0`},
318	{.result: -`0.0`, .input: -`0.0`}
319	};
320	static const TableEntry tbl_sqrt[] = {
321	{.result: `0.0`, .input: `0.0`},
322	{.result: `1.0`, .input: `1.0`},
323	{MATH_SQRT2, .input: `2.0`}
324	};
325	static const TableEntry tbl_tan[] = {
326	{.result: `0.0`, .input: `0.0`},
327	{.result: -`0.0`, .input: -`0.0`}
328	};
329	static const TableEntry tbl_tanh[] = {
330	{.result: `0.0`, .input: `0.0`},
331	{.result: -`0.0`, .input: -`0.0`}
332	};
333	static const TableEntry tbl_tanpi[] = {
334	{.result: `0.0`, .input: `0.0`},
335	{.result: -`0.0`, .input: -`0.0`}
336	};
337	static const TableEntry tbl_tgamma[] = {
338	{.result: `1.0`, .input: `1.0`},
339	{.result: `1.0`, .input: `2.0`},
340	{.result: `2.0`, .input: `3.0`},
341	{.result: `6.0`, .input: `4.0`}
342	};
343
344	static bool HasNative(AMDGPULibFunc::EFuncId id) {
345	switch(id) {
346	case AMDGPULibFunc::EI_DIVIDE:
347	case AMDGPULibFunc::EI_COS:
348	case AMDGPULibFunc::EI_EXP:
349	case AMDGPULibFunc::EI_EXP2:
350	case AMDGPULibFunc::EI_EXP10:
351	case AMDGPULibFunc::EI_LOG:
352	case AMDGPULibFunc::EI_LOG2:
353	case AMDGPULibFunc::EI_LOG10:
354	case AMDGPULibFunc::EI_POWR:
355	case AMDGPULibFunc::EI_RECIP:
356	case AMDGPULibFunc::EI_RSQRT:
357	case AMDGPULibFunc::EI_SIN:
358	case AMDGPULibFunc::EI_SINCOS:
359	case AMDGPULibFunc::EI_SQRT:
360	case AMDGPULibFunc::EI_TAN:
361	return true;
362	default:;
363	}
364	return false;
365	}
366
367	using TableRef = ArrayRef<TableEntry>;
368
369	static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
370	switch(id) {
371	case AMDGPULibFunc::EI_ACOS: return TableRef (tbl_acos);
372	case AMDGPULibFunc::EI_ACOSH: return TableRef (tbl_acosh);
373	case AMDGPULibFunc::EI_ACOSPI: return TableRef (tbl_acospi);
374	case AMDGPULibFunc::EI_ASIN: return TableRef (tbl_asin);
375	case AMDGPULibFunc::EI_ASINH: return TableRef (tbl_asinh);
376	case AMDGPULibFunc::EI_ASINPI: return TableRef (tbl_asinpi);
377	case AMDGPULibFunc::EI_ATAN: return TableRef (tbl_atan);
378	case AMDGPULibFunc::EI_ATANH: return TableRef (tbl_atanh);
379	case AMDGPULibFunc::EI_ATANPI: return TableRef (tbl_atanpi);
380	case AMDGPULibFunc::EI_CBRT: return TableRef (tbl_cbrt);
381	case AMDGPULibFunc::EI_NCOS:
382	case AMDGPULibFunc::EI_COS: return TableRef (tbl_cos);
383	case AMDGPULibFunc::EI_COSH: return TableRef (tbl_cosh);
384	case AMDGPULibFunc::EI_COSPI: return TableRef (tbl_cospi);
385	case AMDGPULibFunc::EI_ERFC: return TableRef (tbl_erfc);
386	case AMDGPULibFunc::EI_ERF: return TableRef (tbl_erf);
387	case AMDGPULibFunc::EI_EXP: return TableRef (tbl_exp);
388	case AMDGPULibFunc::EI_NEXP2:
389	case AMDGPULibFunc::EI_EXP2: return TableRef (tbl_exp2);
390	case AMDGPULibFunc::EI_EXP10: return TableRef (tbl_exp10);
391	case AMDGPULibFunc::EI_EXPM1: return TableRef (tbl_expm1);
392	case AMDGPULibFunc::EI_LOG: return TableRef (tbl_log);
393	case AMDGPULibFunc::EI_NLOG2:
394	case AMDGPULibFunc::EI_LOG2: return TableRef (tbl_log2);
395	case AMDGPULibFunc::EI_LOG10: return TableRef (tbl_log10);
396	case AMDGPULibFunc::EI_NRSQRT:
397	case AMDGPULibFunc::EI_RSQRT: return TableRef (tbl_rsqrt);
398	case AMDGPULibFunc::EI_NSIN:
399	case AMDGPULibFunc::EI_SIN: return TableRef (tbl_sin);
400	case AMDGPULibFunc::EI_SINH: return TableRef (tbl_sinh);
401	case AMDGPULibFunc::EI_SINPI: return TableRef (tbl_sinpi);
402	case AMDGPULibFunc::EI_NSQRT:
403	case AMDGPULibFunc::EI_SQRT: return TableRef (tbl_sqrt);
404	case AMDGPULibFunc::EI_TAN: return TableRef (tbl_tan);
405	case AMDGPULibFunc::EI_TANH: return TableRef (tbl_tanh);
406	case AMDGPULibFunc::EI_TANPI: return TableRef (tbl_tanpi);
407	case AMDGPULibFunc::EI_TGAMMA: return TableRef (tbl_tgamma);
408	default:;
409	}
410	return TableRef ();
411	}
412
413	static inline int getVecSize(const AMDGPULibFunc& FInfo) {
414	return FInfo.getLeads()[`0`].VectorSize;
415	}
416
417	static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
418	return (AMDGPULibFunc::EType)FInfo.getLeads()[`0`].ArgType;
419	}
420
421	FunctionCallee AMDGPULibCalls::getFunction(Module M, const* FuncInfo &fInfo) {
422	// If we are doing PreLinkOpt, the function is external. So it is safe to
423	// use getOrInsertFunction() at this stage.
424
425	return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
426	: AMDGPULibFunc::getFunction(M, fInfo);
427	}
428
429	FunctionCallee AMDGPULibCalls::getFloatFastVariant(
430	Module M, const* FuncInfo &fInfo, FuncInfo &newInfo,
431	AMDGPULibFunc::EFuncId NewFunc, AMDGPULibFunc::EFuncId FastVariant) {
432	assert(NewFunc != FastVariant);
433
434	if (FastVariant != AMDGPULibFunc::EI_NONE &&
435	getArgType(FInfo: fInfo) == AMDGPULibFunc::F32) {
436	newInfo = AMDGPULibFunc (FastVariant, fInfo);
437	if (FunctionCallee NewCallee = getFunction(M, fInfo: newInfo))
438	return NewCallee;
439	}
440
441	newInfo = AMDGPULibFunc (NewFunc, fInfo);
442	return getFunction(M, fInfo: newInfo);
443	}
444
445	bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
446	FuncInfo &FInfo) {
447	return AMDGPULibFunc::parse(MangledName: FMangledName, Ptr&: FInfo);
448	}
449
450	bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator FPOp) const* {
451	return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs();
452	}
453
454	bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
455	const FPMathOperator FPOp) const* {
456	// TODO: Refine to approxFunc or contract
457	return FPOp->isFast();
458	}
459
460	AMDGPULibCalls::AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM)
461	: SQ (F.getParent()->getDataLayout(),
462	&FAM.getResult<TargetLibraryAnalysis>(IR&: F),
463	FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F),
464	&FAM.getResult<AssumptionAnalysis>(IR&: F)) {}
465
466	bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
467	return AllNative \|\| llvm::is_contained(Range&: UseNative, Element: F);
468	}
469
470	void AMDGPULibCalls::initNativeFuncs() {
471	AllNative = useNativeFunc(F: "all") \|\|
472	(UseNative.getNumOccurrences() && UseNative.size() == `1` &&
473	UseNative.begin()->empty());
474	}
475
476	bool AMDGPULibCalls::sincosUseNative(CallInst aCI, const* FuncInfo &FInfo) {
477	bool native_sin = useNativeFunc(F: "sin");
478	bool native_cos = useNativeFunc(F: "cos");
479
480	if (native_sin && native_cos) {
481	Module *M = aCI->getModule();
482	Value *opr0 = aCI->getArgOperand(i: `0`);
483
484	AMDGPULibFunc nf;
485	nf.getLeads()[`0`].ArgType = FInfo.getLeads()[`0`].ArgType;
486	nf.getLeads()[`0`].VectorSize = FInfo.getLeads()[`0`].VectorSize;
487
488	nf.setPrefix(AMDGPULibFunc::NATIVE);
489	nf.setId(AMDGPULibFunc::EI_SIN);
490	FunctionCallee sinExpr = getFunction(M, fInfo: nf);
491
492	nf.setPrefix(AMDGPULibFunc::NATIVE);
493	nf.setId(AMDGPULibFunc::EI_COS);
494	FunctionCallee cosExpr = getFunction(M, fInfo: nf);
495	if (sinExpr && cosExpr) {
496	Value *sinval =
497	CallInst::Create(Func: sinExpr, Args: opr0, NameStr: "splitsin", InsertBefore: aCI->getIterator());
498	Value *cosval =
499	CallInst::Create(Func: cosExpr, Args: opr0, NameStr: "splitcos", InsertBefore: aCI->getIterator());
500	new StoreInst (cosval, aCI->getArgOperand(i: `1`), aCI->getIterator());
501
502	DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
503	<< " with native version of sin/cos");
504
505	replaceCall(I: aCI, With: sinval);
506	return true;
507	}
508	}
509	return false;
510	}
511
512	bool AMDGPULibCalls::useNative(CallInst *aCI) {
513	Function *Callee = aCI->getCalledFunction();
514	if (!Callee \|\| aCI->isNoBuiltin())
515	return false;
516
517	FuncInfo FInfo;
518	if (!parseFunctionName(FMangledName: Callee->getName(), FInfo) \|\| !FInfo.isMangled() \|\|
519	FInfo.getPrefix() != AMDGPULibFunc::NOPFX \|\|
520	getArgType(FInfo) == AMDGPULibFunc::F64 \|\| !HasNative(id: FInfo.getId()) \|\|
521	!(AllNative \|\| useNativeFunc(F: FInfo.getName()))) {
522	return false;
523	}
524
525	if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
526	return sincosUseNative(aCI, FInfo);
527
528	FInfo.setPrefix(AMDGPULibFunc::NATIVE);
529	FunctionCallee F = getFunction(M: aCI->getModule(), fInfo: FInfo);
530	if (!F)
531	return false;
532
533	aCI->setCalledFunction(F);
534	DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
535	<< " with native version");
536	return true;
537	}
538
539	// Clang emits call of __read_pipe_2 or __read_pipe_4 for OpenCL read_pipe
540	// builtin, with appended type size and alignment arguments, where 2 or 4
541	// indicates the original number of arguments. The library has optimized version
542	// of __read_pipe_2/__read_pipe_4 when the type size and alignment has the same
543	// power of 2 value. This function transforms __read_pipe_2 to __read_pipe_2_N
544	// for such cases where N is the size in bytes of the type (N = 1, 2, 4, 8, ...,
545	// 128). The same for __read_pipe_4, write_pipe_2, and write_pipe_4.
546	bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
547	const FuncInfo &FInfo) {
548	auto *Callee = CI->getCalledFunction();
549	if (!Callee->isDeclaration())
550	return false;
551
552	assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
553	auto *M = Callee->getParent();
554	std::string Name = std::string (Callee->getName());
555	auto NumArg = CI->arg_size();
556	if (NumArg != `4` && NumArg != `6`)
557	return false;
558	ConstantInt *PacketSize =
559	dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - `2`));
560	ConstantInt *PacketAlign =
561	dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - `1`));
562	if (!PacketSize \|\| !PacketAlign)
563	return false;
564
565	unsigned Size = PacketSize->getZExtValue();
566	Align Alignment = PacketAlign->getAlignValue();
567	if (Alignment != Size)
568	return false;
569
570	unsigned PtrArgLoc = CI->arg_size() - `3`;
571	Value *PtrArg = CI->getArgOperand(i: PtrArgLoc);
572	Type *PtrTy = PtrArg->getType();
573
574	SmallVector<llvm::Type *, `6`> ArgTys;
575	for (unsigned I = `0`; I != PtrArgLoc; ++I)
576	ArgTys.push_back(Elt: CI->getArgOperand(i: I)->getType());
577	ArgTys.push_back(Elt: PtrTy);
578
579	Name = Name + "_" + std::to_string(val: Size);
580	auto *FTy = FunctionType::get(Result: Callee->getReturnType(),
581	Params: ArrayRef<Type >(ArgTys), isVarArg: false*);
582	AMDGPULibFunc NewLibFunc(Name, FTy);
583	FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, fInfo: NewLibFunc);
584	if (!F)
585	return false;
586
587	SmallVector<Value *, `6`> Args;
588	for (unsigned I = `0`; I != PtrArgLoc; ++I)
589	Args.push_back(Elt: CI->getArgOperand(i: I));
590	Args.push_back(Elt: PtrArg);
591
592	auto *NCI = B.CreateCall(Callee: F, Args);
593	NCI->setAttributes(CI->getAttributes());
594	CI->replaceAllUsesWith(V: NCI);
595	CI->dropAllReferences();
596	CI->eraseFromParent();
597
598	return true;
599	}
600
601	// This function returns false if no change; return true otherwise.
602	bool AMDGPULibCalls::fold(CallInst *CI) {
603	Function *Callee = CI->getCalledFunction();
604	// Ignore indirect calls.
605	if (!Callee \|\| Callee->isIntrinsic() \|\| CI->isNoBuiltin())
606	return false;
607
608	FuncInfo FInfo;
609	if (!parseFunctionName(FMangledName: Callee->getName(), FInfo))
610	return false;
611
612	// Further check the number of arguments to see if they match.
613	// TODO: Check calling convention matches too
614	if (!FInfo.isCompatibleSignature(M: *Callee->getParent(), FuncTy: CI->getFunctionType()))
615	return false;
616
617	LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << `'\n'`);
618
619	if (TDOFold(CI, FInfo))
620	return true;
621
622	IRBuilder<> B(CI);
623	if (CI->isStrictFP())
624	B.setIsFPConstrained(true);
625
626	if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: CI)) {
627	// Under unsafe-math, evaluate calls if possible.
628	// According to Brian Sumner, we can do this for all f32 function calls
629	// using host's double function calls.
630	if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(aCI: CI, FInfo))
631	return true;
632
633	// Copy fast flags from the original call.
634	FastMathFlags FMF = FPOp->getFastMathFlags();
635	B.setFastMathFlags(FMF);
636
637	// Specialized optimizations for each function call.
638	//
639	// TODO: Handle native functions
640	switch (FInfo.getId()) {
641	case AMDGPULibFunc::EI_EXP:
642	if (FMF.none())
643	return false;
644	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::exp,
645	AllowMinSizeF32: FMF.approxFunc());
646	case AMDGPULibFunc::EI_EXP2:
647	if (FMF.none())
648	return false;
649	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::exp2,
650	AllowMinSizeF32: FMF.approxFunc());
651	case AMDGPULibFunc::EI_LOG:
652	if (FMF.none())
653	return false;
654	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log,
655	AllowMinSizeF32: FMF.approxFunc());
656	case AMDGPULibFunc::EI_LOG2:
657	if (FMF.none())
658	return false;
659	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log2,
660	AllowMinSizeF32: FMF.approxFunc());
661	case AMDGPULibFunc::EI_LOG10:
662	if (FMF.none())
663	return false;
664	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log10,
665	AllowMinSizeF32: FMF.approxFunc());
666	case AMDGPULibFunc::EI_FMIN:
667	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::minnum,
668	AllowMinSizeF32: true, AllowF64: true);
669	case AMDGPULibFunc::EI_FMAX:
670	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::maxnum,
671	AllowMinSizeF32: true, AllowF64: true);
672	case AMDGPULibFunc::EI_FMA:
673	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fma, AllowMinSizeF32: true,
674	AllowF64: true);
675	case AMDGPULibFunc::EI_MAD:
676	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fmuladd,
677	AllowMinSizeF32: true, AllowF64: true);
678	case AMDGPULibFunc::EI_FABS:
679	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fabs, AllowMinSizeF32: true,
680	AllowF64: true, AllowStrictFP: true);
681	case AMDGPULibFunc::EI_COPYSIGN:
682	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::copysign,
683	AllowMinSizeF32: true, AllowF64: true, AllowStrictFP: true);
684	case AMDGPULibFunc::EI_FLOOR:
685	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::floor, AllowMinSizeF32: true,
686	AllowF64: true);
687	case AMDGPULibFunc::EI_CEIL:
688	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::ceil, AllowMinSizeF32: true,
689	AllowF64: true);
690	case AMDGPULibFunc::EI_TRUNC:
691	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::trunc, AllowMinSizeF32: true,
692	AllowF64: true);
693	case AMDGPULibFunc::EI_RINT:
694	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::rint, AllowMinSizeF32: true,
695	AllowF64: true);
696	case AMDGPULibFunc::EI_ROUND:
697	return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::round, AllowMinSizeF32: true,
698	AllowF64: true);
699	case AMDGPULibFunc::EI_LDEXP: {
700	if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32: true, AllowF64: true))
701	return false;
702
703	Value *Arg1 = CI->getArgOperand(i: `1`);
704	if (VectorType *VecTy = dyn_cast<VectorType>(Val: CI->getType());
705	VecTy && !isa<VectorType>(Val: Arg1->getType())) {
706	Value *SplatArg1 = B.CreateVectorSplat(EC: VecTy->getElementCount(), V: Arg1);
707	CI->setArgOperand(i: `1`, v: SplatArg1);
708	}
709
710	CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
711	M: CI->getModule(), id: Intrinsic::ldexp,
712	Tys: {CI->getType(), CI->getArgOperand(i: `1`)->getType()}));
713	return true;
714	}
715	case AMDGPULibFunc::EI_POW:
716	case AMDGPULibFunc::EI_POW_FAST:
717	return tryOptimizePow(FPOp, B, FInfo);
718	case AMDGPULibFunc::EI_POWR:
719	case AMDGPULibFunc::EI_POWR_FAST: {
720	if (fold_pow(FPOp, B, FInfo))
721	return true;
722	if (!FMF.approxFunc())
723	return false;
724
725	if (FInfo.getId() == AMDGPULibFunc::EI_POWR && FMF.approxFunc() &&
726	getArgType(FInfo) == AMDGPULibFunc::F32) {
727	Module *M = Callee->getParent();
728	AMDGPULibFunc PowrFastInfo(AMDGPULibFunc::EI_POWR_FAST, FInfo);
729	if (FunctionCallee PowrFastFunc = getFunction(M, fInfo: PowrFastInfo)) {
730	CI->setCalledFunction(PowrFastFunc);
731	return true;
732	}
733	}
734
735	if (!shouldReplaceLibcallWithIntrinsic(CI))
736	return false;
737	return expandFastPow(FPOp, B, Kind: PowKind::PowR);
738	}
739	case AMDGPULibFunc::EI_POWN:
740	case AMDGPULibFunc::EI_POWN_FAST: {
741	if (fold_pow(FPOp, B, FInfo))
742	return true;
743	if (!FMF.approxFunc())
744	return false;
745
746	if (FInfo.getId() == AMDGPULibFunc::EI_POWN &&
747	getArgType(FInfo) == AMDGPULibFunc::F32) {
748	Module *M = Callee->getParent();
749	AMDGPULibFunc PownFastInfo(AMDGPULibFunc::EI_POWN_FAST, FInfo);
750	if (FunctionCallee PownFastFunc = getFunction(M, fInfo: PownFastInfo)) {
751	CI->setCalledFunction(PownFastFunc);
752	return true;
753	}
754	}
755
756	if (!shouldReplaceLibcallWithIntrinsic(CI))
757	return false;
758	return expandFastPow(FPOp, B, Kind: PowKind::PowN);
759	}
760	case AMDGPULibFunc::EI_ROOTN:
761	case AMDGPULibFunc::EI_ROOTN_FAST: {
762	if (fold_rootn(FPOp, B, FInfo))
763	return true;
764	if (!FMF.approxFunc())
765	return false;
766
767	if (getArgType(FInfo) == AMDGPULibFunc::F32) {
768	Module *M = Callee->getParent();
769	AMDGPULibFunc RootnFastInfo(AMDGPULibFunc::EI_ROOTN_FAST, FInfo);
770	if (FunctionCallee RootnFastFunc = getFunction(M, fInfo: RootnFastInfo)) {
771	CI->setCalledFunction(RootnFastFunc);
772	return true;
773	}
774	}
775
776	return expandFastPow(FPOp, B, Kind: PowKind::RootN);
777	}
778	case AMDGPULibFunc::EI_SQRT:
779	// TODO: Allow with strictfp + constrained intrinsic
780	return tryReplaceLibcallWithSimpleIntrinsic(
781	B, CI, IntrID: Intrinsic::sqrt, AllowMinSizeF32: true, AllowF64: true, /AllowStrictFP=/false);
782	case AMDGPULibFunc::EI_COS:
783	case AMDGPULibFunc::EI_SIN:
784	return fold_sincos(FPOp, B, FInfo);
785	default:
786	break;
787	}
788	} else {
789	// Specialized optimizations for each function call
790	switch (FInfo.getId()) {
791	case AMDGPULibFunc::EI_READ_PIPE_2:
792	case AMDGPULibFunc::EI_READ_PIPE_4:
793	case AMDGPULibFunc::EI_WRITE_PIPE_2:
794	case AMDGPULibFunc::EI_WRITE_PIPE_4:
795	return fold_read_write_pipe(CI, B, FInfo);
796	default:
797	break;
798	}
799	}
800
801	return false;
802	}
803
804	static Constant getConstantFloatVector(const* ArrayRef<APFloat> Values,
805	const Type *Ty) {
806	Type *ElemTy = Ty->getScalarType();
807	const fltSemantics &FltSem = ElemTy->getFltSemantics();
808
809	SmallVector<Constant *, `4`> ConstValues;
810	ConstValues.reserve(N: Values.size());
811	for (APFloat APF : Values) {
812	bool Unused;
813	APF.convert(ToSemantics: FltSem, RM: APFloat::rmNearestTiesToEven, losesInfo: &Unused);
814	ConstValues.push_back(Elt: ConstantFP::get(Ty: ElemTy, V: APF));
815	}
816	return ConstantVector::get(V: ConstValues);
817	}
818
819	bool AMDGPULibCalls::TDOFold(CallInst CI, const* FuncInfo &FInfo) {
820	// Table-Driven optimization
821	const TableRef tr = getOptTable(id: FInfo.getId());
822	if (tr.empty())
823	return false;
824
825	int const sz = (int)tr.size();
826	Value *opr0 = CI->getArgOperand(i: `0`);
827
828	int vecSize = getVecSize(FInfo);
829	if (vecSize > `1`) {
830	// Vector version
831	Constant *CV = dyn_cast<Constant>(Val: opr0);
832	if (CV && CV->getType()->isVectorTy()) {
833	SmallVector<APFloat, `4`> Values;
834	Values.reserve(N: vecSize);
835	for (int eltNo = `0`; eltNo < vecSize; ++eltNo) {
836	ConstantFP *eltval =
837	cast<ConstantFP>(Val: CV->getAggregateElement(Elt: (unsigned)eltNo));
838	auto MatchingRow = llvm::find_if(Range: tr, P: [eltval](const TableEntry &entry) {
839	return eltval->isExactlyValue(V: entry.input);
840	});
841	if (MatchingRow == tr.end())
842	return false;
843	Values.push_back(Elt: APFloat (MatchingRow->result));
844	}
845	Constant *NewValues = getConstantFloatVector(Values, Ty: CI->getType());
846	LLVM_DEBUG(errs() << "AMDIC: " << CI << " ---> " << NewValues << "\n");
847	replaceCall(I: CI, With: NewValues);
848	return true;
849	}
850	} else {
851	// Scalar version
852	if (ConstantFP *CF = dyn_cast<ConstantFP>(Val: opr0)) {
853	for (int i = `0`; i < sz; ++i) {
854	if (CF->isExactlyValue(V: tr [i].input)) {
855	Value *nval = ConstantFP::get(Ty: CF->getType(), V: tr [i].result);
856	LLVM_DEBUG(errs() << "AMDIC: " << CI << " ---> " << nval << "\n");
857	replaceCall(I: CI, With: nval);
858	return true;
859	}
860	}
861	}
862	}
863
864	return false;
865	}
866
namespace llvm {
/// Base-2 logarithm of \p V.
///
/// std::log2 is guaranteed by <cmath> since C++11, so no feature-test-macro
/// guard or log(V)/ln(2) fallback is needed.
static double log2(double V) {
  return std::log2(V);
}
} // namespace llvm
876
877	bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
878	const FuncInfo &FInfo) {
879	assert((FInfo.getId() == AMDGPULibFunc::EI_POW \|\|
880	FInfo.getId() == AMDGPULibFunc::EI_POW_FAST \|\|
881	FInfo.getId() == AMDGPULibFunc::EI_POWR \|\|
882	FInfo.getId() == AMDGPULibFunc::EI_POWR_FAST \|\|
883	FInfo.getId() == AMDGPULibFunc::EI_POWN \|\|
884	FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) &&
885	"fold_pow: encounter a wrong function call");
886
887	Module *M = B.GetInsertBlock()->getModule();
888	Type *eltType = FPOp->getType()->getScalarType();
889	Value *opr0 = FPOp->getOperand(i: `0`);
890	Value *opr1 = FPOp->getOperand(i: `1`);
891
892	const APFloat CF = nullptr*;
893	const APInt CINT = nullptr*;
894	if (!match(V: opr1, P: m_APFloatAllowPoison(Res&: CF)))
895	match(V: opr1, P: m_APIntAllowPoison(Res&: CINT));
896
897	// 0x1111111 means that we don't do anything for this call.
898	int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : `0x1111111`);
899
900	if ((CF && CF->isZero()) \|\| (CINT && ci_opr1 == `0`)) {
901	// pow/powr/pown(x, 0) == 1
902	LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
903	Constant *cnval = ConstantFP::get(Ty: eltType, V: `1.0`);
904	if (getVecSize(FInfo) > `1`) {
905	cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
906	}
907	replaceCall(I: FPOp, With: cnval);
908	return true;
909	}
910	if ((CF && CF->isExactlyValue(V: `1.0`)) \|\| (CINT && ci_opr1 == `1`)) {
911	// pow/powr/pown(x, 1.0) = x
912	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> " << opr0 << "\n");
913	replaceCall(I: FPOp, With: opr0);
914	return true;
915	}
916	if ((CF && CF->isExactlyValue(V: `2.0`)) \|\| (CINT && ci_opr1 == `2`)) {
917	// pow/powr/pown(x, 2.0) = xx*
918	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> " << opr0 << " * "
919	<< *opr0 << "\n");
920	Value *nval = B.CreateFMul(L: opr0, R: opr0, Name: "__pow2");
921	replaceCall(I: FPOp, With: nval);
922	return true;
923	}
924	if ((CF && CF->isExactlyValue(V: -`1.0`)) \|\| (CINT && ci_opr1 == -`1`)) {
925	// pow/powr/pown(x, -1.0) = 1.0/x
926	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> 1 / " << opr0 << "\n");
927	Constant *cnval = ConstantFP::get(Ty: eltType, V: `1.0`);
928	if (getVecSize(FInfo) > `1`) {
929	cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
930	}
931	Value *nval = B.CreateFDiv(L: cnval, R: opr0, Name: "__powrecip");
932	replaceCall(I: FPOp, With: nval);
933	return true;
934	}
935
936	if (CF && (CF->isExactlyValue(V: `0.5`) \|\| CF->isExactlyValue(V: -`0.5`))) {
937	// pow[r](x, [-]0.5) = sqrt(x)
938	bool issqrt = CF->isExactlyValue(V: `0.5`);
939	if (FunctionCallee FPExpr =
940	getFunction(M, fInfo: AMDGPULibFunc (issqrt ? AMDGPULibFunc::EI_SQRT
941	: AMDGPULibFunc::EI_RSQRT,
942	FInfo))) {
943	LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
944	<< `'('` << *opr0 << ")\n");
945	Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: issqrt ? "__pow2sqrt"
946	: "__pow2rsqrt");
947	replaceCall(I: FPOp, With: nval);
948	return true;
949	}
950	}
951
952	if (!isUnsafeFiniteOnlyMath(FPOp))
953	return false;
954
955	// Unsafe Math optimization
956
957	// Remember that ci_opr1 is set if opr1 is integral
958	if (CF) {
959	double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
960	? (double)CF->convertToFloat()
961	: CF->convertToDouble();
962	int ival = (int)dval;
963	if ((double)ival == dval) {
964	ci_opr1 = ival;
965	} else
966	ci_opr1 = `0x11111111`;
967	}
968
969	// pow/powr/pown(x, c) = [1/](xx..x); where
970	// trunc(c) == c && the number of x == c && \|c\| <= 12
971	unsigned abs_opr1 = (ci_opr1 < `0`) ? -ci_opr1 : ci_opr1;
972	if (abs_opr1 <= `12`) {
973	Constant *cnval;
974	Value *nval;
975	if (abs_opr1 == `0`) {
976	cnval = ConstantFP::get(Ty: eltType, V: `1.0`);
977	if (getVecSize(FInfo) > `1`) {
978	cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
979	}
980	nval = cnval;
981	} else {
982	Value valx2 = nullptr*;
983	nval = nullptr;
984	while (abs_opr1 > `0`) {
985	valx2 = valx2 ? B.CreateFMul(L: valx2, R: valx2, Name: "__powx2") : opr0;
986	if (abs_opr1 & `1`) {
987	nval = nval ? B.CreateFMul(L: nval, R: valx2, Name: "__powprod") : valx2;
988	}
989	abs_opr1 >>= `1`;
990	}
991	}
992
993	if (ci_opr1 < `0`) {
994	cnval = ConstantFP::get(Ty: eltType, V: `1.0`);
995	if (getVecSize(FInfo) > `1`) {
996	cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
997	}
998	nval = B.CreateFDiv(L: cnval, R: nval, Name: "__1powprod");
999	}
1000	LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
1001	<< ((ci_opr1 < `0`) ? "1/prod(" : "prod(") << *opr0
1002	<< ")\n");
1003	replaceCall(I: FPOp, With: nval);
1004	return true;
1005	}
1006
1007	// If we should use the generic intrinsic instead of emitting a libcall
1008	const bool ShouldUseIntrinsic = eltType->isFloatTy() \|\| eltType->isHalfTy();
1009
1010	// powr ---> exp2(y log2(x))*
1011	// pown/pow ---> powr(fabs(x), y) \| (x & ((int)y << 31))
1012	FunctionCallee ExpExpr;
1013	if (ShouldUseIntrinsic)
1014	ExpExpr = Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::exp2,
1015	Tys: {FPOp->getType()});
1016	else {
1017	ExpExpr = getFunction(M, fInfo: AMDGPULibFunc (AMDGPULibFunc::EI_EXP2, FInfo));
1018	if (!ExpExpr)
1019	return false;
1020	}
1021
1022	bool needlog = false;
1023	bool needabs = false;
1024	bool needcopysign = false;
1025	Constant cnval = nullptr*;
1026	if (getVecSize(FInfo) == `1`) {
1027	CF = nullptr;
1028	match(V: opr0, P: m_APFloatAllowPoison(Res&: CF));
1029
1030	if (CF) {
1031	double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
1032	? (double)CF->convertToFloat()
1033	: CF->convertToDouble();
1034
1035	V = log2(V: std::abs(x: V));
1036	cnval = ConstantFP::get(Ty: eltType, V);
1037	needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR &&
1038	FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST) &&
1039	CF->isNegative();
1040	} else {
1041	needlog = true;
1042	needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
1043	FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
1044	}
1045	} else {
1046	ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(Val: opr0);
1047
1048	if (!CDV) {
1049	needlog = true;
1050	needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
1051	FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
1052	} else {
1053	assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
1054	"Wrong vector size detected");
1055
1056	SmallVector<double, `0`> DVal;
1057	for (int i=`0`; i < getVecSize(FInfo); ++i) {
1058	double V = CDV->getElementAsAPFloat(i).convertToDouble();
1059	if (V < `0.0`) needcopysign = true;
1060	V = log2(V: std::abs(x: V));
1061	DVal.push_back(Elt: V);
1062	}
1063	if (getArgType(FInfo) == AMDGPULibFunc::F32) {
1064	SmallVector<float, `0`> FVal;
1065	for (double D : DVal)
1066	FVal.push_back(Elt: (float)D);
1067	ArrayRef<float> tmp(FVal);
1068	cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
1069	} else {
1070	ArrayRef<double> tmp(DVal);
1071	cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
1072	}
1073	}
1074	}
1075
1076	if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW \|\|
1077	FInfo.getId() == AMDGPULibFunc::EI_POW_FAST)) {
1078	// We cannot handle corner cases for a general pow() function, give up
1079	// unless y is a constant integral value. Then proceed as if it were pown.
1080	if (!isKnownIntegral(V: opr1, SQ: SQ.getWithInstruction(I: cast<Instruction>(Val: FPOp)),
1081	FMF: FPOp->getFastMathFlags()))
1082	return false;
1083	}
1084
1085	Value *nval;
1086	if (needabs) {
1087	nval = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: opr0, FMFSource: nullptr, Name: "__fabs");
1088	} else {
1089	nval = cnval ? cnval : opr0;
1090	}
1091	if (needlog) {
1092	FunctionCallee LogExpr;
1093	if (ShouldUseIntrinsic) {
1094	LogExpr = Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::log2,
1095	Tys: {FPOp->getType()});
1096	} else {
1097	LogExpr = getFunction(M, fInfo: AMDGPULibFunc (AMDGPULibFunc::EI_LOG2, FInfo));
1098	if (!LogExpr)
1099	return false;
1100	}
1101
1102	nval = CreateCallEx(B,Callee: LogExpr, Arg: nval, Name: "__log2");
1103	}
1104
1105	if (FInfo.getId() == AMDGPULibFunc::EI_POWN \|\|
1106	FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) {
1107	// convert int(32) to fp(f32 or f64)
1108	opr1 = B.CreateSIToFP(V: opr1, DestTy: nval->getType(), Name: "pownI2F");
1109	}
1110	nval = B.CreateFMul(L: opr1, R: nval, Name: "__ylogx");
1111
1112	CallInst *Exp2Call = CreateCallEx(B, Callee: ExpExpr, Arg: nval, Name: "__exp2");
1113
1114	// TODO: Generalized fpclass logic for pow
1115	FPClassTest KnownNot = FPClassTest::fcNegative;
1116	if (FPOp->hasNoNaNs())
1117	KnownNot \|= FPClassTest::fcNan;
1118
1119	Exp2Call->addRetAttr(
1120	Attr: Attribute::getWithNoFPClass(Context&: Exp2Call->getContext(), Mask: KnownNot));
1121	nval = Exp2Call;
1122
1123	if (needcopysign) {
1124	Type* nTyS = B.getIntNTy(N: eltType->getPrimitiveSizeInBits());
1125	Type *nTy = FPOp->getType()->getWithNewType(EltTy: nTyS);
1126	Value *opr_n = FPOp->getOperand(i: `1`);
1127	if (opr_n->getType()->getScalarType()->isIntegerTy())
1128	opr_n = B.CreateZExtOrTrunc(V: opr_n, DestTy: nTy, Name: "__ytou");
1129	else
1130	opr_n = B.CreateFPToSI(V: opr1, DestTy: nTy, Name: "__ytou");
1131
1132	unsigned size = nTy->getScalarSizeInBits();
1133	Value *sign = B.CreateShl(LHS: opr_n, RHS: size-`1`, Name: "__yeven");
1134	sign = B.CreateAnd(LHS: B.CreateBitCast(V: opr0, DestTy: nTy), RHS: sign, Name: "__pow_sign");
1135
1136	nval = B.CreateCopySign(LHS: nval, RHS: B.CreateBitCast(V: sign, DestTy: nval->getType()),
1137	FMFSource: nullptr, Name: "__pow_sign");
1138	}
1139
1140	LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
1141	<< "exp2(" << opr1 << " log2(" << *opr0 << "))\n");
1142	replaceCall(I: FPOp, With: nval);
1143
1144	return true;
1145	}
1146
1147	bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
1148	const FuncInfo &FInfo) {
1149	Value *opr0 = FPOp->getOperand(i: `0`);
1150	Value *opr1 = FPOp->getOperand(i: `1`);
1151
1152	const APInt CINT = nullptr*;
1153	if (!match(V: opr1, P: m_APIntAllowPoison(Res&: CINT)))
1154	return false;
1155
1156	Function *Parent = B.GetInsertBlock()->getParent();
1157
1158	int ci_opr1 = (int)CINT->getSExtValue();
1159	if (ci_opr1 == `1` && !Parent->hasFnAttribute(Kind: Attribute::StrictFP)) {
1160	// rootn(x, 1) = x
1161	//
1162	// TODO: Insert constrained canonicalize for strictfp case.
1163	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> " << opr0 << `'\n'`);
1164	replaceCall(I: FPOp, With: opr0);
1165	return true;
1166	}
1167
1168	Module *M = B.GetInsertBlock()->getModule();
1169
1170	CallInst *CI = cast<CallInst>(Val: FPOp);
1171	if (ci_opr1 == `2` &&
1172	shouldReplaceLibcallWithIntrinsic(CI,
1173	/AllowMinSizeF32=/true,
1174	/AllowF64=/true)) {
1175	// rootn(x, 2) = sqrt(x)
1176	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> sqrt(" << opr0 << ")\n");
1177
1178	CallInst *NewCall = B.CreateUnaryIntrinsic(ID: Intrinsic::sqrt, V: opr0, FMFSource: CI);
1179	NewCall->takeName(V: CI);
1180
1181	// OpenCL rootn has a looser ulp of 2 requirement than sqrt, so add some
1182	// metadata.
1183	MDBuilder MDHelper(M->getContext());
1184	MDNode *FPMD = MDHelper.createFPMath(Accuracy: std::max(a: FPOp->getFPAccuracy(), b: `2.0f`));
1185	NewCall->setMetadata(KindID: LLVMContext::MD_fpmath, Node: FPMD);
1186
1187	replaceCall(I: CI, With: NewCall);
1188	return true;
1189	}
1190
1191	if (ci_opr1 == `3`) { // rootn(x, 3) = cbrt(x)
1192	if (FunctionCallee FPExpr =
1193	getFunction(M, fInfo: AMDGPULibFunc (AMDGPULibFunc::EI_CBRT, FInfo))) {
1194	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> cbrt(" << opr0
1195	<< ")\n");
1196	Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: "__rootn2cbrt");
1197	replaceCall(I: FPOp, With: nval);
1198	return true;
1199	}
1200	} else if (ci_opr1 == -`1`) { // rootn(x, -1) = 1.0/x
1201	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> 1.0 / " << opr0 << "\n");
1202	Value *nval = B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: `1.0`),
1203	R: opr0,
1204	Name: "__rootn2div");
1205	replaceCall(I: FPOp, With: nval);
1206	return true;
1207	}
1208
1209	if (ci_opr1 == -`2` &&
1210	shouldReplaceLibcallWithIntrinsic(CI,
1211	/AllowMinSizeF32=/true,
1212	/AllowF64=/true)) {
1213	// rootn(x, -2) = rsqrt(x)
1214
1215	// The original rootn had looser ulp requirements than the resultant sqrt
1216	// and fdiv.
1217	MDBuilder MDHelper(M->getContext());
1218	MDNode *FPMD = MDHelper.createFPMath(Accuracy: std::max(a: FPOp->getFPAccuracy(), b: `2.0f`));
1219
1220	// TODO: Could handle strictfp but need to fix strict sqrt emission
1221	FastMathFlags FMF = FPOp->getFastMathFlags();
1222	FMF.setAllowContract(true);
1223
1224	CallInst *Sqrt = B.CreateUnaryIntrinsic(ID: Intrinsic::sqrt, V: opr0, FMFSource: CI);
1225	Instruction *RSqrt = cast<Instruction>(
1226	Val: B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: `1.0`), R: Sqrt));
1227	Sqrt->setFastMathFlags(FMF);
1228	RSqrt->setFastMathFlags(FMF);
1229	RSqrt->setMetadata(KindID: LLVMContext::MD_fpmath, Node: FPMD);
1230
1231	LLVM_DEBUG(errs() << "AMDIC: " << FPOp << " ---> rsqrt(" << opr0
1232	<< ")\n");
1233	replaceCall(I: CI, With: RSqrt);
1234	return true;
1235	}
1236
1237	return false;
1238	}
1239
1240	// is_integer(y) => trunc(y) == y
1241	static Value emitIsInteger(IRBuilder<> &B, Value Y) {
1242	Value *TruncY = B.CreateUnaryIntrinsic(ID: Intrinsic::trunc, V: Y);
1243	return B.CreateFCmpOEQ(LHS: TruncY, RHS: Y);
1244	}
1245
1246	static Value emitIsEvenInteger(IRBuilder<> &B, Value Y) {
1247	// Even integers are still integers after division by 2.
1248	auto *HalfY = B.CreateFMul(L: Y, R: ConstantFP::get(Ty: Y->getType(), V: `0.5`));
1249	return emitIsInteger(B, Y: HalfY);
1250	}
1251
1252	// is_odd_integer(y) => is_integer(y) && !is_even_integer(y)
1253	static Value emitIsOddInteger(IRBuilder<> &B, Value Y) {
1254	Value *IsIntY = emitIsInteger(B, Y);
1255	Value *IsEvenY = emitIsEvenInteger(B, Y);
1256	Value *NotEvenY = B.CreateNot(V: IsEvenY);
1257	return B.CreateAnd(LHS: IsIntY, RHS: NotEvenY);
1258	}
1259
1260	// isinf(val) => fabs(val) == +inf
1261	static Value emitIsInf(IRBuilder<> &B, Value val) {
1262	auto *fabsVal = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: val);
1263	return B.CreateFCmpOEQ(LHS: fabsVal, RHS: ConstantFP::getInfinity(Ty: val->getType()));
1264	}
1265
1266	// y log2(fabs(x))*
1267	static Value emitFastExpYLnx(IRBuilder<> &B, Value X, Value *Y) {
1268	Value *AbsX = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: X);
1269	Value *LogAbsX = B.CreateUnaryIntrinsic(ID: Intrinsic::log2, V: AbsX);
1270	Value *YTimesLogX = B.CreateFMul(L: Y, R: LogAbsX);
1271	return B.CreateUnaryIntrinsic(ID: Intrinsic::exp2, V: YTimesLogX);
1272	}
1273
1274	/// Emit special case management epilog code for fast pow, powr, pown, and rootn
1275	/// expansions. \p x and \p y should be the arguments to the library call
1276	/// (possibly with some values clamped). \p expylnx should be the result to use
1277	/// in normal circumstances.
1278	static Value emitPowFixup(IRBuilder<> &B, Value X, Value Y, Value ExpYLnX,
1279	PowKind Kind) {
1280	Constant *Zero = ConstantFP::getZero(Ty: X->getType());
1281	Constant *One = ConstantFP::get(Ty: X->getType(), V: `1.0`);
1282	Constant *QNaN = ConstantFP::getQNaN(Ty: X->getType());
1283	Constant *PInf = ConstantFP::getInfinity(Ty: X->getType());
1284
1285	switch (Kind) {
1286	case PowKind::Pow: {
1287	// is_odd_integer(y)
1288	Value *IsOddY = emitIsOddInteger(B, Y);
1289
1290	// ret = copysign(expylnx, is_odd_y ? x : 1.0f)
1291	Value *SelSign = B.CreateSelect(C: IsOddY, True: X, False: One);
1292	Value *Ret = B.CreateCopySign(LHS: ExpYLnX, RHS: SelSign);
1293
1294	// if (x < 0 && !is_integer(y)) ret = QNAN
1295	Value *IsIntY = emitIsInteger(B, Y);
1296	Value *condNegX = B.CreateFCmpOLT(LHS: X, RHS: Zero);
1297	Value *condNotIntY = B.CreateNot(V: IsIntY);
1298	Value *condNaN = B.CreateAnd(LHS: condNegX, RHS: condNotIntY);
1299	Ret = B.CreateSelect(C: condNaN, True: QNaN, False: Ret);
1300
1301	// if (isinf(ay)) { ... }
1302
1303	// FIXME: Missing backend optimization to save on materialization cost of
1304	// mixed sign constant infinities.
1305	Value *YIsInf = emitIsInf(B, val: Y);
1306
1307	Value *AY = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: Y);
1308	Value *YIsNegInf = B.CreateFCmpUNE(LHS: Y, RHS: AY);
1309
1310	Value *AX = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: X);
1311	Value *AxEqOne = B.CreateFCmpOEQ(LHS: AX, RHS: One);
1312	Value *AxLtOne = B.CreateFCmpOLT(LHS: AX, RHS: One);
1313	Value *XorCond = B.CreateXor(LHS: AxLtOne, RHS: YIsNegInf);
1314	Value *SelInf =
1315	B.CreateSelect(C: AxEqOne, True: AX, False: B.CreateSelect(C: XorCond, True: Zero, False: AY));
1316	Ret = B.CreateSelect(C: YIsInf, True: SelInf, False: Ret);
1317
1318	// if (isinf(ax) \|\| x == 0.0f) { ... }
1319	Value *XIsInf = emitIsInf(B, val: X);
1320	Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
1321	Value *AxInfOrZero = B.CreateOr(LHS: XIsInf, RHS: XEqZero);
1322	Value *YLtZero = B.CreateFCmpOLT(LHS: Y, RHS: Zero);
1323	Value *XorZeroInf = B.CreateXor(LHS: XEqZero, RHS: YLtZero);
1324	Value *SelVal = B.CreateSelect(C: XorZeroInf, True: Zero, False: PInf);
1325	Value *SelSign2 = B.CreateSelect(C: IsOddY, True: X, False: Zero);
1326	Value *Copysign = B.CreateCopySign(LHS: SelVal, RHS: SelSign2);
1327	Ret = B.CreateSelect(C: AxInfOrZero, True: Copysign, False: Ret);
1328
1329	// if (isunordered(x, y)) ret = QNAN
1330	Value *isUnordered = B.CreateFCmpUNO(LHS: X, RHS: Y);
1331	return B.CreateSelect(C: isUnordered, True: QNaN, False: Ret);
1332	}
1333	case PowKind::PowR: {
1334	Value *YIsNeg = B.CreateFCmpOLT(LHS: Y, RHS: Zero);
1335	Value *IZ = B.CreateSelect(C: YIsNeg, True: PInf, False: Zero);
1336	Value *ZI = B.CreateSelect(C: YIsNeg, True: Zero, False: PInf);
1337
1338	Value *YEqZero = B.CreateFCmpOEQ(LHS: Y, RHS: Zero);
1339	Value *SelZeroCase = B.CreateSelect(C: YEqZero, True: QNaN, False: IZ);
1340	Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
1341	Value *Ret = B.CreateSelect(C: XEqZero, True: SelZeroCase, False: ExpYLnX);
1342
1343	Value *XEqInf = B.CreateFCmpOEQ(LHS: X, RHS: PInf);
1344	Value *YNeZero = B.CreateFCmpUNE(LHS: Y, RHS: Zero);
1345	Value *CondInfCase = B.CreateAnd(LHS: XEqInf, RHS: YNeZero);
1346	Ret = B.CreateSelect(C: CondInfCase, True: ZI, False: Ret);
1347
1348	Value *IsInfY = emitIsInf(B, val: Y);
1349	Value *XNeOne = B.CreateFCmpUNE(LHS: X, RHS: One);
1350	Value *CondInfY = B.CreateAnd(LHS: IsInfY, RHS: XNeOne);
1351	Value *XLtOne = B.CreateFCmpOLT(LHS: X, RHS: One);
1352	Value *SelInfYCase = B.CreateSelect(C: XLtOne, True: IZ, False: ZI);
1353	Ret = B.CreateSelect(C: CondInfY, True: SelInfYCase, False: Ret);
1354
1355	Value *IsUnordered = B.CreateFCmpUNO(LHS: X, RHS: Y);
1356	return B.CreateSelect(C: IsUnordered, True: QNaN, False: Ret);
1357	}
1358	case PowKind::PowN: {
1359	Constant *ZeroI = ConstantInt::get(Ty: Y->getType(), V: `0`);
1360
1361	// is_odd_y = (ny & 1) != 0
1362	Value *OneI = ConstantInt::get(Ty: Y->getType(), V: `1`);
1363	Value *YAnd1 = B.CreateAnd(LHS: Y, RHS: OneI);
1364	Value *IsOddY = B.CreateICmpNE(LHS: YAnd1, RHS: ZeroI);
1365
1366	// ret = copysign(expylnx, is_odd_y ? x : 1.0f)
1367	Value *SelSign = B.CreateSelect(C: IsOddY, True: X, False: One);
1368	Value *Ret = B.CreateCopySign(LHS: ExpYLnX, RHS: SelSign);
1369
1370	// if (isinf(x) \|\| x == 0.0f)
1371	Value *FabsX = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: X);
1372	Value *XIsInf = B.CreateFCmpOEQ(LHS: FabsX, RHS: PInf);
1373	Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
1374	Value *InfOrZero = B.CreateOr(LHS: XIsInf, RHS: XEqZero);
1375
1376	// (x == 0.0f) ^ (ny < 0) ? 0.0f : +inf
1377	Value *YLtZero = B.CreateICmpSLT(LHS: Y, RHS: ZeroI);
1378	Value *XorZeroInf = B.CreateXor(LHS: XEqZero, RHS: YLtZero);
1379	Value *SelVal = B.CreateSelect(C: XorZeroInf, True: Zero, False: PInf);
1380
1381	// copysign(selVal, is_odd_y ? x : 0.0f)
1382	Value *SelSign2 = B.CreateSelect(C: IsOddY, True: X, False: Zero);
1383	Value *Copysign = B.CreateCopySign(LHS: SelVal, RHS: SelSign2);
1384
1385	return B.CreateSelect(C: InfOrZero, True: Copysign, False: Ret);
1386	}
1387	case PowKind::RootN: {
1388	Constant *ZeroI = ConstantInt::get(Ty: Y->getType(), V: `0`);
1389
1390	// is_odd_y = (ny & 1) != 0
1391	Value *YAnd1 = B.CreateAnd(LHS: Y, RHS: ConstantInt::get(Ty: Y->getType(), V: `1`));
1392	Value *IsOddY = B.CreateICmpNE(LHS: YAnd1, RHS: ZeroI);
1393
1394	// ret = copysign(expylnx, is_odd_y ? x : 1.0f)
1395	Value *SelSign = B.CreateSelect(C: IsOddY, True: X, False: One);
1396	Value *Ret = B.CreateCopySign(LHS: ExpYLnX, RHS: SelSign);
1397
1398	// if (isinf(x) \|\| x == 0.0f)
1399	Value *FabsX = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: X);
1400	Value *IsInfX = B.CreateFCmpOEQ(LHS: FabsX, RHS: PInf);
1401	Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
1402	Value *CondInfOrZero = B.CreateOr(LHS: IsInfX, RHS: XEqZero);
1403
1404	// (x == 0.0f) ^ (ny < 0) ? 0.0f : +inf
1405	Value *YLtZero = B.CreateICmpSLT(LHS: Y, RHS: ZeroI);
1406	Value *XorZeroInf = B.CreateXor(LHS: XEqZero, RHS: YLtZero);
1407	Value *SelVal = B.CreateSelect(C: XorZeroInf, True: Zero, False: PInf);
1408
1409	// copysign(selVal, is_odd_y ? x : 0.0f)
1410	Value *SelSign2 = B.CreateSelect(C: IsOddY, True: X, False: Zero);
1411	Value *Copysign = B.CreateCopySign(LHS: SelVal, RHS: SelSign2);
1412
1413	Ret = B.CreateSelect(C: CondInfOrZero, True: Copysign, False: Ret);
1414
1415	// if ((x < 0.0f && !is_odd_y) \|\| ny == 0) ret = QNAN
1416	Value *XIsNeg = B.CreateFCmpOLT(LHS: X, RHS: Zero);
1417	Value *NotOddY = B.CreateNot(V: IsOddY);
1418	Value *CondNegAndNotOdd = B.CreateAnd(LHS: XIsNeg, RHS: NotOddY);
1419	Value *YEqZero = B.CreateICmpEQ(LHS: Y, RHS: ZeroI);
1420	Value *CondBad = B.CreateOr(LHS: CondNegAndNotOdd, RHS: YEqZero);
1421	return B.CreateSelect(C: CondBad, True: QNaN, False: Ret);
1422	}
1423	}
1424
1425	llvm_unreachable("covered switch");
1426	}
1427
1428	// TODO: Move the fold_pow folding to sqrt/fdiv here
1429	bool AMDGPULibCalls::expandFastPow(FPMathOperator *FPOp, IRBuilder<> &B,
1430	PowKind Kind) {
1431	Type *Ty = FPOp->getType();
1432
1433	// There's currently no reason to do this for half. The correct path is
1434	// promote to float and use the fast float expansion.
1435	//
1436	// TODO: We could move this expansion to lowering to get half pow to work.
1437	if (!Ty->getScalarType()->isFloatTy())
1438	return false;
1439
1440	// TODO: Verify optimization for double and bfloat.
1441	Value *X = FPOp->getOperand(i: `0`);
1442	Value *Y = FPOp->getOperand(i: `1`);
1443
1444	switch (Kind) {
1445	case PowKind::Pow: {
1446	Constant *One = ConstantFP::get(Ty: X->getType(), V: `1.0`);
1447
1448	// if (x == 1.0f) y = 1.0f;
1449	Value *XEqOne = B.CreateFCmpOEQ(LHS: X, RHS: One);
1450	Y = B.CreateSelect(C: XEqOne, True: One, False: Y);
1451
1452	// if (y == 0.0f) x = 1.0f;
1453	Value *YEqZero = B.CreateFCmpOEQ(LHS: Y, RHS: ConstantFP::getZero(Ty: X->getType()));
1454	X = B.CreateSelect(C: YEqZero, True: One, False: X);
1455
1456	Value *ExpYLnX = emitFastExpYLnx(B, X, Y);
1457	Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
1458	replaceCall(I: FPOp, With: Fixed);
1459	return true;
1460	}
1461	case PowKind::PowR: {
1462	Value *NegX = B.CreateFCmpOLT(LHS: X, RHS: ConstantFP::getZero(Ty: X->getType()));
1463	X = B.CreateSelect(C: NegX, True: ConstantFP::getQNaN(Ty: X->getType()), False: X);
1464
1465	Value *ExpYLnX = emitFastExpYLnx(B, X, Y);
1466	Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
1467	replaceCall(I: FPOp, With: Fixed);
1468	return true;
1469	}
1470	case PowKind::PowN: {
1471	// ny == 0
1472	Value *YEqZero = B.CreateICmpEQ(LHS: Y, RHS: ConstantInt::get(Ty: Y->getType(), V: `0`));
1473
1474	// x = (ny == 0 ? 1.0f : x)
1475	X = B.CreateSelect(C: YEqZero, True: ConstantFP::get(Ty: X->getType(), V: `1.0`), False: X);
1476
1477	Value *CastY = B.CreateSIToFP(V: Y, DestTy: X->getType());
1478	Value *ExpYLnX = emitFastExpYLnx(B, X, Y: CastY);
1479	Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
1480	replaceCall(I: FPOp, With: Fixed);
1481	return true;
1482	}
1483	case PowKind::RootN: {
1484	Value *CastY = B.CreateSIToFP(V: Y, DestTy: X->getType());
1485
1486	// This is afn anyway, so we will turn into rcp.
1487	Value *RcpY = B.CreateFDiv(L: ConstantFP::get(Ty: X->getType(), V: `1.0`), R: CastY);
1488
1489	Value *ExpYLnX = emitFastExpYLnx(B, X, Y: RcpY);
1490	Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
1491	replaceCall(I: FPOp, With: Fixed);
1492	return true;
1493	}
1494	}
1495	llvm_unreachable("Unhandled PowKind enum");
1496	}
1497
1498	bool AMDGPULibCalls::tryOptimizePow(FPMathOperator *FPOp, IRBuilder<> &B,
1499	const FuncInfo &FInfo) {
1500	FastMathFlags FMF = FPOp->getFastMathFlags();
1501	CallInst *Call = cast<CallInst>(Val: FPOp);
1502	Module *M = Call->getModule();
1503
1504	FuncInfo PowrInfo;
1505	AMDGPULibFunc::EFuncId FastPowrFuncId =
1506	FMF.approxFunc() \|\| FInfo.getId() == AMDGPULibFunc::EI_POW_FAST
1507	? AMDGPULibFunc::EI_POWR_FAST
1508	: AMDGPULibFunc::EI_NONE;
1509	FunctionCallee PowrFunc = getFloatFastVariant(
1510	M, fInfo: FInfo, newInfo&: PowrInfo, NewFunc: AMDGPULibFunc::EI_POWR, FastVariant: FastPowrFuncId);
1511
1512	// TODO: Prefer fast pown to fast powr, but slow powr to slow pown.
1513
1514	// pow(x, y) -> powr(x, y) for x >= -0.0
1515	// TODO: Account for flags on current call
1516	if (PowrFunc && cannotBeOrderedLessThanZero(V: FPOp->getOperand(i: `0`),
1517	SQ: SQ.getWithInstruction(I: Call))) {
1518	Call->setCalledFunction(PowrFunc);
1519	return fold_pow(FPOp, B, FInfo: PowrInfo) \|\| true;
1520	}
1521
1522	// pow(x, y) -> pown(x, y) for known integral y
1523	if (isKnownIntegral(V: FPOp->getOperand(i: `1`), SQ: SQ.getWithInstruction(I: Call),
1524	FMF: FPOp->getFastMathFlags())) {
1525	FunctionType *PownType = getPownType(FT: Call->getFunctionType());
1526
1527	FuncInfo PownInfo;
1528	AMDGPULibFunc::EFuncId FastPownFuncId =
1529	FMF.approxFunc() \|\| FInfo.getId() == AMDGPULibFunc::EI_POW_FAST
1530	? AMDGPULibFunc::EI_POWN_FAST
1531	: AMDGPULibFunc::EI_NONE;
1532	FunctionCallee PownFunc = getFloatFastVariant(
1533	M, fInfo: FInfo, newInfo&: PownInfo, NewFunc: AMDGPULibFunc::EI_POWN, FastVariant: FastPownFuncId);
1534
1535	if (PownFunc) {
1536	// TODO: If the incoming integral value is an sitofp/uitofp, it won't
1537	// fold out without a known range. We can probably take the source
1538	// value directly.
1539	Value *CastedArg =
1540	B.CreateFPToSI(V: FPOp->getOperand(i: `1`), DestTy: PownType->getParamType(i: `1`));
1541	// Have to drop any nofpclass attributes on the original call site.
1542	Call->removeParamAttrs(
1543	ArgNo: `1`, AttrsToRemove: AttributeFuncs::typeIncompatible(Ty: CastedArg->getType(),
1544	AS: Call->getParamAttributes(ArgNo: `1`)));
1545	Call->setCalledFunction(PownFunc);
1546	Call->setArgOperand(i: `1`, v: CastedArg);
1547	return fold_pow(FPOp, B, FInfo: PownInfo) \|\| true;
1548	}
1549	}
1550
1551	if (fold_pow(FPOp, B, FInfo))
1552	return true;
1553
1554	if (!FMF.approxFunc())
1555	return false;
1556
1557	if (FInfo.getId() == AMDGPULibFunc::EI_POW && FMF.approxFunc() &&
1558	getArgType(FInfo) == AMDGPULibFunc::F32) {
1559	AMDGPULibFunc PowFastInfo(AMDGPULibFunc::EI_POW_FAST, FInfo);
1560	if (FunctionCallee PowFastFunc = getFunction(M, fInfo: PowFastInfo)) {
1561	Call->setCalledFunction(PowFastFunc);
1562	return fold_pow(FPOp, B, FInfo: PowFastInfo) \|\| true;
1563	}
1564	}
1565
1566	return expandFastPow(FPOp, B, Kind: PowKind::Pow);
1567	}
1568
1569	// Get a scalar native builtin single argument FP function
1570	FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
1571	const FuncInfo &FInfo) {
1572	if (getArgType(FInfo) == AMDGPULibFunc::F64 \|\| !HasNative(id: FInfo.getId()))
1573	return nullptr;
1574	FuncInfo nf = FInfo;
1575	nf.setPrefix(AMDGPULibFunc::NATIVE);
1576	return getFunction(M, fInfo: nf);
1577	}
1578
1579	// Some library calls are just wrappers around llvm intrinsics, but compiled
1580	// conservatively. Preserve the flags from the original call site by
1581	// substituting them with direct calls with all the flags.
1582	bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
1583	bool AllowMinSizeF32,
1584	bool AllowF64,
1585	bool AllowStrictFP) {
1586	Type *FltTy = CI->getType()->getScalarType();
1587	const bool IsF32 = FltTy->isFloatTy();
1588
1589	// f64 intrinsics aren't implemented for most operations.
1590	if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 \|\| !FltTy->isDoubleTy()))
1591	return false;
1592
1593	// We're implicitly inlining by replacing the libcall with the intrinsic, so
1594	// don't do it for noinline call sites.
1595	if (CI->isNoInline())
1596	return false;
1597
1598	const Function *ParentF = CI->getFunction();
1599	// TODO: Handle strictfp
1600	if (!AllowStrictFP && ParentF->hasFnAttribute(Kind: Attribute::StrictFP))
1601	return false;
1602
1603	if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
1604	return false;
1605	return true;
1606	}
1607
1608	void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
1609	CallInst *CI,
1610	Intrinsic::ID IntrID) {
1611	if (CI->arg_size() == `2`) {
1612	Value *Arg0 = CI->getArgOperand(i: `0`);
1613	Value *Arg1 = CI->getArgOperand(i: `1`);
1614	VectorType *Arg0VecTy = dyn_cast<VectorType>(Val: Arg0->getType());
1615	VectorType *Arg1VecTy = dyn_cast<VectorType>(Val: Arg1->getType());
1616	if (Arg0VecTy && !Arg1VecTy) {
1617	Value *SplatRHS = B.CreateVectorSplat(EC: Arg0VecTy->getElementCount(), V: Arg1);
1618	CI->setArgOperand(i: `1`, v: SplatRHS);
1619	} else if (!Arg0VecTy && Arg1VecTy) {
1620	Value *SplatLHS = B.CreateVectorSplat(EC: Arg1VecTy->getElementCount(), V: Arg0);
1621	CI->setArgOperand(i: `0`, v: SplatLHS);
1622	}
1623	}
1624
1625	CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
1626	M: CI->getModule(), id: IntrID, Tys: {CI->getType()}));
1627	}
1628
1629	bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
1630	IRBuilder<> &B, CallInst CI, Intrinsic::ID IntrID, bool* AllowMinSizeF32,
1631	bool AllowF64, bool AllowStrictFP) {
1632	if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64,
1633	AllowStrictFP))
1634	return false;
1635	replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
1636	return true;
1637	}
1638
1639	std::tuple<Value , Value , Value *>
1640	AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
1641	FunctionCallee Fsincos) {
1642	DebugLoc DL = B.getCurrentDebugLocation();
1643	Function *F = B.GetInsertBlock()->getParent();
1644	B.SetInsertPointPastAllocas(F);
1645
1646	AllocaInst Alloc = B.CreateAlloca(Ty: Arg->getType(), ArraySize: nullptr*, Name: "__sincos_");
1647
1648	if (Instruction *ArgInst = dyn_cast<Instruction>(Val: Arg)) {
1649	// If the argument is an instruction, it must dominate all uses so put our
1650	// sincos call there. Otherwise, right after the allocas works well enough
1651	// if it's an argument or constant.
1652
1653	B.SetInsertPoint(TheBB: ArgInst->getParent(), IP: ++ArgInst->getIterator());
1654
1655	// SetInsertPoint unwelcomely always tries to set the debug loc.
1656	B.SetCurrentDebugLocation(DL);
1657	}
1658
1659	Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(i: `1`);
1660
1661	// The allocaInst allocates the memory in private address space. This need
1662	// to be addrspacecasted to point to the address space of cos pointer type.
1663	// In OpenCL 2.0 this is generic, while in 1.2 that is private.
1664	Value *CastAlloc = B.CreateAddrSpaceCast(V: Alloc, DestTy: CosPtrTy);
1665
1666	CallInst *SinCos = CreateCallEx2(B, Callee: Fsincos, Arg1: Arg, Arg2: CastAlloc);
1667
1668	// TODO: Is it worth trying to preserve the location for the cos calls for the
1669	// load?
1670
1671	LoadInst *LoadCos = B.CreateLoad(Ty: Arg->getType(), Ptr: Alloc);
1672	return {SinCos, LoadCos, SinCos};
1673	}
1674
1675	// fold sin, cos -> sincos.
1676	bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
1677	const FuncInfo &fInfo) {
1678	assert(fInfo.getId() == AMDGPULibFunc::EI_SIN \|\|
1679	fInfo.getId() == AMDGPULibFunc::EI_COS);
1680
1681	if ((getArgType(FInfo: fInfo) != AMDGPULibFunc::F32 &&
1682	getArgType(FInfo: fInfo) != AMDGPULibFunc::F64) \|\|
1683	fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
1684	return false;
1685
1686	bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;
1687
1688	Value *CArgVal = FPOp->getOperand(i: `0`);
1689
1690	// TODO: Constant fold the call
1691	if (isa<ConstantData>(Val: CArgVal))
1692	return false;
1693
1694	CallInst *CI = cast<CallInst>(Val: FPOp);
1695
1696	Function *F = B.GetInsertBlock()->getParent();
1697	Module *M = F->getParent();
1698
1699	// Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
1700	// implementation. Prefer the private form if available.
1701	AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
1702	SinCosLibFuncPrivate.getLeads()[`0`].PtrKind =
1703	AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::PRIVATE_ADDRESS);
1704
1705	AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
1706	SinCosLibFuncGeneric.getLeads()[`0`].PtrKind =
1707	AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::FLAT_ADDRESS);
1708
1709	FunctionCallee FSinCosPrivate = getFunction(M, fInfo: SinCosLibFuncPrivate);
1710	FunctionCallee FSinCosGeneric = getFunction(M, fInfo: SinCosLibFuncGeneric);
1711	FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
1712	if (!FSinCos)
1713	return false;
1714
1715	SmallVector<CallInst *> SinCalls;
1716	SmallVector<CallInst *> CosCalls;
1717	SmallVector<CallInst *> SinCosCalls;
1718	FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
1719	fInfo);
1720	const std::string PairName = PartnerInfo.mangle();
1721
1722	StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
1723	StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
1724	const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
1725	const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();
1726
1727	// Intersect the two sets of flags.
1728	FastMathFlags FMF = FPOp->getFastMathFlags();
1729	MDNode *FPMath = CI->getMetadata(KindID: LLVMContext::MD_fpmath);
1730
1731	SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};
1732
1733	for (User* U : CArgVal->users()) {
1734	CallInst *XI = dyn_cast<CallInst>(Val: U);
1735	if (!XI \|\| XI->getFunction() != F \|\| XI->isNoBuiltin())
1736	continue;
1737
1738	Function *UCallee = XI->getCalledFunction();
1739	if (!UCallee)
1740	continue;
1741
1742	bool Handled = true;
1743
1744	if (UCallee->getName() == SinName)
1745	SinCalls.push_back(Elt: XI);
1746	else if (UCallee->getName() == CosName)
1747	CosCalls.push_back(Elt: XI);
1748	else if (UCallee->getName() == SinCosPrivateName \|\|
1749	UCallee->getName() == SinCosGenericName)
1750	SinCosCalls.push_back(Elt: XI);
1751	else
1752	Handled = false;
1753
1754	if (Handled) {
1755	MergeDbgLocs.push_back(Elt: XI->getDebugLoc());
1756	auto *OtherOp = cast<FPMathOperator>(Val: XI);
1757	FMF &= OtherOp->getFastMathFlags();
1758	FPMath = MDNode::getMostGenericFPMath(
1759	A: FPMath, B: XI->getMetadata(KindID: LLVMContext::MD_fpmath));
1760	}
1761	}
1762
1763	if (SinCalls.empty() \|\| CosCalls.empty())
1764	return false;
1765
1766	B.setFastMathFlags(FMF);
1767	B.setDefaultFPMathTag(FPMath);
1768	DILocation *DbgLoc = DILocation::getMergedLocations(Locs: MergeDbgLocs);
1769	B.SetCurrentDebugLocation(DbgLoc);
1770
1771	auto [Sin, Cos, SinCos] = insertSinCos(Arg: CArgVal, FMF, B, Fsincos: FSinCos);
1772
1773	auto replaceTrigInsts = [](ArrayRef<CallInst > Calls, Value Res) {
1774	for (CallInst *C : Calls)
1775	C->replaceAllUsesWith(V: Res);
1776
1777	// Leave the other dead instructions to avoid clobbering iterators.
1778	};
1779
1780	replaceTrigInsts (SinCalls, Sin);
1781	replaceTrigInsts (CosCalls, Cos);
1782	replaceTrigInsts (SinCosCalls, SinCos);
1783
1784	// It's safe to delete the original now.
1785	CI->eraseFromParent();
1786	return true;
1787	}
1788
1789	bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
1790	APFloat &Res0, APFloat &Res1,
1791	Constant copr0, Constant copr1) {
1792	// By default, opr0/opr1/opr3 holds values of float/double type.
1793	// If they are not float/double, each function has to its
1794	// operand separately.
1795	double opr0 = `0.0`, opr1 = `0.0`;
1796	ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(Val: copr0);
1797	ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(Val: copr1);
1798	if (fpopr0) {
1799	opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
1800	? fpopr0->getValueAPF().convertToDouble()
1801	: (double)fpopr0->getValueAPF().convertToFloat();
1802	}
1803
1804	if (fpopr1) {
1805	opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
1806	? fpopr1->getValueAPF().convertToDouble()
1807	: (double)fpopr1->getValueAPF().convertToFloat();
1808	}
1809
1810	switch (FInfo.getId()) {
1811	default:
1812	return false;
1813
1814	case AMDGPULibFunc::EI_ACOS:
1815	Res0 = APFloat {acos(x: opr0)};
1816	return true;
1817
1818	case AMDGPULibFunc::EI_ACOSH:
1819	// acosh(x) == log(x + sqrt(xx - 1))*
1820	Res0 = APFloat {log(x: opr0 + sqrt(x: opr0 * opr0 - `1.0`))};
1821	return true;
1822
1823	case AMDGPULibFunc::EI_ACOSPI:
1824	Res0 = APFloat {acos(x: opr0) / MATH_PI};
1825	return true;
1826
1827	case AMDGPULibFunc::EI_ASIN:
1828	Res0 = APFloat {asin(x: opr0)};
1829	return true;
1830
1831	case AMDGPULibFunc::EI_ASINH:
1832	// asinh(x) == log(x + sqrt(xx + 1))*
1833	Res0 = APFloat {log(x: opr0 + sqrt(x: opr0 * opr0 + `1.0`))};
1834	return true;
1835
1836	case AMDGPULibFunc::EI_ASINPI:
1837	Res0 = APFloat {asin(x: opr0) / MATH_PI};
1838	return true;
1839
1840	case AMDGPULibFunc::EI_ATAN:
1841	Res0 = APFloat {atan(x: opr0)};
1842	return true;
1843
1844	case AMDGPULibFunc::EI_ATANH:
1845	// atanh(x) == (log(x+1) - log(x-1))/2;
1846	Res0 = APFloat {(log(x: opr0 + `1.0`) - log(x: opr0 - `1.0`)) / `2.0`};
1847	return true;
1848
1849	case AMDGPULibFunc::EI_ATANPI:
1850	Res0 = APFloat {atan(x: opr0) / MATH_PI};
1851	return true;
1852
1853	case AMDGPULibFunc::EI_CBRT:
1854	Res0 =
1855	APFloat {(opr0 < `0.0`) ? -pow(x: -opr0, y: `1.0` / `3.0`) : pow(x: opr0, y: `1.0` / `3.0`)};
1856	return true;
1857
1858	case AMDGPULibFunc::EI_COS:
1859	Res0 = APFloat {cos(x: opr0)};
1860	return true;
1861
1862	case AMDGPULibFunc::EI_COSH:
1863	Res0 = APFloat {cosh(x: opr0)};
1864	return true;
1865
1866	case AMDGPULibFunc::EI_COSPI:
1867	Res0 = APFloat {cos(MATH_PI * opr0)};
1868	return true;
1869
1870	case AMDGPULibFunc::EI_EXP:
1871	Res0 = APFloat {exp(x: opr0)};
1872	return true;
1873
1874	case AMDGPULibFunc::EI_EXP2:
1875	Res0 = APFloat {pow(x: `2.0`, y: opr0)};
1876	return true;
1877
1878	case AMDGPULibFunc::EI_EXP10:
1879	Res0 = APFloat {pow(x: `10.0`, y: opr0)};
1880	return true;
1881
1882	case AMDGPULibFunc::EI_LOG:
1883	Res0 = APFloat {log(x: opr0)};
1884	return true;
1885
1886	case AMDGPULibFunc::EI_LOG2:
1887	Res0 = APFloat {log(x: opr0) / log(x: `2.0`)};
1888	return true;
1889
1890	case AMDGPULibFunc::EI_LOG10:
1891	Res0 = APFloat {log(x: opr0) / log(x: `10.0`)};
1892	return true;
1893
1894	case AMDGPULibFunc::EI_RSQRT:
1895	Res0 = APFloat {`1.0` / sqrt(x: opr0)};
1896	return true;
1897
1898	case AMDGPULibFunc::EI_SIN:
1899	Res0 = APFloat {sin(x: opr0)};
1900	return true;
1901
1902	case AMDGPULibFunc::EI_SINH:
1903	Res0 = APFloat {sinh(x: opr0)};
1904	return true;
1905
1906	case AMDGPULibFunc::EI_SINPI:
1907	Res0 = APFloat {sin(MATH_PI * opr0)};
1908	return true;
1909
1910	case AMDGPULibFunc::EI_TAN:
1911	Res0 = APFloat {tan(x: opr0)};
1912	return true;
1913
1914	case AMDGPULibFunc::EI_TANH:
1915	Res0 = APFloat {tanh(x: opr0)};
1916	return true;
1917
1918	case AMDGPULibFunc::EI_TANPI:
1919	Res0 = APFloat {tan(MATH_PI * opr0)};
1920	return true;
1921
1922	// two-arg functions
1923	case AMDGPULibFunc::EI_POW:
1924	case AMDGPULibFunc::EI_POWR:
1925	Res0 = APFloat {pow(x: opr0, y: opr1)};
1926	return true;
1927
1928	case AMDGPULibFunc::EI_POWN: {
1929	if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) {
1930	double val = (double)iopr1->getSExtValue();
1931	Res0 = APFloat {pow(x: opr0, y: val)};
1932	return true;
1933	}
1934	return false;
1935	}
1936
1937	case AMDGPULibFunc::EI_ROOTN: {
1938	if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) {
1939	double val = (double)iopr1->getSExtValue();
1940	Res0 = APFloat {pow(x: opr0, y: `1.0` / val)};
1941	return true;
1942	}
1943	return false;
1944	}
1945
1946	// with ptr arg
1947	case AMDGPULibFunc::EI_SINCOS:
1948	Res0 = APFloat {sin(x: opr0)};
1949	Res1 = APFloat {cos(x: opr0)};
1950	return true;
1951	}
1952
1953	return false;
1954	}
1955
1956	bool AMDGPULibCalls::evaluateCall(CallInst aCI, const* FuncInfo &FInfo) {
1957	int numArgs = (int)aCI->arg_size();
1958	if (numArgs > `3`)
1959	return false;
1960
1961	Constant copr0 = nullptr*;
1962	Constant copr1 = nullptr*;
1963	if (numArgs > `0`) {
1964	if ((copr0 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: `0`))) == nullptr)
1965	return false;
1966	}
1967
1968	if (numArgs > `1`) {
1969	if ((copr1 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: `1`))) == nullptr) {
1970	if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
1971	return false;
1972	}
1973	}
1974
1975	// At this point, all arguments to aCI are constants.
1976
1977	// max vector size is 16, and sincos will generate two results.
1978	SmallVector<APFloat, `16`> Val0, Val1;
1979	int FuncVecSize = getVecSize(FInfo);
1980	bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
1981	if (FuncVecSize == `1`) {
1982	if (!evaluateScalarMathFunc(FInfo, Res0&: Val0.emplace_back(Args: `0.0`),
1983	Res1&: Val1.emplace_back(Args: `0.0`), copr0, copr1)) {
1984	return false;
1985	}
1986	} else {
1987	ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(Val: copr0);
1988	ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(Val: copr1);
1989	for (int i = `0`; i < FuncVecSize; ++i) {
1990	Constant celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr*;
1991	Constant celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr*;
1992	if (!evaluateScalarMathFunc(FInfo, Res0&: Val0.emplace_back(Args: `0.0`),
1993	Res1&: Val1.emplace_back(Args: `0.0`), copr0: celt0, copr1: celt1)) {
1994	return false;
1995	}
1996	}
1997	}
1998
1999	Constant nval0, nval1;
2000	if (FuncVecSize == `1`) {
2001	nval0 = ConstantFP::get(Ty: aCI->getType(), V: Val0 [`0`]);
2002	if (hasTwoResults)
2003	nval1 = ConstantFP::get(Ty: aCI->getType(), V: Val1 [`0`]);
2004	} else {
2005	nval0 = getConstantFloatVector(Values: Val0, Ty: aCI->getType());
2006	if (hasTwoResults)
2007	nval1 = getConstantFloatVector(Values: Val1, Ty: aCI->getType());
2008	}
2009
2010	if (hasTwoResults) {
2011	// sincos
2012	assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
2013	"math function with ptr arg not supported yet");
2014	new StoreInst (nval1, aCI->getArgOperand(i: `1`), aCI->getIterator());
2015	}
2016
2017	replaceCall(I: aCI, With: nval0);
2018	return true;
2019	}
2020
2021	PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
2022	FunctionAnalysisManager &AM) {
2023	AMDGPULibCalls Simplifier(F, AM);
2024	Simplifier.initNativeFuncs();
2025
2026	bool Changed = false;
2027
2028	LLVM_DEBUG(dbgs() << "AMDIC: process function ";
2029	F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << `'\n'`;);
2030
2031	for (auto &BB : F) {
2032	for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
2033	// Ignore non-calls.
2034	CallInst *CI = dyn_cast<CallInst>(Val&: I);
2035	++I;
2036
2037	if (CI) {
2038	if (Simplifier.fold(CI))
2039	Changed = true;
2040	}
2041	}
2042	}
2043	return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
2044	}
2045
2046	PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
2047	FunctionAnalysisManager &AM) {
2048	if (UseNative.empty())
2049	return PreservedAnalyses::all();
2050
2051	AMDGPULibCalls Simplifier(F, AM);
2052	Simplifier.initNativeFuncs();
2053
2054	bool Changed = false;
2055	for (auto &BB : F) {
2056	for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
2057	// Ignore non-calls.
2058	CallInst *CI = dyn_cast<CallInst>(Val&: I);
2059	++I;
2060	if (CI && Simplifier.useNative(aCI: CI))
2061	Changed = true;
2062	}
2063	}
2064	return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
2065	}
2066

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp