1 | //===- AMDGPULibCalls.cpp -------------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This file does AMD library function optimizations. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "AMDGPU.h" |
15 | #include "AMDGPULibFunc.h" |
16 | #include "GCNSubtarget.h" |
17 | #include "llvm/Analysis/AssumptionCache.h" |
18 | #include "llvm/Analysis/TargetLibraryInfo.h" |
19 | #include "llvm/Analysis/ValueTracking.h" |
20 | #include "llvm/IR/AttributeMask.h" |
21 | #include "llvm/IR/Dominators.h" |
22 | #include "llvm/IR/IRBuilder.h" |
23 | #include "llvm/IR/IntrinsicInst.h" |
24 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
25 | #include "llvm/IR/MDBuilder.h" |
26 | #include "llvm/IR/PatternMatch.h" |
27 | #include "llvm/InitializePasses.h" |
28 | #include <cmath> |
29 | |
30 | #define DEBUG_TYPE "amdgpu-simplifylib" |
31 | |
32 | using namespace llvm; |
33 | using namespace llvm::PatternMatch; |
34 | |
// Enables pre-link mode: library functions may be declared on demand because
// the real definitions are linked in later.
static cl::opt<bool> EnablePreLink("amdgpu-prelink",
  cl::desc("Enable pre-link mode optimizations"),
  cl::init(Val: false),
  cl::Hidden);

// Functions to replace with their native_* variants; an empty value or "all"
// enables the replacement for every supported function.
static cl::list<std::string> UseNative("amdgpu-use-native",
  cl::desc("Comma separated list of functions to replace with native, or all"),
  cl::CommaSeparated, cl::ValueOptional,
  cl::Hidden);

// Math constants used by the table-driven constant folds below.
#define MATH_PI numbers::pi
#define MATH_E numbers::e
#define MATH_SQRT2 numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2
49 | |
namespace llvm {

/// Folds and simplifies calls to the AMD device library (pow, sincos, pipes,
/// ...), optionally redirecting calls to native_* variants.
class AMDGPULibCalls {
private:
  // Per-function analyses cached by initFunction(); DT may be null when the
  // dominator tree was not already computed.
  const TargetLibraryInfo *TLInfo = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;

  using FuncInfo = llvm::AMDGPULibFunc;

  // Value of the function's "unsafe-fp-math" attribute, read in initFunction().
  bool UnsafeFPMath = false;

  // -fuse-native: true when every supported function should use the native
  // variant (see initNativeFuncs()).
  bool AllNative = false;

  bool useNativeFunc(const StringRef F) const;

  // Return a pointer (pointer expr) to the function if function definition with
  // "FuncName" exists. It may create a new function prototype in pre-link mode.
  FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);

  bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);

  // Table-driven constant folding of calls with known constant arguments.
  bool TDOFold(CallInst *CI, const FuncInfo &FInfo);

  /* Specialized optimizations */

  // pow/powr/pown
  bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // rootn
  bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // -fuse-native for sincos
  bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);

  // evaluate calls if calls' arguments are constants.
  bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1,
                              Constant *copr0, Constant *copr1);
  bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);

  /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value
  /// of cos, sincos call).
  std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
                                                     FastMathFlags FMF,
                                                     IRBuilder<> &B,
                                                     FunctionCallee Fsincos);

  // sin/cos
  bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // __read_pipe/__write_pipe
  bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                            const FuncInfo &FInfo);

  // Get a scalar native builtin single argument FP function
  FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);

  /// Substitute a call to a known libcall with an intrinsic call. If \p
  /// AllowMinSize is true, allow the replacement in a minsize function.
  bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                         bool AllowMinSizeF32 = false,
                                         bool AllowF64 = false,
                                         bool AllowStrictFP = false);
  void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                         Intrinsic::ID IntrID);

  bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                            Intrinsic::ID IntrID,
                                            bool AllowMinSizeF32 = false,
                                            bool AllowF64 = false,
                                            bool AllowStrictFP = false);

protected:
  bool isUnsafeMath(const FPMathOperator *FPOp) const;
  bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;

  bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;

  // RAUW + erase; \p I must have no further uses after this call.
  static void replaceCall(Instruction *I, Value *With) {
    I->replaceAllUsesWith(V: With);
    I->eraseFromParent();
  }

  static void replaceCall(FPMathOperator *I, Value *With) {
    replaceCall(I: cast<Instruction>(Val: I), With);
  }

public:
  AMDGPULibCalls() = default;

  bool fold(CallInst *CI);

  void initFunction(Function &F, FunctionAnalysisManager &FAM);
  void initNativeFuncs();

  // Replace a normal math function call with that native version
  bool useNative(CallInst *CI);
};

} // end namespace llvm
151 | |
152 | template <typename IRB> |
153 | static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg, |
154 | const Twine &Name = "" ) { |
155 | CallInst *R = B.CreateCall(Callee, Arg, Name); |
156 | if (Function *F = dyn_cast<Function>(Val: Callee.getCallee())) |
157 | R->setCallingConv(F->getCallingConv()); |
158 | return R; |
159 | } |
160 | |
161 | template <typename IRB> |
162 | static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1, |
163 | Value *Arg2, const Twine &Name = "" ) { |
164 | CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name); |
165 | if (Function *F = dyn_cast<Function>(Val: Callee.getCallee())) |
166 | R->setCallingConv(F->getCallingConv()); |
167 | return R; |
168 | } |
169 | |
170 | static FunctionType *getPownType(FunctionType *FT) { |
171 | Type *PowNExpTy = Type::getInt32Ty(C&: FT->getContext()); |
172 | if (VectorType *VecTy = dyn_cast<VectorType>(Val: FT->getReturnType())) |
173 | PowNExpTy = VectorType::get(ElementType: PowNExpTy, EC: VecTy->getElementCount()); |
174 | |
175 | return FunctionType::get(Result: FT->getReturnType(), |
176 | Params: {FT->getParamType(i: 0), PowNExpTy}, isVarArg: false); |
177 | } |
178 | |
179 | // Data structures for table-driven optimizations. |
180 | // FuncTbl works for both f32 and f64 functions with 1 input argument |
181 | |
// One row of a constant-folding table: calling the function with `input`
// yields exactly `result`.
struct TableEntry {
  double result;
  double input;
};

/* a list of {result, input} */
static const TableEntry tbl_acos[] = {
  {MATH_PI / 2.0, .input: 0.0},
  {MATH_PI / 2.0, .input: -0.0},
  {.result: 0.0, .input: 1.0},
  {MATH_PI, .input: -1.0}
};
static const TableEntry tbl_acosh[] = {
  {.result: 0.0, .input: 1.0}
};
static const TableEntry tbl_acospi[] = {
  {.result: 0.5, .input: 0.0},
  {.result: 0.5, .input: -0.0},
  {.result: 0.0, .input: 1.0},
  {.result: 1.0, .input: -1.0}
};
static const TableEntry tbl_asin[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0},
  {MATH_PI / 2.0, .input: 1.0},
  {.result: -MATH_PI / 2.0, .input: -1.0}
};
static const TableEntry tbl_asinh[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_asinpi[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0},
  {.result: 0.5, .input: 1.0},
  {.result: -0.5, .input: -1.0}
};
static const TableEntry tbl_atan[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0},
  {MATH_PI / 4.0, .input: 1.0},
  {.result: -MATH_PI / 4.0, .input: -1.0}
};
static const TableEntry tbl_atanh[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_atanpi[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0},
  {.result: 0.25, .input: 1.0},
  {.result: -0.25, .input: -1.0}
};
static const TableEntry tbl_cbrt[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0},
  {.result: 1.0, .input: 1.0},
  {.result: -1.0, .input: -1.0},
};
static const TableEntry tbl_cos[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0}
};
static const TableEntry tbl_cosh[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0}
};
static const TableEntry tbl_cospi[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0}
};
static const TableEntry tbl_erfc[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0}
};
static const TableEntry tbl_erf[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_exp[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0},
  {MATH_E, .input: 1.0}
};
static const TableEntry tbl_exp2[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0},
  {.result: 2.0, .input: 1.0}
};
static const TableEntry tbl_exp10[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0},
  {.result: 10.0, .input: 1.0}
};
static const TableEntry tbl_expm1[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_log[] = {
  {.result: 0.0, .input: 1.0},
  {.result: 1.0, MATH_E}
};
static const TableEntry tbl_log2[] = {
  {.result: 0.0, .input: 1.0},
  {.result: 1.0, .input: 2.0}
};
static const TableEntry tbl_log10[] = {
  {.result: 0.0, .input: 1.0},
  {.result: 1.0, .input: 10.0}
};
static const TableEntry tbl_rsqrt[] = {
  {.result: 1.0, .input: 1.0},
  {MATH_SQRT1_2, .input: 2.0}
};
static const TableEntry tbl_sin[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_sinh[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_sinpi[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_sqrt[] = {
  {.result: 0.0, .input: 0.0},
  {.result: 1.0, .input: 1.0},
  {MATH_SQRT2, .input: 2.0}
};
static const TableEntry tbl_tan[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_tanh[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_tanpi[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
// tgamma(n) == (n-1)! for small positive integers.
static const TableEntry tbl_tgamma[] = {
  {.result: 1.0, .input: 1.0},
  {.result: 1.0, .input: 2.0},
  {.result: 2.0, .input: 3.0},
  {.result: 6.0, .input: 4.0}
};
331 | |
332 | static bool HasNative(AMDGPULibFunc::EFuncId id) { |
333 | switch(id) { |
334 | case AMDGPULibFunc::EI_DIVIDE: |
335 | case AMDGPULibFunc::EI_COS: |
336 | case AMDGPULibFunc::EI_EXP: |
337 | case AMDGPULibFunc::EI_EXP2: |
338 | case AMDGPULibFunc::EI_EXP10: |
339 | case AMDGPULibFunc::EI_LOG: |
340 | case AMDGPULibFunc::EI_LOG2: |
341 | case AMDGPULibFunc::EI_LOG10: |
342 | case AMDGPULibFunc::EI_POWR: |
343 | case AMDGPULibFunc::EI_RECIP: |
344 | case AMDGPULibFunc::EI_RSQRT: |
345 | case AMDGPULibFunc::EI_SIN: |
346 | case AMDGPULibFunc::EI_SINCOS: |
347 | case AMDGPULibFunc::EI_SQRT: |
348 | case AMDGPULibFunc::EI_TAN: |
349 | return true; |
350 | default:; |
351 | } |
352 | return false; |
353 | } |
354 | |
355 | using TableRef = ArrayRef<TableEntry>; |
356 | |
357 | static TableRef getOptTable(AMDGPULibFunc::EFuncId id) { |
358 | switch(id) { |
359 | case AMDGPULibFunc::EI_ACOS: return TableRef(tbl_acos); |
360 | case AMDGPULibFunc::EI_ACOSH: return TableRef(tbl_acosh); |
361 | case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi); |
362 | case AMDGPULibFunc::EI_ASIN: return TableRef(tbl_asin); |
363 | case AMDGPULibFunc::EI_ASINH: return TableRef(tbl_asinh); |
364 | case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi); |
365 | case AMDGPULibFunc::EI_ATAN: return TableRef(tbl_atan); |
366 | case AMDGPULibFunc::EI_ATANH: return TableRef(tbl_atanh); |
367 | case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi); |
368 | case AMDGPULibFunc::EI_CBRT: return TableRef(tbl_cbrt); |
369 | case AMDGPULibFunc::EI_NCOS: |
370 | case AMDGPULibFunc::EI_COS: return TableRef(tbl_cos); |
371 | case AMDGPULibFunc::EI_COSH: return TableRef(tbl_cosh); |
372 | case AMDGPULibFunc::EI_COSPI: return TableRef(tbl_cospi); |
373 | case AMDGPULibFunc::EI_ERFC: return TableRef(tbl_erfc); |
374 | case AMDGPULibFunc::EI_ERF: return TableRef(tbl_erf); |
375 | case AMDGPULibFunc::EI_EXP: return TableRef(tbl_exp); |
376 | case AMDGPULibFunc::EI_NEXP2: |
377 | case AMDGPULibFunc::EI_EXP2: return TableRef(tbl_exp2); |
378 | case AMDGPULibFunc::EI_EXP10: return TableRef(tbl_exp10); |
379 | case AMDGPULibFunc::EI_EXPM1: return TableRef(tbl_expm1); |
380 | case AMDGPULibFunc::EI_LOG: return TableRef(tbl_log); |
381 | case AMDGPULibFunc::EI_NLOG2: |
382 | case AMDGPULibFunc::EI_LOG2: return TableRef(tbl_log2); |
383 | case AMDGPULibFunc::EI_LOG10: return TableRef(tbl_log10); |
384 | case AMDGPULibFunc::EI_NRSQRT: |
385 | case AMDGPULibFunc::EI_RSQRT: return TableRef(tbl_rsqrt); |
386 | case AMDGPULibFunc::EI_NSIN: |
387 | case AMDGPULibFunc::EI_SIN: return TableRef(tbl_sin); |
388 | case AMDGPULibFunc::EI_SINH: return TableRef(tbl_sinh); |
389 | case AMDGPULibFunc::EI_SINPI: return TableRef(tbl_sinpi); |
390 | case AMDGPULibFunc::EI_NSQRT: |
391 | case AMDGPULibFunc::EI_SQRT: return TableRef(tbl_sqrt); |
392 | case AMDGPULibFunc::EI_TAN: return TableRef(tbl_tan); |
393 | case AMDGPULibFunc::EI_TANH: return TableRef(tbl_tanh); |
394 | case AMDGPULibFunc::EI_TANPI: return TableRef(tbl_tanpi); |
395 | case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma); |
396 | default:; |
397 | } |
398 | return TableRef(); |
399 | } |
400 | |
401 | static inline int getVecSize(const AMDGPULibFunc& FInfo) { |
402 | return FInfo.getLeads()[0].VectorSize; |
403 | } |
404 | |
405 | static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) { |
406 | return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType; |
407 | } |
408 | |
409 | FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) { |
410 | // If we are doing PreLinkOpt, the function is external. So it is safe to |
411 | // use getOrInsertFunction() at this stage. |
412 | |
413 | return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo) |
414 | : AMDGPULibFunc::getFunction(M, fInfo); |
415 | } |
416 | |
417 | bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName, |
418 | FuncInfo &FInfo) { |
419 | return AMDGPULibFunc::parse(MangledName: FMangledName, Ptr&: FInfo); |
420 | } |
421 | |
422 | bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const { |
423 | return UnsafeFPMath || FPOp->isFast(); |
424 | } |
425 | |
426 | bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const { |
427 | return UnsafeFPMath || |
428 | (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs()); |
429 | } |
430 | |
// Whether it is acceptable to constant fold this call at a higher precision
// than the runtime implementation would produce (e.g. evaluating an f32
// libcall with host doubles).
bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
    const FPMathOperator *FPOp) const {
  // TODO: Refine to approxFunc or contract
  return isUnsafeMath(FPOp);
}
436 | |
437 | void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) { |
438 | UnsafeFPMath = F.getFnAttribute(Kind: "unsafe-fp-math" ).getValueAsBool(); |
439 | AC = &FAM.getResult<AssumptionAnalysis>(IR&: F); |
440 | TLInfo = &FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
441 | DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F); |
442 | } |
443 | |
444 | bool AMDGPULibCalls::useNativeFunc(const StringRef F) const { |
445 | return AllNative || llvm::is_contained(Range&: UseNative, Element: F); |
446 | } |
447 | |
448 | void AMDGPULibCalls::initNativeFuncs() { |
449 | AllNative = useNativeFunc(F: "all" ) || |
450 | (UseNative.getNumOccurrences() && UseNative.size() == 1 && |
451 | UseNative.begin()->empty()); |
452 | } |
453 | |
// Split a sincos(x, &c) call into native_sin(x) and native_cos(x) when both
// native variants are requested and available. The cos result is stored
// through the original pointer argument and the call is replaced by the sin
// value. Returns true if the call was replaced.
bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
  bool native_sin = useNativeFunc(F: "sin");
  bool native_cos = useNativeFunc(F: "cos");

  // Only split when BOTH halves may go native; otherwise keep sincos.
  if (native_sin && native_cos) {
    Module *M = aCI->getModule();
    Value *opr0 = aCI->getArgOperand(i: 0);

    // Build a native function descriptor matching the original's leading
    // argument type and vector width.
    AMDGPULibFunc nf;
    nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
    nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_SIN);
    FunctionCallee sinExpr = getFunction(M, fInfo: nf);

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_COS);
    FunctionCallee cosExpr = getFunction(M, fInfo: nf);
    if (sinExpr && cosExpr) {
      // Emit sin, cos, then the store of cos through the original out-pointer,
      // all before the old call.
      Value *sinval =
          CallInst::Create(Func: sinExpr, Args: opr0, NameStr: "splitsin", InsertBefore: aCI->getIterator());
      Value *cosval =
          CallInst::Create(Func: cosExpr, Args: opr0, NameStr: "splitcos", InsertBefore: aCI->getIterator());
      new StoreInst(cosval, aCI->getArgOperand(i: 1), aCI->getIterator());

      DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                          << " with native version of sin/cos");

      // sincos returns the sin value; uses of the call become sinval.
      replaceCall(I: aCI, With: sinval);
      return true;
    }
  }
  return false;
}
489 | |
// Redirect a library call to its native_* variant if the user requested it
// and a native version exists. f64 calls are never redirected. Returns true
// if the call was changed.
bool AMDGPULibCalls::useNative(CallInst *aCI) {
  Function *Callee = aCI->getCalledFunction();
  // Skip indirect calls and calls explicitly marked nobuiltin.
  if (!Callee || aCI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  // The call must be a mangled, unprefixed (not already native/half) library
  // function with a non-f64 leading argument, a native counterpart, and be
  // selected by -amdgpu-use-native.
  if (!parseFunctionName(FMangledName: Callee->getName(), FInfo) || !FInfo.isMangled() ||
      FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
      getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(id: FInfo.getId()) ||
      !(AllNative || useNativeFunc(F: FInfo.getName()))) {
    return false;
  }

  // sincos has no direct native equivalent; split it into sin + cos.
  if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
    return sincosUseNative(aCI, FInfo);

  FInfo.setPrefix(AMDGPULibFunc::NATIVE);
  FunctionCallee F = getFunction(M: aCI->getModule(), fInfo: FInfo);
  if (!F)
    return false;

  // Same signature, different callee: retarget the call in place.
  aCI->setCalledFunction(F);
  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                      << " with native version");
  return true;
}
516 | |
517 | // Clang emits call of __read_pipe_2 or __read_pipe_4 for OpenCL read_pipe |
518 | // builtin, with appended type size and alignment arguments, where 2 or 4 |
519 | // indicates the original number of arguments. The library has optimized version |
520 | // of __read_pipe_2/__read_pipe_4 when the type size and alignment has the same |
521 | // power of 2 value. This function transforms __read_pipe_2 to __read_pipe_2_N |
522 | // for such cases where N is the size in bytes of the type (N = 1, 2, 4, 8, ..., |
523 | // 128). The same for __read_pipe_4, write_pipe_2, and write_pipe_4. |
bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                                          const FuncInfo &FInfo) {
  auto *Callee = CI->getCalledFunction();
  // Only rewrite calls to the external library function, not to a local
  // definition.
  if (!Callee->isDeclaration())
    return false;

  assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
  auto *M = Callee->getParent();
  std::string Name = std::string(Callee->getName());
  auto NumArg = CI->arg_size();
  // __read_pipe_2/__write_pipe_2 take 4 args, the _4 forms take 6 (the
  // trailing two are the appended size and alignment).
  if (NumArg != 4 && NumArg != 6)
    return false;
  // The size/alignment arguments must be compile-time constants.
  ConstantInt *PacketSize =
      dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - 2));
  ConstantInt *PacketAlign =
      dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - 1));
  if (!PacketSize || !PacketAlign)
    return false;

  unsigned Size = PacketSize->getZExtValue();
  Align Alignment = PacketAlign->getAlignValue();
  // The specialized _N library variants only exist when the packet is
  // naturally aligned (size == alignment, both the same power of 2).
  if (Alignment != Size)
    return false;

  // The packet pointer is the last "real" argument, just before size/align.
  unsigned PtrArgLoc = CI->arg_size() - 3;
  Value *PtrArg = CI->getArgOperand(i: PtrArgLoc);
  Type *PtrTy = PtrArg->getType();

  // New signature: original args up to and including the packet pointer;
  // the size/alignment arguments are dropped (encoded in the name instead).
  SmallVector<llvm::Type *, 6> ArgTys;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    ArgTys.push_back(Elt: CI->getArgOperand(i: I)->getType());
  ArgTys.push_back(Elt: PtrTy);

  // e.g. __read_pipe_2 -> __read_pipe_2_4 for a 4-byte packet.
  Name = Name + "_" + std::to_string(val: Size);
  auto *FTy = FunctionType::get(Result: Callee->getReturnType(),
                                Params: ArrayRef<Type *>(ArgTys), isVarArg: false);
  AMDGPULibFunc NewLibFunc(Name, FTy);
  FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, fInfo: NewLibFunc);
  if (!F)
    return false;

  SmallVector<Value *, 6> Args;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    Args.push_back(Elt: CI->getArgOperand(i: I));
  Args.push_back(Elt: PtrArg);

  // Emit the specialized call, keep the original attributes, and delete the
  // old call.
  auto *NCI = B.CreateCall(Callee: F, Args);
  NCI->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(V: NCI);
  CI->dropAllReferences();
  CI->eraseFromParent();

  return true;
}
578 | |
// Return true if \p V is known to be an exact integer value (as a float).
// Used to justify pow(x, y) -> pown(x, (int)y).
static bool isKnownIntegral(const Value *V, const DataLayout &DL,
                            FastMathFlags FMF) {
  // Poison may be assumed to be anything; undef may not (it could be any
  // non-integral float).
  if (isa<PoisonValue>(Val: V))
    return true;
  if (isa<UndefValue>(Val: V))
    return false;

  if (const ConstantFP *CF = dyn_cast<ConstantFP>(Val: V))
    return CF->getValueAPF().isInteger();

  // Constant vector: every element must be a poison or an integral FP
  // constant.
  auto *VFVTy = dyn_cast<FixedVectorType>(Val: V->getType());
  const Constant *CV = dyn_cast<Constant>(Val: V);
  if (VFVTy && CV) {
    unsigned NumElts = VFVTy->getNumElements();
    for (unsigned i = 0; i != NumElts; ++i) {
      Constant *Elt = CV->getAggregateElement(Elt: i);
      if (!Elt)
        return false;
      if (isa<PoisonValue>(Val: Elt))
        continue;

      const ConstantFP *CFP = dyn_cast<ConstantFP>(Val: Elt);
      if (!CFP || !CFP->getValue().isInteger())
        return false;
    }

    return true;
  }

  const Instruction *I = dyn_cast<Instruction>(Val: V);
  if (!I)
    return false;

  switch (I->getOpcode()) {
  case Instruction::SIToFP:
  case Instruction::UIToFP:
    // An int->fp conversion is integral unless it rounds to infinity (huge
    // integer into a narrow float type).
    // TODO: Could check nofpclass(inf) on incoming argument
    if (FMF.noInfs())
      return true;

    // Need to check int size cannot produce infinity, which computeKnownFPClass
    // knows how to do already.
    return isKnownNeverInfinity(V: I, /*Depth=*/0, SQ: SimplifyQuery(DL));
  case Instruction::Call: {
    const CallInst *CI = cast<CallInst>(Val: I);
    switch (CI->getIntrinsicID()) {
    case Intrinsic::trunc:
    case Intrinsic::floor:
    case Intrinsic::ceil:
    case Intrinsic::rint:
    case Intrinsic::nearbyint:
    case Intrinsic::round:
    case Intrinsic::roundeven:
      // Rounding intrinsics produce integers except on inf/nan inputs, which
      // either the flags or value analysis must rule out.
      return (FMF.noInfs() && FMF.noNaNs()) ||
             isKnownNeverInfOrNaN(V: I, /*Depth=*/0, SQ: SimplifyQuery(DL));
    default:
      break;
    }

    break;
  }
  default:
    break;
  }

  return false;
}
646 | |
// This function returns false if no change; return true otherwise.
bool AMDGPULibCalls::fold(CallInst *CI) {
  Function *Callee = CI->getCalledFunction();
  // Ignore indirect calls.
  if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(FMangledName: Callee->getName(), FInfo))
    return false;

  // Further check the number of arguments to see if they match.
  // TODO: Check calling convention matches too
  if (!FInfo.isCompatibleSignature(FuncTy: CI->getFunctionType()))
    return false;

  LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');

  // Table-driven constant folding first; it needs no FP-math flags.
  if (TDOFold(CI, FInfo))
    return true;

  IRBuilder<> B(CI);
  // Propagate strictfp so any IR we create stays constrained.
  if (CI->isStrictFP())
    B.setIsFPConstrained(true);

  if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: CI)) {
    // Under unsafe-math, evaluate calls if possible.
    // According to Brian Sumner, we can do this for all f32 function calls
    // using host's double function calls.
    if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(aCI: CI, FInfo))
      return true;

    // Copy fast flags from the original call.
    FastMathFlags FMF = FPOp->getFastMathFlags();
    B.setFastMathFlags(FMF);

    // Specialized optimizations for each function call.
    //
    // TODO: Handle native functions
    switch (FInfo.getId()) {
    // exp/exp2/log/log2/log10 only become intrinsics when some fast-math
    // flag is present; minsize replacement additionally requires afn.
    case AMDGPULibFunc::EI_EXP:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::exp,
                                                  AllowMinSizeF32: FMF.approxFunc());
    case AMDGPULibFunc::EI_EXP2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::exp2,
                                                  AllowMinSizeF32: FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log,
                                                  AllowMinSizeF32: FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log2,
                                                  AllowMinSizeF32: FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG10:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log10,
                                                  AllowMinSizeF32: FMF.approxFunc());
    // The following are exact operations, safe for minsize and f64 too.
    case AMDGPULibFunc::EI_FMIN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::minnum,
                                                  AllowMinSizeF32: true, AllowF64: true);
    case AMDGPULibFunc::EI_FMAX:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::maxnum,
                                                  AllowMinSizeF32: true, AllowF64: true);
    case AMDGPULibFunc::EI_FMA:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fma, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_MAD:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fmuladd,
                                                  AllowMinSizeF32: true, AllowF64: true);
    case AMDGPULibFunc::EI_FABS:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fabs, AllowMinSizeF32: true,
                                                  AllowF64: true, AllowStrictFP: true);
    case AMDGPULibFunc::EI_COPYSIGN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::copysign,
                                                  AllowMinSizeF32: true, AllowF64: true, AllowStrictFP: true);
    case AMDGPULibFunc::EI_FLOOR:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::floor, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_CEIL:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::ceil, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_TRUNC:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::trunc, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_RINT:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::rint, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_ROUND:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::round, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_LDEXP: {
      if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32: true, AllowF64: true))
        return false;

      // The library ldexp of a vector may take a scalar exponent; the
      // intrinsic requires a matching vector, so splat it first.
      Value *Arg1 = CI->getArgOperand(i: 1);
      if (VectorType *VecTy = dyn_cast<VectorType>(Val: CI->getType());
          VecTy && !isa<VectorType>(Val: Arg1->getType())) {
        Value *SplatArg1 = B.CreateVectorSplat(EC: VecTy->getElementCount(), V: Arg1);
        CI->setArgOperand(i: 1, v: SplatArg1);
      }

      CI->setCalledFunction(Intrinsic::getDeclaration(
          M: CI->getModule(), id: Intrinsic::ldexp,
          Tys: {CI->getType(), CI->getArgOperand(i: 1)->getType()}));
      return true;
    }
    case AMDGPULibFunc::EI_POW: {
      Module *M = Callee->getParent();
      AMDGPULibFunc PowrInfo(AMDGPULibFunc::EI_POWR, FInfo);
      FunctionCallee PowrFunc = getFunction(M, fInfo: PowrInfo);
      CallInst *Call = cast<CallInst>(Val: FPOp);

      // pow(x, y) -> powr(x, y) for x >= -0.0
      // TODO: Account for flags on current call
      if (PowrFunc &&
          cannotBeOrderedLessThanZero(
              V: FPOp->getOperand(i: 0), /*Depth=*/0,
              SQ: SimplifyQuery(M->getDataLayout(), TLInfo, DT, AC, Call))) {
        Call->setCalledFunction(PowrFunc);
        // Retargeting alone already changed the IR, hence "|| true".
        return fold_pow(FPOp, B, FInfo: PowrInfo) || true;
      }

      // pow(x, y) -> pown(x, y) for known integral y
      if (isKnownIntegral(V: FPOp->getOperand(i: 1), DL: M->getDataLayout(),
                          FMF: FPOp->getFastMathFlags())) {
        FunctionType *PownType = getPownType(FT: CI->getFunctionType());
        AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true);
        FunctionCallee PownFunc = getFunction(M, fInfo: PownInfo);
        if (PownFunc) {
          // TODO: If the incoming integral value is an sitofp/uitofp, it won't
          // fold out without a known range. We can probably take the source
          // value directly.
          Value *CastedArg =
              B.CreateFPToSI(V: FPOp->getOperand(i: 1), DestTy: PownType->getParamType(i: 1));
          // Have to drop any nofpclass attributes on the original call site.
          Call->removeParamAttrs(
              ArgNo: 1, AttrsToRemove: AttributeFuncs::typeIncompatible(Ty: CastedArg->getType()));
          Call->setCalledFunction(PownFunc);
          Call->setArgOperand(i: 1, v: CastedArg);
          return fold_pow(FPOp, B, FInfo: PownInfo) || true;
        }
      }

      return fold_pow(FPOp, B, FInfo);
    }
    case AMDGPULibFunc::EI_POWR:
    case AMDGPULibFunc::EI_POWN:
      return fold_pow(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_ROOTN:
      return fold_rootn(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_SQRT:
      // TODO: Allow with strictfp + constrained intrinsic
      return tryReplaceLibcallWithSimpleIntrinsic(
          B, CI, IntrID: Intrinsic::sqrt, AllowMinSizeF32: true, AllowF64: true, /*AllowStrictFP=*/false);
    case AMDGPULibFunc::EI_COS:
    case AMDGPULibFunc::EI_SIN:
      return fold_sincos(FPOp, B, FInfo);
    default:
      break;
    }
  } else {
    // Specialized optimizations for each function call
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_READ_PIPE_2:
    case AMDGPULibFunc::EI_READ_PIPE_4:
    case AMDGPULibFunc::EI_WRITE_PIPE_2:
    case AMDGPULibFunc::EI_WRITE_PIPE_4:
      return fold_read_write_pipe(CI, B, FInfo);
    default:
      break;
    }
  }

  return false;
}
830 | |
// Table-driven constant folding: if the call's constant argument matches an
// entry in the function's optimization table, replace the call with the
// tabulated result. Handles both scalar and vector arguments.
//
// \param CI    The library call being examined.
// \param FInfo Descriptor of the library function (id, arg type, vector size).
// \return true if the call was replaced by a constant.
bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
  // Table-Driven optimization
  const TableRef tr = getOptTable(id: FInfo.getId());
  if (tr.empty())
    return false;

  int const sz = (int)tr.size();
  Value *opr0 = CI->getArgOperand(i: 0);

  if (getVecSize(FInfo) > 1) {
    // Vector case: every lane must be a constant with a table entry,
    // otherwise the fold is abandoned entirely.
    if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(Val: opr0)) {
      SmallVector<double, 0> DVal;
      for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
        ConstantFP *eltval = dyn_cast<ConstantFP>(
            Val: CV->getElementAsConstant(i: (unsigned)eltNo));
        assert(eltval && "Non-FP arguments in math function!" );
        bool found = false;
        for (int i=0; i < sz; ++i) {
          if (eltval->isExactlyValue(V: tr[i].input)) {
            DVal.push_back(Elt: tr[i].result);
            found = true;
            break;
          }
        }
        if (!found) {
          // This vector constants not handled yet.
          return false;
        }
      }
      LLVMContext &context = CI->getParent()->getParent()->getContext();
      Constant *nval;
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        // Results are collected in double; narrow back to float for f32.
        SmallVector<float, 0> FVal;
        for (double D : DVal)
          FVal.push_back(Elt: (float)D);
        ArrayRef<float> tmp(FVal);
        nval = ConstantDataVector::get(Context&: context, Elts: tmp);
      } else { // F64
        ArrayRef<double> tmp(DVal);
        nval = ConstantDataVector::get(Context&: context, Elts: tmp);
      }
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n" );
      replaceCall(I: CI, With: nval);
      return true;
    }
  } else {
    // Scalar version
    if (ConstantFP *CF = dyn_cast<ConstantFP>(Val: opr0)) {
      for (int i = 0; i < sz; ++i) {
        if (CF->isExactlyValue(V: tr[i].input)) {
          Value *nval = ConstantFP::get(Ty: CF->getType(), V: tr[i].result);
          LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n" );
          replaceCall(I: CI, With: nval);
          return true;
        }
      }
    }
  }

  return false;
}
892 | |
namespace llvm {
// Compute the base-2 logarithm of \p V on the host.
//
// Uses the C99/POSIX ::log2 where the platform advertises it; otherwise
// falls back to the identity log2(x) = ln(x) / ln(2).
static double log2(double V) {
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
  return ::log2(V);
#else
  return log(V) / numbers::ln2;
#endif
}
} // namespace llvm
902 | |
// Fold calls to pow/powr/pown.
//
// Exact special-case exponents (0, 1, 2, -1, +/-0.5) are folded
// unconditionally. Under unsafe finite-only math, small integral exponents
// (|n| <= 12) become repeated multiplication, and the general case is
// expanded to exp2(y * log2(x)) with a sign fixup for pow/pown when the base
// may be negative.
//
// \return true if the call was rewritten or replaced.
bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
                              const FuncInfo &FInfo) {
  assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
          FInfo.getId() == AMDGPULibFunc::EI_POWR ||
          FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
         "fold_pow: encounter a wrong function call" );

  Module *M = B.GetInsertBlock()->getModule();
  Type *eltType = FPOp->getType()->getScalarType();
  Value *opr0 = FPOp->getOperand(i: 0);
  Value *opr1 = FPOp->getOperand(i: 1);

  // Capture the exponent when it is a constant FP (pow/powr) or a constant
  // integer (pown). At most one of CF/CINT is set.
  const APFloat *CF = nullptr;
  const APInt *CINT = nullptr;
  if (!match(V: opr1, P: m_APFloatAllowPoison(Res&: CF)))
    match(V: opr1, P: m_APIntAllowPoison(Res&: CINT));

  // 0x1111111 means that we don't do anything for this call.
  int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);

  if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
    // pow/powr/pown(x, 0) == 1
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n" );
    Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
    }
    replaceCall(I: FPOp, With: cnval);
    return true;
  }
  if ((CF && CF->isExactlyValue(V: 1.0)) || (CINT && ci_opr1 == 1)) {
    // pow/powr/pown(x, 1.0) = x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n" );
    replaceCall(I: FPOp, With: opr0);
    return true;
  }
  if ((CF && CF->isExactlyValue(V: 2.0)) || (CINT && ci_opr1 == 2)) {
    // pow/powr/pown(x, 2.0) = x*x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
                      << *opr0 << "\n" );
    Value *nval = B.CreateFMul(L: opr0, R: opr0, Name: "__pow2" );
    replaceCall(I: FPOp, With: nval);
    return true;
  }
  if ((CF && CF->isExactlyValue(V: -1.0)) || (CINT && ci_opr1 == -1)) {
    // pow/powr/pown(x, -1.0) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n" );
    Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
    }
    Value *nval = B.CreateFDiv(L: cnval, R: opr0, Name: "__powrecip" );
    replaceCall(I: FPOp, With: nval);
    return true;
  }

  if (CF && (CF->isExactlyValue(V: 0.5) || CF->isExactlyValue(V: -0.5))) {
    // pow[r](x, [-]0.5) = sqrt(x)
    bool issqrt = CF->isExactlyValue(V: 0.5);
    if (FunctionCallee FPExpr =
            getFunction(M, fInfo: AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
                                                      : AMDGPULibFunc::EI_RSQRT,
                                            FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
                        << '(' << *opr0 << ")\n" );
      Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: issqrt ? "__pow2sqrt"
                                                                    : "__pow2rsqrt" );
      replaceCall(I: FPOp, With: nval);
      return true;
    }
  }

  // Everything below changes rounding / special-value behavior, so it is
  // only legal under unsafe finite-only math.
  if (!isUnsafeFiniteOnlyMath(FPOp))
    return false;

  // Unsafe Math optimization

  // Remember that ci_opr1 is set if opr1 is integral
  if (CF) {
    // A constant FP exponent that happens to be integral is treated like a
    // pown exponent for the repeated-multiplication expansion below.
    double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
                      ? (double)CF->convertToFloat()
                      : CF->convertToDouble();
    int ival = (int)dval;
    if ((double)ival == dval) {
      ci_opr1 = ival;
    } else
      ci_opr1 = 0x11111111;
  }

  // pow/powr/pown(x, c) = [1/](x*x*..x); where
  //   trunc(c) == c && the number of x == c && |c| <= 12
  unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
  if (abs_opr1 <= 12) {
    Constant *cnval;
    Value *nval;
    if (abs_opr1 == 0) {
      cnval = ConstantFP::get(Ty: eltType, V: 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
      }
      nval = cnval;
    } else {
      // Square-and-multiply: valx2 holds x^(2^k), nval accumulates the
      // product for the set bits of the exponent.
      Value *valx2 = nullptr;
      nval = nullptr;
      while (abs_opr1 > 0) {
        valx2 = valx2 ? B.CreateFMul(L: valx2, R: valx2, Name: "__powx2" ) : opr0;
        if (abs_opr1 & 1) {
          nval = nval ? B.CreateFMul(L: nval, R: valx2, Name: "__powprod" ) : valx2;
        }
        abs_opr1 >>= 1;
      }
    }

    if (ci_opr1 < 0) {
      // Negative exponent: take the reciprocal of the product.
      cnval = ConstantFP::get(Ty: eltType, V: 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
      }
      nval = B.CreateFDiv(L: cnval, R: nval, Name: "__1powprod" );
    }
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                      << ((ci_opr1 < 0) ? "1/prod(" : "prod(" ) << *opr0
                      << ")\n" );
    replaceCall(I: FPOp, With: nval);
    return true;
  }

  // If we should use the generic intrinsic instead of emitting a libcall
  const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();

  // powr ---> exp2(y * log2(x))
  // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
  FunctionCallee ExpExpr;
  if (ShouldUseIntrinsic)
    ExpExpr = Intrinsic::getDeclaration(M, id: Intrinsic::exp2, Tys: {FPOp->getType()});
  else {
    ExpExpr = getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
    if (!ExpExpr)
      return false;
  }

  // needlog:      log2(x) must be computed at runtime (base not constant).
  // needabs:      base may be negative, so log2 gets fabs(x).
  // needcopysign: result's sign must be patched for odd exponents.
  bool needlog = false;
  bool needabs = false;
  bool needcopysign = false;
  Constant *cnval = nullptr;
  if (getVecSize(FInfo) == 1) {
    CF = nullptr;
    match(V: opr0, P: m_APFloatAllowPoison(Res&: CF));

    if (CF) {
      // Constant base: fold log2(|x|) at compile time.
      double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
                     ? (double)CF->convertToFloat()
                     : CF->convertToDouble();

      V = log2(V: std::abs(x: V));
      cnval = ConstantFP::get(Ty: eltType, V);
      needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) &&
                     CF->isNegative();
    } else {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    }
  } else {
    ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(Val: opr0);

    if (!CDV) {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    } else {
      assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
              "Wrong vector size detected" );

      // Constant vector base: fold log2(|x|) lane-by-lane.
      SmallVector<double, 0> DVal;
      for (int i=0; i < getVecSize(FInfo); ++i) {
        double V = CDV->getElementAsAPFloat(i).convertToDouble();
        if (V < 0.0) needcopysign = true;
        V = log2(V: std::abs(x: V));
        DVal.push_back(Elt: V);
      }
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (double D : DVal)
          FVal.push_back(Elt: (float)D);
        ArrayRef<float> tmp(FVal);
        cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
      } else {
        ArrayRef<double> tmp(DVal);
        cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
      }
    }
  }

  if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
    // We cannot handle corner cases for a general pow() function, give up
    // unless y is a constant integral value. Then proceed as if it were pown.
    if (!isKnownIntegral(V: opr1, DL: M->getDataLayout(), FMF: FPOp->getFastMathFlags()))
      return false;
  }

  Value *nval;
  if (needabs) {
    nval = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: opr0, FMFSource: nullptr, Name: "__fabs" );
  } else {
    nval = cnval ? cnval : opr0;
  }
  if (needlog) {
    FunctionCallee LogExpr;
    if (ShouldUseIntrinsic) {
      LogExpr =
          Intrinsic::getDeclaration(M, id: Intrinsic::log2, Tys: {FPOp->getType()});
    } else {
      LogExpr = getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
      if (!LogExpr)
        return false;
    }

    nval = CreateCallEx(B,Callee: LogExpr, Arg: nval, Name: "__log2" );
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_POWN) {
    // convert int(32) to fp(f32 or f64)
    opr1 = B.CreateSIToFP(V: opr1, DestTy: nval->getType(), Name: "pownI2F" );
  }
  nval = B.CreateFMul(L: opr1, R: nval, Name: "__ylogx" );
  nval = CreateCallEx(B,Callee: ExpExpr, Arg: nval, Name: "__exp2" );

  if (needcopysign) {
    // For odd integral y and negative x the result must be negative: shift
    // y's low bit into the sign position, AND it with x's sign bit, and OR
    // that into the exp2 result's bit pattern.
    Type* nTyS = B.getIntNTy(N: eltType->getPrimitiveSizeInBits());
    Type *nTy = FPOp->getType()->getWithNewType(EltTy: nTyS);
    unsigned size = nTy->getScalarSizeInBits();
    Value *opr_n = FPOp->getOperand(i: 1);
    if (opr_n->getType()->getScalarType()->isIntegerTy())
      opr_n = B.CreateZExtOrTrunc(V: opr_n, DestTy: nTy, Name: "__ytou" );
    else
      opr_n = B.CreateFPToSI(V: opr1, DestTy: nTy, Name: "__ytou" );

    Value *sign = B.CreateShl(LHS: opr_n, RHS: size-1, Name: "__yeven" );
    sign = B.CreateAnd(LHS: B.CreateBitCast(V: opr0, DestTy: nTy), RHS: sign, Name: "__pow_sign" );
    nval = B.CreateOr(LHS: B.CreateBitCast(V: nval, DestTy: nTy), RHS: sign);
    nval = B.CreateBitCast(V: nval, DestTy: opr0->getType());
  }

  LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                    << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n" );
  replaceCall(I: FPOp, With: nval);

  return true;
}
1151 | |
// Fold calls to rootn(x, n) with a constant integer n:
//   rootn(x,  1) -> x
//   rootn(x,  2) -> llvm.sqrt(x)
//   rootn(x,  3) -> cbrt(x)
//   rootn(x, -1) -> 1.0 / x
//   rootn(x, -2) -> 1.0 / llvm.sqrt(x)
//
// \return true if the call was rewritten or replaced.
bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
  Value *opr0 = FPOp->getOperand(i: 0);
  Value *opr1 = FPOp->getOperand(i: 1);

  // Only constant integer exponents are handled.
  const APInt *CINT = nullptr;
  if (!match(V: opr1, P: m_APIntAllowPoison(Res&: CINT)))
    return false;

  Function *Parent = B.GetInsertBlock()->getParent();

  int ci_opr1 = (int)CINT->getSExtValue();
  if (ci_opr1 == 1 && !Parent->hasFnAttribute(Kind: Attribute::StrictFP)) {
    // rootn(x, 1) = x
    //
    // TODO: Insert constrained canonicalize for strictfp case.
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << '\n');
    replaceCall(I: FPOp, With: opr0);
    return true;
  }

  Module *M = B.GetInsertBlock()->getModule();

  CallInst *CI = cast<CallInst>(Val: FPOp);
  if (ci_opr1 == 2 &&
      shouldReplaceLibcallWithIntrinsic(CI,
                                        /*AllowMinSizeF32=*/true,
                                        /*AllowF64=*/true)) {
    // rootn(x, 2) = sqrt(x)
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0 << ")\n" );

    CallInst *NewCall = B.CreateUnaryIntrinsic(ID: Intrinsic::sqrt, V: opr0, FMFSource: CI);
    NewCall->takeName(V: CI);

    // OpenCL rootn has a looser ulp of 2 requirement than sqrt, so add some
    // metadata.
    MDBuilder MDHelper(M->getContext());
    MDNode *FPMD = MDHelper.createFPMath(Accuracy: std::max(a: FPOp->getFPAccuracy(), b: 2.0f));
    NewCall->setMetadata(KindID: LLVMContext::MD_fpmath, Node: FPMD);

    replaceCall(I: CI, With: NewCall);
    return true;
  }

  if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
    if (FunctionCallee FPExpr =
            getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0
                        << ")\n" );
      Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: "__rootn2cbrt" );
      replaceCall(I: FPOp, With: nval);
      return true;
    }
  } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n" );
    Value *nval = B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: 1.0),
                               R: opr0,
                               Name: "__rootn2div" );
    replaceCall(I: FPOp, With: nval);
    return true;
  }

  if (ci_opr1 == -2 &&
      shouldReplaceLibcallWithIntrinsic(CI,
                                        /*AllowMinSizeF32=*/true,
                                        /*AllowF64=*/true)) {
    // rootn(x, -2) = rsqrt(x)

    // The original rootn had looser ulp requirements than the resultant sqrt
    // and fdiv.
    MDBuilder MDHelper(M->getContext());
    MDNode *FPMD = MDHelper.createFPMath(Accuracy: std::max(a: FPOp->getFPAccuracy(), b: 2.0f));

    // TODO: Could handle strictfp but need to fix strict sqrt emission
    FastMathFlags FMF = FPOp->getFastMathFlags();
    FMF.setAllowContract(true);

    CallInst *Sqrt = B.CreateUnaryIntrinsic(ID: Intrinsic::sqrt, V: opr0, FMFSource: CI);
    Instruction *RSqrt = cast<Instruction>(
        Val: B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: 1.0), R: Sqrt));
    Sqrt->setFastMathFlags(FMF);
    RSqrt->setFastMathFlags(FMF);
    RSqrt->setMetadata(KindID: LLVMContext::MD_fpmath, Node: FPMD);

    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
                      << ")\n" );
    replaceCall(I: CI, With: RSqrt);
    return true;
  }

  return false;
}
1244 | |
1245 | // Get a scalar native builtin single argument FP function |
1246 | FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M, |
1247 | const FuncInfo &FInfo) { |
1248 | if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(id: FInfo.getId())) |
1249 | return nullptr; |
1250 | FuncInfo nf = FInfo; |
1251 | nf.setPrefix(AMDGPULibFunc::NATIVE); |
1252 | return getFunction(M, fInfo: nf); |
1253 | } |
1254 | |
1255 | // Some library calls are just wrappers around llvm intrinsics, but compiled |
1256 | // conservatively. Preserve the flags from the original call site by |
1257 | // substituting them with direct calls with all the flags. |
1258 | bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI, |
1259 | bool AllowMinSizeF32, |
1260 | bool AllowF64, |
1261 | bool AllowStrictFP) { |
1262 | Type *FltTy = CI->getType()->getScalarType(); |
1263 | const bool IsF32 = FltTy->isFloatTy(); |
1264 | |
1265 | // f64 intrinsics aren't implemented for most operations. |
1266 | if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy())) |
1267 | return false; |
1268 | |
1269 | // We're implicitly inlining by replacing the libcall with the intrinsic, so |
1270 | // don't do it for noinline call sites. |
1271 | if (CI->isNoInline()) |
1272 | return false; |
1273 | |
1274 | const Function *ParentF = CI->getFunction(); |
1275 | // TODO: Handle strictfp |
1276 | if (!AllowStrictFP && ParentF->hasFnAttribute(Kind: Attribute::StrictFP)) |
1277 | return false; |
1278 | |
1279 | if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize()) |
1280 | return false; |
1281 | return true; |
1282 | } |
1283 | |
1284 | void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, |
1285 | CallInst *CI, |
1286 | Intrinsic::ID IntrID) { |
1287 | if (CI->arg_size() == 2) { |
1288 | Value *Arg0 = CI->getArgOperand(i: 0); |
1289 | Value *Arg1 = CI->getArgOperand(i: 1); |
1290 | VectorType *Arg0VecTy = dyn_cast<VectorType>(Val: Arg0->getType()); |
1291 | VectorType *Arg1VecTy = dyn_cast<VectorType>(Val: Arg1->getType()); |
1292 | if (Arg0VecTy && !Arg1VecTy) { |
1293 | Value *SplatRHS = B.CreateVectorSplat(EC: Arg0VecTy->getElementCount(), V: Arg1); |
1294 | CI->setArgOperand(i: 1, v: SplatRHS); |
1295 | } else if (!Arg0VecTy && Arg1VecTy) { |
1296 | Value *SplatLHS = B.CreateVectorSplat(EC: Arg1VecTy->getElementCount(), V: Arg0); |
1297 | CI->setArgOperand(i: 0, v: SplatLHS); |
1298 | } |
1299 | } |
1300 | |
1301 | CI->setCalledFunction( |
1302 | Intrinsic::getDeclaration(M: CI->getModule(), id: IntrID, Tys: {CI->getType()})); |
1303 | } |
1304 | |
1305 | bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic( |
1306 | IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32, |
1307 | bool AllowF64, bool AllowStrictFP) { |
1308 | if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64, |
1309 | AllowStrictFP)) |
1310 | return false; |
1311 | replaceLibCallWithSimpleIntrinsic(B, CI, IntrID); |
1312 | return true; |
1313 | } |
1314 | |
// Emit a call to the sincos library function for \p Arg and return
// {Sin, Cos, SinCos}: the sincos call's return value (the sine), the cosine
// loaded from the output pointer, and the sincos call itself.
//
// The output alloca is placed in the entry block; the call is inserted right
// after \p Arg when it is an instruction so it dominates all existing uses.
std::tuple<Value *, Value *, Value *>
AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
                             FunctionCallee Fsincos) {
  DebugLoc DL = B.getCurrentDebugLocation();
  Function *F = B.GetInsertBlock()->getParent();
  B.SetInsertPointPastAllocas(F);

  // Private-memory slot that receives the cosine result.
  AllocaInst *Alloc = B.CreateAlloca(Ty: Arg->getType(), ArraySize: nullptr, Name: "__sincos_" );

  if (Instruction *ArgInst = dyn_cast<Instruction>(Val: Arg)) {
    // If the argument is an instruction, it must dominate all uses so put our
    // sincos call there. Otherwise, right after the allocas works well enough
    // if it's an argument or constant.

    B.SetInsertPoint(TheBB: ArgInst->getParent(), IP: ++ArgInst->getIterator());

    // SetInsertPoint unwelcomely always tries to set the debug loc.
    B.SetCurrentDebugLocation(DL);
  }

  Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(i: 1);

  // The allocaInst allocates the memory in private address space. This need
  // to be addrspacecasted to point to the address space of cos pointer type.
  // In OpenCL 2.0 this is generic, while in 1.2 that is private.
  Value *CastAlloc = B.CreateAddrSpaceCast(V: Alloc, DestTy: CosPtrTy);

  CallInst *SinCos = CreateCallEx2(B, Callee: Fsincos, Arg1: Arg, Arg2: CastAlloc);

  // TODO: Is it worth trying to preserve the location for the cos calls for the
  // load?

  LoadInst *LoadCos = B.CreateLoad(Ty: Alloc->getAllocatedType(), Ptr: Alloc);
  return {SinCos, LoadCos, SinCos};
}
1350 | |
// fold sin, cos -> sincos.
//
// Scans all users of the current call's argument for sibling sin/cos/sincos
// calls on the same value; when both a sin and a cos are present, emits a
// single sincos call and redirects every matched call to its result. The
// merged call carries the intersection of the fast-math flags, the most
// generic !fpmath, and merged debug locations of all folded calls.
//
// \return true if the calls were merged into a sincos.
bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
                                 const FuncInfo &fInfo) {
  assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
         fInfo.getId() == AMDGPULibFunc::EI_COS);

  // Only plain (unprefixed) f32/f64 sin/cos are merged.
  if ((getArgType(FInfo: fInfo) != AMDGPULibFunc::F32 &&
       getArgType(FInfo: fInfo) != AMDGPULibFunc::F64) ||
      fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
    return false;

  bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;

  Value *CArgVal = FPOp->getOperand(i: 0);
  CallInst *CI = cast<CallInst>(Val: FPOp);

  Function *F = B.GetInsertBlock()->getParent();
  Module *M = F->getParent();

  // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
  // implementation. Prefer the private form if available.
  AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncPrivate.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::PRIVATE_ADDRESS);

  AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncGeneric.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::FLAT_ADDRESS);

  FunctionCallee FSinCosPrivate = getFunction(M, fInfo: SinCosLibFuncPrivate);
  FunctionCallee FSinCosGeneric = getFunction(M, fInfo: SinCosLibFuncGeneric);
  FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
  if (!FSinCos)
    return false;

  SmallVector<CallInst *> SinCalls;
  SmallVector<CallInst *> CosCalls;
  SmallVector<CallInst *> SinCosCalls;
  // The "partner" is the opposite function (cos for sin, sin for cos).
  FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
                       fInfo);
  const std::string PairName = PartnerInfo.mangle();

  StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
  StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
  const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
  const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();

  // Intersect the two sets of flags.
  FastMathFlags FMF = FPOp->getFastMathFlags();
  MDNode *FPMath = CI->getMetadata(KindID: LLVMContext::MD_fpmath);

  SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};

  // Collect every sin/cos/sincos call (in this function, not noinline) that
  // uses the same argument; each contributes its flags/metadata/debug loc.
  for (User* U : CArgVal->users()) {
    CallInst *XI = dyn_cast<CallInst>(Val: U);
    if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
      continue;

    Function *UCallee = XI->getCalledFunction();
    if (!UCallee)
      continue;

    bool Handled = true;

    if (UCallee->getName() == SinName)
      SinCalls.push_back(Elt: XI);
    else if (UCallee->getName() == CosName)
      CosCalls.push_back(Elt: XI);
    else if (UCallee->getName() == SinCosPrivateName ||
             UCallee->getName() == SinCosGenericName)
      SinCosCalls.push_back(Elt: XI);
    else
      Handled = false;

    if (Handled) {
      MergeDbgLocs.push_back(Elt: XI->getDebugLoc());
      auto *OtherOp = cast<FPMathOperator>(Val: XI);
      FMF &= OtherOp->getFastMathFlags();
      FPMath = MDNode::getMostGenericFPMath(
          A: FPMath, B: XI->getMetadata(KindID: LLVMContext::MD_fpmath));
    }
  }

  // Only profitable when both halves of the pair actually exist.
  if (SinCalls.empty() || CosCalls.empty())
    return false;

  B.setFastMathFlags(FMF);
  B.setDefaultFPMathTag(FPMath);
  DILocation *DbgLoc = DILocation::getMergedLocations(Locs: MergeDbgLocs);
  B.SetCurrentDebugLocation(DbgLoc);

  auto [Sin, Cos, SinCos] = insertSinCos(Arg: CArgVal, FMF, B, Fsincos: FSinCos);

  auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
    for (CallInst *C : Calls)
      C->replaceAllUsesWith(V: Res);

    // Leave the other dead instructions to avoid clobbering iterators.
  };

  replaceTrigInsts(SinCalls, Sin);
  replaceTrigInsts(CosCalls, Cos);
  replaceTrigInsts(SinCosCalls, SinCos);

  // It's safe to delete the original now.
  CI->eraseFromParent();
  return true;
}
1459 | |
1460 | bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, |
1461 | double &Res1, Constant *copr0, |
1462 | Constant *copr1) { |
1463 | // By default, opr0/opr1/opr3 holds values of float/double type. |
1464 | // If they are not float/double, each function has to its |
1465 | // operand separately. |
1466 | double opr0 = 0.0, opr1 = 0.0; |
1467 | ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(Val: copr0); |
1468 | ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(Val: copr1); |
1469 | if (fpopr0) { |
1470 | opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64) |
1471 | ? fpopr0->getValueAPF().convertToDouble() |
1472 | : (double)fpopr0->getValueAPF().convertToFloat(); |
1473 | } |
1474 | |
1475 | if (fpopr1) { |
1476 | opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64) |
1477 | ? fpopr1->getValueAPF().convertToDouble() |
1478 | : (double)fpopr1->getValueAPF().convertToFloat(); |
1479 | } |
1480 | |
1481 | switch (FInfo.getId()) { |
1482 | default : return false; |
1483 | |
1484 | case AMDGPULibFunc::EI_ACOS: |
1485 | Res0 = acos(x: opr0); |
1486 | return true; |
1487 | |
1488 | case AMDGPULibFunc::EI_ACOSH: |
1489 | // acosh(x) == log(x + sqrt(x*x - 1)) |
1490 | Res0 = log(x: opr0 + sqrt(x: opr0*opr0 - 1.0)); |
1491 | return true; |
1492 | |
1493 | case AMDGPULibFunc::EI_ACOSPI: |
1494 | Res0 = acos(x: opr0) / MATH_PI; |
1495 | return true; |
1496 | |
1497 | case AMDGPULibFunc::EI_ASIN: |
1498 | Res0 = asin(x: opr0); |
1499 | return true; |
1500 | |
1501 | case AMDGPULibFunc::EI_ASINH: |
1502 | // asinh(x) == log(x + sqrt(x*x + 1)) |
1503 | Res0 = log(x: opr0 + sqrt(x: opr0*opr0 + 1.0)); |
1504 | return true; |
1505 | |
1506 | case AMDGPULibFunc::EI_ASINPI: |
1507 | Res0 = asin(x: opr0) / MATH_PI; |
1508 | return true; |
1509 | |
1510 | case AMDGPULibFunc::EI_ATAN: |
1511 | Res0 = atan(x: opr0); |
1512 | return true; |
1513 | |
1514 | case AMDGPULibFunc::EI_ATANH: |
1515 | // atanh(x) == (log(x+1) - log(x-1))/2; |
1516 | Res0 = (log(x: opr0 + 1.0) - log(x: opr0 - 1.0))/2.0; |
1517 | return true; |
1518 | |
1519 | case AMDGPULibFunc::EI_ATANPI: |
1520 | Res0 = atan(x: opr0) / MATH_PI; |
1521 | return true; |
1522 | |
1523 | case AMDGPULibFunc::EI_CBRT: |
1524 | Res0 = (opr0 < 0.0) ? -pow(x: -opr0, y: 1.0/3.0) : pow(x: opr0, y: 1.0/3.0); |
1525 | return true; |
1526 | |
1527 | case AMDGPULibFunc::EI_COS: |
1528 | Res0 = cos(x: opr0); |
1529 | return true; |
1530 | |
1531 | case AMDGPULibFunc::EI_COSH: |
1532 | Res0 = cosh(x: opr0); |
1533 | return true; |
1534 | |
1535 | case AMDGPULibFunc::EI_COSPI: |
1536 | Res0 = cos(MATH_PI * opr0); |
1537 | return true; |
1538 | |
1539 | case AMDGPULibFunc::EI_EXP: |
1540 | Res0 = exp(x: opr0); |
1541 | return true; |
1542 | |
1543 | case AMDGPULibFunc::EI_EXP2: |
1544 | Res0 = pow(x: 2.0, y: opr0); |
1545 | return true; |
1546 | |
1547 | case AMDGPULibFunc::EI_EXP10: |
1548 | Res0 = pow(x: 10.0, y: opr0); |
1549 | return true; |
1550 | |
1551 | case AMDGPULibFunc::EI_LOG: |
1552 | Res0 = log(x: opr0); |
1553 | return true; |
1554 | |
1555 | case AMDGPULibFunc::EI_LOG2: |
1556 | Res0 = log(x: opr0) / log(x: 2.0); |
1557 | return true; |
1558 | |
1559 | case AMDGPULibFunc::EI_LOG10: |
1560 | Res0 = log(x: opr0) / log(x: 10.0); |
1561 | return true; |
1562 | |
1563 | case AMDGPULibFunc::EI_RSQRT: |
1564 | Res0 = 1.0 / sqrt(x: opr0); |
1565 | return true; |
1566 | |
1567 | case AMDGPULibFunc::EI_SIN: |
1568 | Res0 = sin(x: opr0); |
1569 | return true; |
1570 | |
1571 | case AMDGPULibFunc::EI_SINH: |
1572 | Res0 = sinh(x: opr0); |
1573 | return true; |
1574 | |
1575 | case AMDGPULibFunc::EI_SINPI: |
1576 | Res0 = sin(MATH_PI * opr0); |
1577 | return true; |
1578 | |
1579 | case AMDGPULibFunc::EI_TAN: |
1580 | Res0 = tan(x: opr0); |
1581 | return true; |
1582 | |
1583 | case AMDGPULibFunc::EI_TANH: |
1584 | Res0 = tanh(x: opr0); |
1585 | return true; |
1586 | |
1587 | case AMDGPULibFunc::EI_TANPI: |
1588 | Res0 = tan(MATH_PI * opr0); |
1589 | return true; |
1590 | |
1591 | // two-arg functions |
1592 | case AMDGPULibFunc::EI_POW: |
1593 | case AMDGPULibFunc::EI_POWR: |
1594 | Res0 = pow(x: opr0, y: opr1); |
1595 | return true; |
1596 | |
1597 | case AMDGPULibFunc::EI_POWN: { |
1598 | if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) { |
1599 | double val = (double)iopr1->getSExtValue(); |
1600 | Res0 = pow(x: opr0, y: val); |
1601 | return true; |
1602 | } |
1603 | return false; |
1604 | } |
1605 | |
1606 | case AMDGPULibFunc::EI_ROOTN: { |
1607 | if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) { |
1608 | double val = (double)iopr1->getSExtValue(); |
1609 | Res0 = pow(x: opr0, y: 1.0 / val); |
1610 | return true; |
1611 | } |
1612 | return false; |
1613 | } |
1614 | |
1615 | // with ptr arg |
1616 | case AMDGPULibFunc::EI_SINCOS: |
1617 | Res0 = sin(x: opr0); |
1618 | Res1 = cos(x: opr0); |
1619 | return true; |
1620 | } |
1621 | |
1622 | return false; |
1623 | } |
1624 | |
1625 | bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { |
1626 | int numArgs = (int)aCI->arg_size(); |
1627 | if (numArgs > 3) |
1628 | return false; |
1629 | |
1630 | Constant *copr0 = nullptr; |
1631 | Constant *copr1 = nullptr; |
1632 | if (numArgs > 0) { |
1633 | if ((copr0 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: 0))) == nullptr) |
1634 | return false; |
1635 | } |
1636 | |
1637 | if (numArgs > 1) { |
1638 | if ((copr1 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: 1))) == nullptr) { |
1639 | if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS) |
1640 | return false; |
1641 | } |
1642 | } |
1643 | |
1644 | // At this point, all arguments to aCI are constants. |
1645 | |
1646 | // max vector size is 16, and sincos will generate two results. |
1647 | double DVal0[16], DVal1[16]; |
1648 | int FuncVecSize = getVecSize(FInfo); |
1649 | bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS); |
1650 | if (FuncVecSize == 1) { |
1651 | if (!evaluateScalarMathFunc(FInfo, Res0&: DVal0[0], Res1&: DVal1[0], copr0, copr1)) { |
1652 | return false; |
1653 | } |
1654 | } else { |
1655 | ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(Val: copr0); |
1656 | ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(Val: copr1); |
1657 | for (int i = 0; i < FuncVecSize; ++i) { |
1658 | Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr; |
1659 | Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr; |
1660 | if (!evaluateScalarMathFunc(FInfo, Res0&: DVal0[i], Res1&: DVal1[i], copr0: celt0, copr1: celt1)) { |
1661 | return false; |
1662 | } |
1663 | } |
1664 | } |
1665 | |
1666 | LLVMContext &context = aCI->getContext(); |
1667 | Constant *nval0, *nval1; |
1668 | if (FuncVecSize == 1) { |
1669 | nval0 = ConstantFP::get(Ty: aCI->getType(), V: DVal0[0]); |
1670 | if (hasTwoResults) |
1671 | nval1 = ConstantFP::get(Ty: aCI->getType(), V: DVal1[0]); |
1672 | } else { |
1673 | if (getArgType(FInfo) == AMDGPULibFunc::F32) { |
1674 | SmallVector <float, 0> FVal0, FVal1; |
1675 | for (int i = 0; i < FuncVecSize; ++i) |
1676 | FVal0.push_back(Elt: (float)DVal0[i]); |
1677 | ArrayRef<float> tmp0(FVal0); |
1678 | nval0 = ConstantDataVector::get(Context&: context, Elts: tmp0); |
1679 | if (hasTwoResults) { |
1680 | for (int i = 0; i < FuncVecSize; ++i) |
1681 | FVal1.push_back(Elt: (float)DVal1[i]); |
1682 | ArrayRef<float> tmp1(FVal1); |
1683 | nval1 = ConstantDataVector::get(Context&: context, Elts: tmp1); |
1684 | } |
1685 | } else { |
1686 | ArrayRef<double> tmp0(DVal0); |
1687 | nval0 = ConstantDataVector::get(Context&: context, Elts: tmp0); |
1688 | if (hasTwoResults) { |
1689 | ArrayRef<double> tmp1(DVal1); |
1690 | nval1 = ConstantDataVector::get(Context&: context, Elts: tmp1); |
1691 | } |
1692 | } |
1693 | } |
1694 | |
1695 | if (hasTwoResults) { |
1696 | // sincos |
1697 | assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS && |
1698 | "math function with ptr arg not supported yet" ); |
1699 | new StoreInst(nval1, aCI->getArgOperand(i: 1), aCI->getIterator()); |
1700 | } |
1701 | |
1702 | replaceCall(I: aCI, With: nval0); |
1703 | return true; |
1704 | } |
1705 | |
1706 | PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F, |
1707 | FunctionAnalysisManager &AM) { |
1708 | AMDGPULibCalls Simplifier; |
1709 | Simplifier.initNativeFuncs(); |
1710 | Simplifier.initFunction(F, FAM&: AM); |
1711 | |
1712 | bool Changed = false; |
1713 | |
1714 | LLVM_DEBUG(dbgs() << "AMDIC: process function " ; |
1715 | F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';); |
1716 | |
1717 | for (auto &BB : F) { |
1718 | for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) { |
1719 | // Ignore non-calls. |
1720 | CallInst *CI = dyn_cast<CallInst>(Val&: I); |
1721 | ++I; |
1722 | |
1723 | if (CI) { |
1724 | if (Simplifier.fold(CI)) |
1725 | Changed = true; |
1726 | } |
1727 | } |
1728 | } |
1729 | return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); |
1730 | } |
1731 | |
1732 | PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F, |
1733 | FunctionAnalysisManager &AM) { |
1734 | if (UseNative.empty()) |
1735 | return PreservedAnalyses::all(); |
1736 | |
1737 | AMDGPULibCalls Simplifier; |
1738 | Simplifier.initNativeFuncs(); |
1739 | Simplifier.initFunction(F, FAM&: AM); |
1740 | |
1741 | bool Changed = false; |
1742 | for (auto &BB : F) { |
1743 | for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) { |
1744 | // Ignore non-calls. |
1745 | CallInst *CI = dyn_cast<CallInst>(Val&: I); |
1746 | ++I; |
1747 | if (CI && Simplifier.useNative(aCI: CI)) |
1748 | Changed = true; |
1749 | } |
1750 | } |
1751 | return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); |
1752 | } |
1753 | |