//===- AMDGPULibCalls.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file does AMD library function optimizations.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include <cmath>

#define DEBUG_TYPE "amdgpu-simplifylib"

using namespace llvm;
using namespace llvm::PatternMatch;

static cl::opt<bool> EnablePreLink("amdgpu-prelink",
  cl::desc("Enable pre-link mode optimizations"),
  cl::init(false),
  cl::Hidden);

static cl::list<std::string> UseNative("amdgpu-use-native",
  cl::desc("Comma separated list of functions to replace with native, or all"),
  cl::CommaSeparated, cl::ValueOptional,
  cl::Hidden);

#define MATH_PI numbers::pi
#define MATH_E numbers::e
#define MATH_SQRT2 numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2

namespace llvm {

class AMDGPULibCalls {
private:
  SimplifyQuery SQ;

  using FuncInfo = llvm::AMDGPULibFunc;

  // -fuse-native.
  bool AllNative = false;

  bool useNativeFunc(const StringRef F) const;

  // Return a pointer (pointer expr) to the function if a function definition
  // with "FuncName" exists. It may create a new function prototype in
  // pre-link mode.
  FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);

  bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);

  bool TDOFold(CallInst *CI, const FuncInfo &FInfo);

  /* Specialized optimizations */

  // pow/powr/pown
  bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // rootn
  bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // -fuse-native for sincos
  bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);

  // Evaluate calls whose arguments are constants.
  bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1,
                              Constant *copr0, Constant *copr1);
  bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);

  /// Insert a call to the sincos function \p Fsincos. Returns (value of sin,
  /// value of cos, sincos call).
  std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
                                                     FastMathFlags FMF,
                                                     IRBuilder<> &B,
                                                     FunctionCallee Fsincos);

  // sin/cos
  bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // __read_pipe/__write_pipe
  bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                            const FuncInfo &FInfo);

  // Get a scalar native builtin single argument FP function
  FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);

  /// Substitute a call to a known libcall with an intrinsic call. If \p
  /// AllowMinSizeF32 is true, allow the replacement in a minsize function.
  bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                         bool AllowMinSizeF32 = false,
                                         bool AllowF64 = false,
                                         bool AllowStrictFP = false);
  void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                         Intrinsic::ID IntrID);

  bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                            Intrinsic::ID IntrID,
                                            bool AllowMinSizeF32 = false,
                                            bool AllowF64 = false,
                                            bool AllowStrictFP = false);

protected:
  bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;

  bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;

  static void replaceCall(Instruction *I, Value *With) {
    I->replaceAllUsesWith(With);
    I->eraseFromParent();
  }

  static void replaceCall(FPMathOperator *I, Value *With) {
    replaceCall(cast<Instruction>(I), With);
  }

public:
  AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM);

  bool fold(CallInst *CI);

  void initNativeFuncs();

  // Replace a normal math function call with the native version.
  bool useNative(CallInst *CI);
};

} // end namespace llvm

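// Helpers to emit a library call and propagate the callee's calling
// convention onto the new call site.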
template <typename IRB>
static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
                              const Twine &Name = "") {
  CallInst *R = B.CreateCall(Callee, Arg, Name);
  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
  return R;
}

template <typename IRB>
static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
                               Value *Arg2, const Twine &Name = "") {
  CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
  return R;
}

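// Derive the signature of the pown form of a pow-style call: the same FP
// return and first parameter, with an i32 exponent (widened to a vector of
// i32 when the result type is a vector).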
static FunctionType *getPownType(FunctionType *FT) {
  Type *PowNExpTy = Type::getInt32Ty(FT->getContext());
  if (VectorType *VecTy = dyn_cast<VectorType>(FT->getReturnType()))
    PowNExpTy = VectorType::get(PowNExpTy, VecTy->getElementCount());

  return FunctionType::get(FT->getReturnType(),
                           {FT->getParamType(0), PowNExpTy}, false);
}

// Data structures for table-driven optimizations.
// FuncTbl works for both f32 and f64 functions with 1 input argument.

struct TableEntry {
  double result;
  double input;
};

/* a list of {result, input} */
static const TableEntry tbl_acos[] = {
  {MATH_PI / 2.0, 0.0},
  {MATH_PI / 2.0, -0.0},
  {0.0, 1.0},
  {MATH_PI, -1.0}
};
static const TableEntry tbl_acosh[] = {
  {0.0, 1.0}
};
static const TableEntry tbl_acospi[] = {
  {0.5, 0.0},
  {0.5, -0.0},
  {0.0, 1.0},
  {1.0, -1.0}
};
static const TableEntry tbl_asin[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {MATH_PI / 2.0, 1.0},
  {-MATH_PI / 2.0, -1.0}
};
static const TableEntry tbl_asinh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_asinpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {0.5, 1.0},
  {-0.5, -1.0}
};
static const TableEntry tbl_atan[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {MATH_PI / 4.0, 1.0},
  {-MATH_PI / 4.0, -1.0}
};
static const TableEntry tbl_atanh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_atanpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {0.25, 1.0},
  {-0.25, -1.0}
};
static const TableEntry tbl_cbrt[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {1.0, 1.0},
  {-1.0, -1.0},
};
static const TableEntry tbl_cos[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_cosh[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_cospi[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_erfc[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_erf[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_exp[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {MATH_E, 1.0}
};
static const TableEntry tbl_exp2[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {2.0, 1.0}
};
static const TableEntry tbl_exp10[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {10.0, 1.0}
};
static const TableEntry tbl_expm1[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_log[] = {
  {0.0, 1.0},
  {1.0, MATH_E}
};
static const TableEntry tbl_log2[] = {
  {0.0, 1.0},
  {1.0, 2.0}
};
static const TableEntry tbl_log10[] = {
  {0.0, 1.0},
  {1.0, 10.0}
};
static const TableEntry tbl_rsqrt[] = {
  {1.0, 1.0},
  {MATH_SQRT1_2, 2.0}
};
static const TableEntry tbl_sin[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sinh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sinpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sqrt[] = {
  {0.0, 0.0},
  {1.0, 1.0},
  {MATH_SQRT2, 2.0}
};
static const TableEntry tbl_tan[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tanh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tanpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tgamma[] = {
  {1.0, 1.0},
  {1.0, 2.0},
  {2.0, 3.0},
  {6.0, 4.0}
};

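// Return true if the device library is expected to provide a native_*
// variant of this function.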
static bool HasNative(AMDGPULibFunc::EFuncId id) {
  switch(id) {
  case AMDGPULibFunc::EI_DIVIDE:
  case AMDGPULibFunc::EI_COS:
  case AMDGPULibFunc::EI_EXP:
  case AMDGPULibFunc::EI_EXP2:
  case AMDGPULibFunc::EI_EXP10:
  case AMDGPULibFunc::EI_LOG:
  case AMDGPULibFunc::EI_LOG2:
  case AMDGPULibFunc::EI_LOG10:
  case AMDGPULibFunc::EI_POWR:
  case AMDGPULibFunc::EI_RECIP:
  case AMDGPULibFunc::EI_RSQRT:
  case AMDGPULibFunc::EI_SIN:
  case AMDGPULibFunc::EI_SINCOS:
  case AMDGPULibFunc::EI_SQRT:
  case AMDGPULibFunc::EI_TAN:
    return true;
  default:;
  }
  return false;
}

using TableRef = ArrayRef<TableEntry>;

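// Map a function id to its constant-folding table; native variants share the
// table of the corresponding base function.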
static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
  switch(id) {
  case AMDGPULibFunc::EI_ACOS: return TableRef(tbl_acos);
  case AMDGPULibFunc::EI_ACOSH: return TableRef(tbl_acosh);
  case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi);
  case AMDGPULibFunc::EI_ASIN: return TableRef(tbl_asin);
  case AMDGPULibFunc::EI_ASINH: return TableRef(tbl_asinh);
  case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi);
  case AMDGPULibFunc::EI_ATAN: return TableRef(tbl_atan);
  case AMDGPULibFunc::EI_ATANH: return TableRef(tbl_atanh);
  case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi);
  case AMDGPULibFunc::EI_CBRT: return TableRef(tbl_cbrt);
  case AMDGPULibFunc::EI_NCOS:
  case AMDGPULibFunc::EI_COS: return TableRef(tbl_cos);
  case AMDGPULibFunc::EI_COSH: return TableRef(tbl_cosh);
  case AMDGPULibFunc::EI_COSPI: return TableRef(tbl_cospi);
  case AMDGPULibFunc::EI_ERFC: return TableRef(tbl_erfc);
  case AMDGPULibFunc::EI_ERF: return TableRef(tbl_erf);
  case AMDGPULibFunc::EI_EXP: return TableRef(tbl_exp);
  case AMDGPULibFunc::EI_NEXP2:
  case AMDGPULibFunc::EI_EXP2: return TableRef(tbl_exp2);
  case AMDGPULibFunc::EI_EXP10: return TableRef(tbl_exp10);
  case AMDGPULibFunc::EI_EXPM1: return TableRef(tbl_expm1);
  case AMDGPULibFunc::EI_LOG: return TableRef(tbl_log);
  case AMDGPULibFunc::EI_NLOG2:
  case AMDGPULibFunc::EI_LOG2: return TableRef(tbl_log2);
  case AMDGPULibFunc::EI_LOG10: return TableRef(tbl_log10);
  case AMDGPULibFunc::EI_NRSQRT:
  case AMDGPULibFunc::EI_RSQRT: return TableRef(tbl_rsqrt);
  case AMDGPULibFunc::EI_NSIN:
  case AMDGPULibFunc::EI_SIN: return TableRef(tbl_sin);
  case AMDGPULibFunc::EI_SINH: return TableRef(tbl_sinh);
  case AMDGPULibFunc::EI_SINPI: return TableRef(tbl_sinpi);
  case AMDGPULibFunc::EI_NSQRT:
  case AMDGPULibFunc::EI_SQRT: return TableRef(tbl_sqrt);
  case AMDGPULibFunc::EI_TAN: return TableRef(tbl_tan);
  case AMDGPULibFunc::EI_TANH: return TableRef(tbl_tanh);
  case AMDGPULibFunc::EI_TANPI: return TableRef(tbl_tanpi);
  case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma);
  default:;
  }
  return TableRef();
}

static inline int getVecSize(const AMDGPULibFunc &FInfo) {
  return FInfo.getLeads()[0].VectorSize;
}

static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc &FInfo) {
  return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
}

FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
  // If we are doing PreLinkOpt, the function is external. So it is safe to
  // use getOrInsertFunction() at this stage.

  return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
                       : AMDGPULibFunc::getFunction(M, fInfo);
}

bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
                                       FuncInfo &FInfo) {
  return AMDGPULibFunc::parse(FMangledName, FInfo);
}

bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
  return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs();
}

bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
    const FPMathOperator *FPOp) const {
  // TODO: Refine to approxFunc or contract
  return FPOp->isFast();
}

AMDGPULibCalls::AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM)
    : SQ(F.getParent()->getDataLayout(),
         &FAM.getResult<TargetLibraryAnalysis>(F),
         FAM.getCachedResult<DominatorTreeAnalysis>(F),
         &FAM.getResult<AssumptionAnalysis>(F)) {}

bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
  return AllNative || llvm::is_contained(UseNative, F);
}

void AMDGPULibCalls::initNativeFuncs() {
  AllNative = useNativeFunc("all") ||
              (UseNative.getNumOccurrences() && UseNative.size() == 1 &&
               UseNative.begin()->empty());
}

bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
  bool native_sin = useNativeFunc("sin");
  bool native_cos = useNativeFunc("cos");

  if (native_sin && native_cos) {
    Module *M = aCI->getModule();
    Value *opr0 = aCI->getArgOperand(0);

    AMDGPULibFunc nf;
    nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
    nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_SIN);
    FunctionCallee sinExpr = getFunction(M, nf);

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_COS);
    FunctionCallee cosExpr = getFunction(M, nf);
    if (sinExpr && cosExpr) {
      Value *sinval =
          CallInst::Create(sinExpr, opr0, "splitsin", aCI->getIterator());
      Value *cosval =
          CallInst::Create(cosExpr, opr0, "splitcos", aCI->getIterator());
      new StoreInst(cosval, aCI->getArgOperand(1), aCI->getIterator());

      DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                          << " with native version of sin/cos");

      replaceCall(aCI, sinval);
      return true;
    }
  }
  return false;
}

bool AMDGPULibCalls::useNative(CallInst *aCI) {
  Function *Callee = aCI->getCalledFunction();
  if (!Callee || aCI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(Callee->getName(), FInfo) || !FInfo.isMangled() ||
      FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
      getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()) ||
      !(AllNative || useNativeFunc(FInfo.getName()))) {
    return false;
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
    return sincosUseNative(aCI, FInfo);

  FInfo.setPrefix(AMDGPULibFunc::NATIVE);
  FunctionCallee F = getFunction(aCI->getModule(), FInfo);
  if (!F)
    return false;

  aCI->setCalledFunction(F);
  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                      << " with native version");
  return true;
}

// Clang emits a call to __read_pipe_2 or __read_pipe_4 for the OpenCL
// read_pipe builtin, with appended type size and alignment arguments, where 2
// or 4 indicates the original number of arguments. The library has optimized
// versions of __read_pipe_2/__read_pipe_4 for when the type size and
// alignment are the same power-of-2 value. This function transforms
// __read_pipe_2 to __read_pipe_2_N for such cases, where N is the size in
// bytes of the type (N = 1, 2, 4, 8, ..., 128). The same applies to
// __read_pipe_4, __write_pipe_2, and __write_pipe_4.
bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                                          const FuncInfo &FInfo) {
  auto *Callee = CI->getCalledFunction();
  if (!Callee->isDeclaration())
    return false;

  assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
  auto *M = Callee->getParent();
  std::string Name = std::string(Callee->getName());
  auto NumArg = CI->arg_size();
  if (NumArg != 4 && NumArg != 6)
    return false;
  ConstantInt *PacketSize =
      dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 2));
  ConstantInt *PacketAlign =
      dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 1));
  if (!PacketSize || !PacketAlign)
    return false;

  unsigned Size = PacketSize->getZExtValue();
  Align Alignment = PacketAlign->getAlignValue();
  if (Alignment != Size)
    return false;

  unsigned PtrArgLoc = CI->arg_size() - 3;
  Value *PtrArg = CI->getArgOperand(PtrArgLoc);
  Type *PtrTy = PtrArg->getType();

  SmallVector<llvm::Type *, 6> ArgTys;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    ArgTys.push_back(CI->getArgOperand(I)->getType());
  ArgTys.push_back(PtrTy);

  Name = Name + "_" + std::to_string(Size);
  auto *FTy = FunctionType::get(Callee->getReturnType(),
                                ArrayRef<Type *>(ArgTys), false);
  AMDGPULibFunc NewLibFunc(Name, FTy);
  FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
  if (!F)
    return false;

  SmallVector<Value *, 6> Args;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    Args.push_back(CI->getArgOperand(I));
  Args.push_back(PtrArg);

  auto *NCI = B.CreateCall(F, Args);
  NCI->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(NCI);
  CI->dropAllReferences();
  CI->eraseFromParent();

  return true;
}

// This function returns false if there is no change; true otherwise.
bool AMDGPULibCalls::fold(CallInst *CI) {
  Function *Callee = CI->getCalledFunction();
  // Ignore indirect calls.
  if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(Callee->getName(), FInfo))
    return false;

  // Further check the number of arguments to see if they match.
  // TODO: Check calling convention matches too
  if (!FInfo.isCompatibleSignature(*Callee->getParent(),
                                   CI->getFunctionType()))
    return false;

  LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');

  if (TDOFold(CI, FInfo))
    return true;

  IRBuilder<> B(CI);
  if (CI->isStrictFP())
    B.setIsFPConstrained(true);

  if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) {
    // Under unsafe-math, evaluate calls if possible.
    // According to Brian Sumner, we can do this for all f32 function calls
    // using host's double function calls.
    if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(CI, FInfo))
      return true;

    // Copy fast flags from the original call.
    FastMathFlags FMF = FPOp->getFastMathFlags();
    B.setFastMathFlags(FMF);

    // Specialized optimizations for each function call.
    //
    // TODO: Handle native functions
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_EXP:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_EXP2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp2,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log2,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG10:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log10,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_FMIN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::minnum,
                                                  true, true);
    case AMDGPULibFunc::EI_FMAX:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::maxnum,
                                                  true, true);
    case AMDGPULibFunc::EI_FMA:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fma, true,
                                                  true);
    case AMDGPULibFunc::EI_MAD:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fmuladd,
                                                  true, true);
    case AMDGPULibFunc::EI_FABS:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fabs, true,
                                                  true, true);
    case AMDGPULibFunc::EI_COPYSIGN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::copysign,
                                                  true, true, true);
    case AMDGPULibFunc::EI_FLOOR:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::floor, true,
                                                  true);
    case AMDGPULibFunc::EI_CEIL:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::ceil, true,
                                                  true);
    case AMDGPULibFunc::EI_TRUNC:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::trunc, true,
                                                  true);
    case AMDGPULibFunc::EI_RINT:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::rint, true,
                                                  true);
    case AMDGPULibFunc::EI_ROUND:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::round, true,
                                                  true);
    case AMDGPULibFunc::EI_LDEXP: {
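      // Replace with llvm.ldexp. The exponent operand may be scalar while the
      // result is a vector; splat it first so the operand types line up with
      // the intrinsic declaration built below.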
      if (!shouldReplaceLibcallWithIntrinsic(CI, true, true))
        return false;

      Value *Arg1 = CI->getArgOperand(1);
      if (VectorType *VecTy = dyn_cast<VectorType>(CI->getType());
          VecTy && !isa<VectorType>(Arg1->getType())) {
        Value *SplatArg1 = B.CreateVectorSplat(VecTy->getElementCount(), Arg1);
        CI->setArgOperand(1, SplatArg1);
      }

      CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
          CI->getModule(), Intrinsic::ldexp,
          {CI->getType(), CI->getArgOperand(1)->getType()}));
      return true;
    }
    case AMDGPULibFunc::EI_POW: {
      Module *M = Callee->getParent();
      AMDGPULibFunc PowrInfo(AMDGPULibFunc::EI_POWR, FInfo);
      FunctionCallee PowrFunc = getFunction(M, PowrInfo);
      CallInst *Call = cast<CallInst>(FPOp);

      // pow(x, y) -> powr(x, y) for x >= -0.0
      // TODO: Account for flags on current call
      if (PowrFunc &&
          cannotBeOrderedLessThanZero(FPOp->getOperand(0),
                                      SQ.getWithInstruction(Call))) {
        Call->setCalledFunction(PowrFunc);
        return fold_pow(FPOp, B, PowrInfo) || true;
      }

      // pow(x, y) -> pown(x, y) for known integral y
      if (isKnownIntegral(FPOp->getOperand(1), SQ.getWithInstruction(CI),
                          FPOp->getFastMathFlags())) {
        FunctionType *PownType = getPownType(CI->getFunctionType());
        AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true);
        FunctionCallee PownFunc = getFunction(M, PownInfo);
        if (PownFunc) {
          // TODO: If the incoming integral value is an sitofp/uitofp, it won't
          // fold out without a known range. We can probably take the source
          // value directly.
          Value *CastedArg =
              B.CreateFPToSI(FPOp->getOperand(1), PownType->getParamType(1));
          // Have to drop any nofpclass attributes on the original call site.
          Call->removeParamAttrs(
              1, AttributeFuncs::typeIncompatible(CastedArg->getType(),
                                                  Call->getParamAttributes(1)));
          Call->setCalledFunction(PownFunc);
          Call->setArgOperand(1, CastedArg);
          return fold_pow(FPOp, B, PownInfo) || true;
        }
      }

      return fold_pow(FPOp, B, FInfo);
    }
    case AMDGPULibFunc::EI_POWR:
    case AMDGPULibFunc::EI_POWN:
      return fold_pow(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_ROOTN:
      return fold_rootn(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_SQRT:
      // TODO: Allow with strictfp + constrained intrinsic
      return tryReplaceLibcallWithSimpleIntrinsic(
          B, CI, Intrinsic::sqrt, true, true, /*AllowStrictFP=*/false);
    case AMDGPULibFunc::EI_COS:
    case AMDGPULibFunc::EI_SIN:
      return fold_sincos(FPOp, B, FInfo);
    default:
      break;
    }
  } else {
    // Specialized optimizations for each function call
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_READ_PIPE_2:
    case AMDGPULibFunc::EI_READ_PIPE_4:
    case AMDGPULibFunc::EI_WRITE_PIPE_2:
    case AMDGPULibFunc::EI_WRITE_PIPE_4:
      return fold_read_write_pipe(CI, B, FInfo);
    default:
      break;
    }
  }

  return false;
}

bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
  // Table-Driven optimization
  const TableRef tr = getOptTable(FInfo.getId());
  if (tr.empty())
    return false;

  int const sz = (int)tr.size();
  Value *opr0 = CI->getArgOperand(0);

  if (getVecSize(FInfo) > 1) {
    if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(opr0)) {
      SmallVector<double, 0> DVal;
      for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
        ConstantFP *eltval = dyn_cast<ConstantFP>(
            CV->getElementAsConstant((unsigned)eltNo));
        assert(eltval && "Non-FP arguments in math function!");
        bool found = false;
        for (int i = 0; i < sz; ++i) {
          if (eltval->isExactlyValue(tr[i].input)) {
            DVal.push_back(tr[i].result);
            found = true;
            break;
          }
        }
        if (!found) {
          // This vector constant is not handled yet.
          return false;
        }
      }
      LLVMContext &context = CI->getContext();
      Constant *nval;
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (double D : DVal)
          FVal.push_back((float)D);
        ArrayRef<float> tmp(FVal);
        nval = ConstantDataVector::get(context, tmp);
      } else { // F64
        ArrayRef<double> tmp(DVal);
        nval = ConstantDataVector::get(context, tmp);
      }
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
      replaceCall(CI, nval);
      return true;
    }
  } else {
    // Scalar version
    if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
      for (int i = 0; i < sz; ++i) {
        if (CF->isExactlyValue(tr[i].input)) {
          Value *nval = ConstantFP::get(CF->getType(), tr[i].result);
          LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
          replaceCall(CI, nval);
          return true;
        }
      }
    }
  }

  return false;
}

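// Host-side log2 used while constant folding; not every host libm exposes
// ::log2, so fall back to log(V) / ln(2) when it is unavailable.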
namespace llvm {
static double log2(double V) {
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
  return ::log2(V);
#else
  return log(V) / numbers::ln2;
#endif
}
} // namespace llvm

bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
                              const FuncInfo &FInfo) {
  assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
          FInfo.getId() == AMDGPULibFunc::EI_POWR ||
          FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
         "fold_pow: encounter a wrong function call");

  Module *M = B.GetInsertBlock()->getModule();
  Type *eltType = FPOp->getType()->getScalarType();
  Value *opr0 = FPOp->getOperand(0);
  Value *opr1 = FPOp->getOperand(1);

  const APFloat *CF = nullptr;
  const APInt *CINT = nullptr;
  if (!match(opr1, m_APFloatAllowPoison(CF)))
    match(opr1, m_APIntAllowPoison(CINT));

  // 0x1111111 means that we don't do anything for this call.
  int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);

  if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
    // pow/powr/pown(x, 0) == 1
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
    Constant *cnval = ConstantFP::get(eltType, 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    }
    replaceCall(FPOp, cnval);
    return true;
  }
  if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
    // pow/powr/pown(x, 1.0) = x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
    replaceCall(FPOp, opr0);
    return true;
  }
  if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
    // pow/powr/pown(x, 2.0) = x*x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
                      << *opr0 << "\n");
    Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
    replaceCall(FPOp, nval);
    return true;
  }
  if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
    // pow/powr/pown(x, -1.0) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n");
    Constant *cnval = ConstantFP::get(eltType, 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
    }
    Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip");
    replaceCall(FPOp, nval);
    return true;
  }

  if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
    // pow[r](x, [-]0.5) = sqrt(x)
    bool issqrt = CF->isExactlyValue(0.5);
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
                                                : AMDGPULibFunc::EI_RSQRT,
                                         FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
                        << '(' << *opr0 << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0,
                                 issqrt ? "__pow2sqrt" : "__pow2rsqrt");
      replaceCall(FPOp, nval);
      return true;
    }
  }

  if (!isUnsafeFiniteOnlyMath(FPOp))
    return false;

  // Unsafe Math optimization

  // Remember that ci_opr1 is set if opr1 is integral
  if (CF) {
    double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
                      ? (double)CF->convertToFloat()
                      : CF->convertToDouble();
    int ival = (int)dval;
    if ((double)ival == dval) {
      ci_opr1 = ival;
    } else
      ci_opr1 = 0x11111111;
  }

  // pow/powr/pown(x, c) = [1/](x*x*..x); where
  // trunc(c) == c && the number of x == c && |c| <= 12
  unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
  if (abs_opr1 <= 12) {
    Constant *cnval;
    Value *nval;
    if (abs_opr1 == 0) {
      cnval = ConstantFP::get(eltType, 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
      }
      nval = cnval;
    } else {
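      // Square-and-multiply: valx2 steps through x, x^2, x^4, ... and each
      // set bit of |c| multiplies the corresponding power into the result.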
      Value *valx2 = nullptr;
      nval = nullptr;
      while (abs_opr1 > 0) {
        valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0;
        if (abs_opr1 & 1) {
          nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2;
        }
        abs_opr1 >>= 1;
      }
    }

    if (ci_opr1 < 0) {
      cnval = ConstantFP::get(eltType, 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
      }
      nval = B.CreateFDiv(cnval, nval, "__1powprod");
    }
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                      << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
                      << ")\n");
    replaceCall(FPOp, nval);
    return true;
  }

  // If we should use the generic intrinsic instead of emitting a libcall
  const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();

  // powr ---> exp2(y * log2(x))
  // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
  FunctionCallee ExpExpr;
  if (ShouldUseIntrinsic)
    ExpExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::exp2,
                                                {FPOp->getType()});
  else {
    ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
    if (!ExpExpr)
      return false;
  }

  bool needlog = false;
  bool needabs = false;
  bool needcopysign = false;
  Constant *cnval = nullptr;
  if (getVecSize(FInfo) == 1) {
    CF = nullptr;
    match(opr0, m_APFloatAllowPoison(CF));

    if (CF) {
      double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
                     ? (double)CF->convertToFloat()
                     : CF->convertToDouble();

      V = log2(std::abs(V));
      cnval = ConstantFP::get(eltType, V);
      needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) &&
                     CF->isNegative();
    } else {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    }
  } else {
    ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);

    if (!CDV) {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    } else {
      assert((int)CDV->getNumElements() == getVecSize(FInfo) &&
             "Wrong vector size detected");

      SmallVector<double, 0> DVal;
      for (int i = 0; i < getVecSize(FInfo); ++i) {
        double V = CDV->getElementAsAPFloat(i).convertToDouble();
        if (V < 0.0)
          needcopysign = true;
        V = log2(std::abs(V));
        DVal.push_back(V);
      }
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (double D : DVal)
          FVal.push_back((float)D);
        ArrayRef<float> tmp(FVal);
        cnval = ConstantDataVector::get(M->getContext(), tmp);
      } else {
        ArrayRef<double> tmp(DVal);
        cnval = ConstantDataVector::get(M->getContext(), tmp);
      }
    }
  }

  if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
    // We cannot handle corner cases for a general pow() function, give up
    // unless y is a constant integral value. Then proceed as if it were pown.
    if (!isKnownIntegral(opr1, SQ.getWithInstruction(cast<Instruction>(FPOp)),
                         FPOp->getFastMathFlags()))
      return false;
  }

  Value *nval;
  if (needabs) {
    nval = B.CreateUnaryIntrinsic(Intrinsic::fabs, opr0, nullptr, "__fabs");
  } else {
    nval = cnval ? cnval : opr0;
  }
  if (needlog) {
    FunctionCallee LogExpr;
    if (ShouldUseIntrinsic) {
      LogExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::log2,
                                                  {FPOp->getType()});
    } else {
      LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
      if (!LogExpr)
        return false;
    }

    nval = CreateCallEx(B, LogExpr, nval, "__log2");
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_POWN) {
    // convert int(32) to fp(f32 or f64)
    opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
  }
  nval = B.CreateFMul(opr1, nval, "__ylogx");
  nval = CreateCallEx(B, ExpExpr, nval, "__exp2");

  if (needcopysign) {
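    // For pow/pown with negative x, the result is negative exactly when the
    // integral exponent y is odd: shift y's low bit into the sign position,
    // AND it with the sign bit of x, and OR the resulting sign onto the
    // magnitude computed as exp2(y * log2(|x|)).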
    Type *nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
    Type *nTy = FPOp->getType()->getWithNewType(nTyS);
    unsigned size = nTy->getScalarSizeInBits();
    Value *opr_n = FPOp->getOperand(1);
    if (opr_n->getType()->getScalarType()->isIntegerTy())
      opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
    else
      opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");

    Value *sign = B.CreateShl(opr_n, size - 1, "__yeven");
    sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
    nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
    nval = B.CreateBitCast(nval, opr0->getType());
  }

  LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                    << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
  replaceCall(FPOp, nval);

  return true;
}

bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
  Value *opr0 = FPOp->getOperand(0);
  Value *opr1 = FPOp->getOperand(1);

  const APInt *CINT = nullptr;
  if (!match(opr1, m_APIntAllowPoison(CINT)))
    return false;

  Function *Parent = B.GetInsertBlock()->getParent();

  int ci_opr1 = (int)CINT->getSExtValue();
  if (ci_opr1 == 1 && !Parent->hasFnAttribute(Attribute::StrictFP)) {
    // rootn(x, 1) = x
    //
    // TODO: Insert constrained canonicalize for strictfp case.
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << '\n');
    replaceCall(FPOp, opr0);
    return true;
  }

  Module *M = B.GetInsertBlock()->getModule();

  CallInst *CI = cast<CallInst>(FPOp);
  if (ci_opr1 == 2 &&
      shouldReplaceLibcallWithIntrinsic(CI,
                                        /*AllowMinSizeF32=*/true,
                                        /*AllowF64=*/true)) {
    // rootn(x, 2) = sqrt(x)
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0 << ")\n");

    CallInst *NewCall = B.CreateUnaryIntrinsic(Intrinsic::sqrt, opr0, CI);
    NewCall->takeName(CI);

    // OpenCL rootn has a looser ulp of 2 requirement than sqrt, so add some
    // metadata.
    MDBuilder MDHelper(M->getContext());
    MDNode *FPMD = MDHelper.createFPMath(std::max(FPOp->getFPAccuracy(), 2.0f));
    NewCall->setMetadata(LLVMContext::MD_fpmath, FPMD);

    replaceCall(CI, NewCall);
    return true;
  }

  if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0
                        << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0, "__rootn2cbrt");
      replaceCall(FPOp, nval);
      return true;
    }
  } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n");
    Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0),
                               opr0,
                               "__rootn2div");
    replaceCall(FPOp, nval);
    return true;
  }

  if (ci_opr1 == -2 &&
      shouldReplaceLibcallWithIntrinsic(CI,
                                        /*AllowMinSizeF32=*/true,
                                        /*AllowF64=*/true)) {
    // rootn(x, -2) = rsqrt(x)

    // The original rootn had looser ulp requirements than the resultant sqrt
    // and fdiv.
    MDBuilder MDHelper(M->getContext());
    MDNode *FPMD = MDHelper.createFPMath(std::max(FPOp->getFPAccuracy(), 2.0f));

    // TODO: Could handle strictfp but need to fix strict sqrt emission
    FastMathFlags FMF = FPOp->getFastMathFlags();
    FMF.setAllowContract(true);

    CallInst *Sqrt = B.CreateUnaryIntrinsic(Intrinsic::sqrt, opr0, CI);
    Instruction *RSqrt = cast<Instruction>(
        B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0), Sqrt));
    Sqrt->setFastMathFlags(FMF);
    RSqrt->setFastMathFlags(FMF);
    RSqrt->setMetadata(LLVMContext::MD_fpmath, FPMD);

    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
                      << ")\n");
    replaceCall(CI, RSqrt);
    return true;
  }

  return false;
}

// Get a scalar native builtin single argument FP function
FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
                                                 const FuncInfo &FInfo) {
  if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()))
    return nullptr;
  FuncInfo nf = FInfo;
  nf.setPrefix(AMDGPULibFunc::NATIVE);
  return getFunction(M, nf);
}

// Some library calls are just wrappers around llvm intrinsics, but compiled
// conservatively. Preserve the flags from the original call site by
// substituting them with direct calls with all the flags.
bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                                       bool AllowMinSizeF32,
                                                       bool AllowF64,
                                                       bool AllowStrictFP) {
  Type *FltTy = CI->getType()->getScalarType();
  const bool IsF32 = FltTy->isFloatTy();

  // f64 intrinsics aren't implemented for most operations.
  if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy()))
    return false;

  // We're implicitly inlining by replacing the libcall with the intrinsic, so
  // don't do it for noinline call sites.
  if (CI->isNoInline())
    return false;

  const Function *ParentF = CI->getFunction();
  // TODO: Handle strictfp
  if (!AllowStrictFP && ParentF->hasFnAttribute(Attribute::StrictFP))
    return false;

  if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
    return false;
  return true;
}

void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
                                                       CallInst *CI,
                                                       Intrinsic::ID IntrID) {
  if (CI->arg_size() == 2) {
    Value *Arg0 = CI->getArgOperand(0);
    Value *Arg1 = CI->getArgOperand(1);
    VectorType *Arg0VecTy = dyn_cast<VectorType>(Arg0->getType());
    VectorType *Arg1VecTy = dyn_cast<VectorType>(Arg1->getType());
    if (Arg0VecTy && !Arg1VecTy) {
      Value *SplatRHS =
          B.CreateVectorSplat(Arg0VecTy->getElementCount(), Arg1);
      CI->setArgOperand(1, SplatRHS);
    } else if (!Arg0VecTy && Arg1VecTy) {
      Value *SplatLHS =
          B.CreateVectorSplat(Arg1VecTy->getElementCount(), Arg0);
      CI->setArgOperand(0, SplatLHS);
    }
  }

  CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
      CI->getModule(), IntrID, {CI->getType()}));
}

bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
    IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32,
    bool AllowF64, bool AllowStrictFP) {
  if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64,
                                         AllowStrictFP))
    return false;
  replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
  return true;
}

std::tuple<Value *, Value *, Value *>
AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
                             FunctionCallee Fsincos) {
  DebugLoc DL = B.getCurrentDebugLocation();
  Function *F = B.GetInsertBlock()->getParent();
  B.SetInsertPointPastAllocas(F);
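  // Create the out-parameter slot for cos at function entry so the alloca
  // dominates every call site being merged.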

  AllocaInst *Alloc = B.CreateAlloca(Arg->getType(), nullptr, "__sincos_");

  if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
    // If the argument is an instruction, it must dominate all uses so put our
    // sincos call there. Otherwise, right after the allocas works well enough
    // if it's an argument or constant.

    B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());

    // SetInsertPoint unwelcomely always tries to set the debug loc.
    B.SetCurrentDebugLocation(DL);
  }

  Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(1);

  // The alloca allocates memory in the private address space, which needs to
  // be addrspacecast to the address space of the cos pointer type. In OpenCL
  // 2.0 that is generic, while in 1.2 it is private.
  Value *CastAlloc = B.CreateAddrSpaceCast(Alloc, CosPtrTy);

  CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, CastAlloc);

  // TODO: Is it worth trying to preserve the location for the cos calls for
  // the load?

  LoadInst *LoadCos = B.CreateLoad(Arg->getType(), Alloc);
  return {SinCos, LoadCos, SinCos};
}

// fold sin, cos -> sincos.
bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
                                 const FuncInfo &fInfo) {
  assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
         fInfo.getId() == AMDGPULibFunc::EI_COS);

  if ((getArgType(fInfo) != AMDGPULibFunc::F32 &&
       getArgType(fInfo) != AMDGPULibFunc::F64) ||
      fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
    return false;

  bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;

  Value *CArgVal = FPOp->getOperand(0);

  // TODO: Constant fold the call
  if (isa<ConstantData>(CArgVal))
    return false;

  CallInst *CI = cast<CallInst>(FPOp);

  Function *F = B.GetInsertBlock()->getParent();
  Module *M = F->getParent();

  // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
  // implementation. Prefer the private form if available.
  AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncPrivate.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::PRIVATE_ADDRESS);

  AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncGeneric.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);

  FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate);
  FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric);
  FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
  if (!FSinCos)
    return false;

  SmallVector<CallInst *> SinCalls;
  SmallVector<CallInst *> CosCalls;
  SmallVector<CallInst *> SinCosCalls;
  FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
                       fInfo);
  const std::string PairName = PartnerInfo.mangle();

  StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
  StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
  const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
  const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();

  // Intersect the two sets of flags.
  FastMathFlags FMF = FPOp->getFastMathFlags();
  MDNode *FPMath = CI->getMetadata(LLVMContext::MD_fpmath);

  SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};

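  // Scan the other users of the argument for sin, cos, and sincos calls in
  // this function that can all share a single sincos result.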
  for (User *U : CArgVal->users()) {
    CallInst *XI = dyn_cast<CallInst>(U);
    if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
      continue;

    Function *UCallee = XI->getCalledFunction();
    if (!UCallee)
      continue;

    bool Handled = true;

    if (UCallee->getName() == SinName)
      SinCalls.push_back(XI);
    else if (UCallee->getName() == CosName)
      CosCalls.push_back(XI);
    else if (UCallee->getName() == SinCosPrivateName ||
             UCallee->getName() == SinCosGenericName)
      SinCosCalls.push_back(XI);
    else
      Handled = false;

    if (Handled) {
      MergeDbgLocs.push_back(XI->getDebugLoc());
      auto *OtherOp = cast<FPMathOperator>(XI);
      FMF &= OtherOp->getFastMathFlags();
      FPMath = MDNode::getMostGenericFPMath(
          FPMath, XI->getMetadata(LLVMContext::MD_fpmath));
    }
  }

  if (SinCalls.empty() || CosCalls.empty())
    return false;

  B.setFastMathFlags(FMF);
  B.setDefaultFPMathTag(FPMath);
  DILocation *DbgLoc = DILocation::getMergedLocations(MergeDbgLocs);
  B.SetCurrentDebugLocation(DbgLoc);

  auto [Sin, Cos, SinCos] = insertSinCos(CArgVal, FMF, B, FSinCos);

  auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
    for (CallInst *C : Calls)
      C->replaceAllUsesWith(Res);

    // Leave the other dead instructions to avoid clobbering iterators.
  };

  replaceTrigInsts(SinCalls, Sin);
  replaceTrigInsts(CosCalls, Cos);
  replaceTrigInsts(SinCosCalls, SinCos);

  // It's safe to delete the original now.
  CI->eraseFromParent();
  return true;
}

bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0,
                                            double &Res1, Constant *copr0,
                                            Constant *copr1) {
  // By default, opr0/opr1 hold values of float/double type.
  // If they are not float/double, each function has to convert its operand
  // separately.
  double opr0 = 0.0, opr1 = 0.0;
  ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
  ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
  if (fpopr0) {
    opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
               ? fpopr0->getValueAPF().convertToDouble()
               : (double)fpopr0->getValueAPF().convertToFloat();
  }

  if (fpopr1) {
    opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
               ? fpopr1->getValueAPF().convertToDouble()
               : (double)fpopr1->getValueAPF().convertToFloat();
  }

  switch (FInfo.getId()) {
  default: return false;

  case AMDGPULibFunc::EI_ACOS:
    Res0 = acos(opr0);
    return true;

  case AMDGPULibFunc::EI_ACOSH:
    // acosh(x) == log(x + sqrt(x*x - 1))
    Res0 = log(opr0 + sqrt(opr0 * opr0 - 1.0));
    return true;

  case AMDGPULibFunc::EI_ACOSPI:
    Res0 = acos(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_ASIN:
    Res0 = asin(opr0);
    return true;

  case AMDGPULibFunc::EI_ASINH:
    // asinh(x) == log(x + sqrt(x*x + 1))
    Res0 = log(opr0 + sqrt(opr0 * opr0 + 1.0));
    return true;

  case AMDGPULibFunc::EI_ASINPI:
    Res0 = asin(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_ATAN:
    Res0 = atan(opr0);
    return true;

  case AMDGPULibFunc::EI_ATANH:
    // atanh(x) == (log(x+1) - log(x-1))/2;
    Res0 = (log(opr0 + 1.0) - log(opr0 - 1.0)) / 2.0;
    return true;

  case AMDGPULibFunc::EI_ATANPI:
    Res0 = atan(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_CBRT:
    Res0 = (opr0 < 0.0) ? -pow(-opr0, 1.0 / 3.0) : pow(opr0, 1.0 / 3.0);
    return true;

  case AMDGPULibFunc::EI_COS:
    Res0 = cos(opr0);
    return true;

  case AMDGPULibFunc::EI_COSH:
    Res0 = cosh(opr0);
    return true;

  case AMDGPULibFunc::EI_COSPI:
    Res0 = cos(MATH_PI * opr0);
    return true;

  case AMDGPULibFunc::EI_EXP:
    Res0 = exp(opr0);
    return true;

  case AMDGPULibFunc::EI_EXP2:
    Res0 = pow(2.0, opr0);
    return true;

  case AMDGPULibFunc::EI_EXP10:
    Res0 = pow(10.0, opr0);
    return true;

  case AMDGPULibFunc::EI_LOG:
    Res0 = log(opr0);
    return true;

  case AMDGPULibFunc::EI_LOG2:
    Res0 = log(opr0) / log(2.0);
    return true;

  case AMDGPULibFunc::EI_LOG10:
    Res0 = log(opr0) / log(10.0);
    return true;

  case AMDGPULibFunc::EI_RSQRT:
    Res0 = 1.0 / sqrt(opr0);
    return true;

  case AMDGPULibFunc::EI_SIN:
    Res0 = sin(opr0);
    return true;

  case AMDGPULibFunc::EI_SINH:
    Res0 = sinh(opr0);
    return true;

  case AMDGPULibFunc::EI_SINPI:
    Res0 = sin(MATH_PI * opr0);
    return true;

  case AMDGPULibFunc::EI_TAN:
    Res0 = tan(opr0);
    return true;

  case AMDGPULibFunc::EI_TANH:
    Res0 = tanh(opr0);
    return true;

  case AMDGPULibFunc::EI_TANPI:
    Res0 = tan(MATH_PI * opr0);
    return true;

  // two-arg functions
  case AMDGPULibFunc::EI_POW:
  case AMDGPULibFunc::EI_POWR:
    Res0 = pow(opr0, opr1);
    return true;

  case AMDGPULibFunc::EI_POWN: {
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(opr0, val);
      return true;
    }
    return false;
  }

  case AMDGPULibFunc::EI_ROOTN: {
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(opr0, 1.0 / val);
      return true;
    }
    return false;
  }

  // with ptr arg
  case AMDGPULibFunc::EI_SINCOS:
    Res0 = sin(opr0);
    Res1 = cos(opr0);
    return true;
  }

  return false;
}

bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
  int numArgs = (int)aCI->arg_size();
  if (numArgs > 3)
    return false;

  Constant *copr0 = nullptr;
  Constant *copr1 = nullptr;
  if (numArgs > 0) {
    if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr)
      return false;
  }

  if (numArgs > 1) {
    if ((copr1 = dyn_cast<Constant>(aCI->getArgOperand(1))) == nullptr) {
      if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
        return false;
    }
  }

  // At this point, all arguments to aCI are constants.

  // max vector size is 16, and sincos will generate two results.
  double DVal0[16], DVal1[16];
  int FuncVecSize = getVecSize(FInfo);
  bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
  if (FuncVecSize == 1) {
    if (!evaluateScalarMathFunc(FInfo, DVal0[0], DVal1[0], copr0, copr1)) {
      return false;
    }
  } else {
    ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0);
    ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1);
    for (int i = 0; i < FuncVecSize; ++i) {
      Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
      Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
      if (!evaluateScalarMathFunc(FInfo, DVal0[i], DVal1[i], celt0, celt1)) {
        return false;
      }
    }
  }

  LLVMContext &context = aCI->getContext();
  Constant *nval0, *nval1;
  if (FuncVecSize == 1) {
    nval0 = ConstantFP::get(aCI->getType(), DVal0[0]);
    if (hasTwoResults)
      nval1 = ConstantFP::get(aCI->getType(), DVal1[0]);
  } else {
    if (getArgType(FInfo) == AMDGPULibFunc::F32) {
      SmallVector<float, 0> FVal0, FVal1;
      for (int i = 0; i < FuncVecSize; ++i)
        FVal0.push_back((float)DVal0[i]);
      ArrayRef<float> tmp0(FVal0);
      nval0 = ConstantDataVector::get(context, tmp0);
      if (hasTwoResults) {
        for (int i = 0; i < FuncVecSize; ++i)
          FVal1.push_back((float)DVal1[i]);
        ArrayRef<float> tmp1(FVal1);
        nval1 = ConstantDataVector::get(context, tmp1);
      }
    } else {
      ArrayRef<double> tmp0(DVal0);
      nval0 = ConstantDataVector::get(context, tmp0);
      if (hasTwoResults) {
        ArrayRef<double> tmp1(DVal1);
        nval1 = ConstantDataVector::get(context, tmp1);
      }
    }
  }

  if (hasTwoResults) {
    // sincos
    assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
           "math function with ptr arg not supported yet");
    new StoreInst(nval1, aCI->getArgOperand(1), aCI->getIterator());
  }

  replaceCall(aCI, nval0);
  return true;
}

PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
  AMDGPULibCalls Simplifier(F, AM);
  Simplifier.initNativeFuncs();

  bool Changed = false;

  LLVM_DEBUG(dbgs() << "AMDIC: process function ";
             F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);

  for (auto &BB : F) {
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
      // Ignore non-calls.
      CallInst *CI = dyn_cast<CallInst>(I);
      ++I;

      if (CI) {
        if (Simplifier.fold(CI))
          Changed = true;
      }
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
  if (UseNative.empty())
    return PreservedAnalyses::all();

  AMDGPULibCalls Simplifier(F, AM);
  Simplifier.initNativeFuncs();

  bool Changed = false;
  for (auto &BB : F) {
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
      // Ignore non-calls.
      CallInst *CI = dyn_cast<CallInst>(I);
      ++I;
      if (CI && Simplifier.useNative(CI))
        Changed = true;
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
