//===- AMDGPULibCalls.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file does AMD library function optimizations.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include <cmath>

#define DEBUG_TYPE "amdgpu-simplifylib"

using namespace llvm;
using namespace llvm::PatternMatch;

static cl::opt<bool> EnablePreLink("amdgpu-prelink",
  cl::desc("Enable pre-link mode optimizations"),
  cl::init(false),
  cl::Hidden);

static cl::list<std::string> UseNative("amdgpu-use-native",
  cl::desc("Comma separated list of functions to replace with native, or all"),
  cl::CommaSeparated, cl::ValueOptional,
  cl::Hidden);

#define MATH_PI numbers::pi
#define MATH_E numbers::e
#define MATH_SQRT2 numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2

namespace llvm {

class AMDGPULibCalls {
private:
  const TargetLibraryInfo *TLInfo = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;

  using FuncInfo = llvm::AMDGPULibFunc;

  bool UnsafeFPMath = false;

  // -fuse-native.
  bool AllNative = false;

  bool useNativeFunc(const StringRef F) const;

  // Return a FunctionCallee for the function if a definition with the given
  // mangled name exists. It may create a new function prototype in pre-link
  // mode.
  FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);

  bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);

  bool TDOFold(CallInst *CI, const FuncInfo &FInfo);

  /* Specialized optimizations */

  // pow/powr/pown
  bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // rootn
  bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // -fuse-native for sincos
  bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);

  // Evaluate calls whose arguments are constants.
  bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1,
                              Constant *copr0, Constant *copr1);
  bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);

  /// Insert a call to the sincos function \p Fsincos for \p Arg. Returns
  /// (value of sin, value of cos, sincos call).
  std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
                                                     FastMathFlags FMF,
                                                     IRBuilder<> &B,
                                                     FunctionCallee Fsincos);

  // sin/cos
  bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // __read_pipe/__write_pipe
  bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                            const FuncInfo &FInfo);

  // Get a scalar native builtin single-argument FP function.
  FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);

  /// Substitute a call to a known libcall with an intrinsic call. If \p
  /// AllowMinSizeF32 is true, allow the replacement in a minsize function.
  bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                         bool AllowMinSizeF32 = false,
                                         bool AllowF64 = false,
                                         bool AllowStrictFP = false);
  void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                         Intrinsic::ID IntrID);

  bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                            Intrinsic::ID IntrID,
                                            bool AllowMinSizeF32 = false,
                                            bool AllowF64 = false,
                                            bool AllowStrictFP = false);

protected:
  bool isUnsafeMath(const FPMathOperator *FPOp) const;
  bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;

  bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;

  static void replaceCall(Instruction *I, Value *With) {
    I->replaceAllUsesWith(With);
    I->eraseFromParent();
  }

  static void replaceCall(FPMathOperator *I, Value *With) {
    replaceCall(cast<Instruction>(I), With);
  }

public:
  AMDGPULibCalls() = default;

  bool fold(CallInst *CI);

  void initFunction(Function &F, FunctionAnalysisManager &FAM);
  void initNativeFuncs();

  // Replace a normal math function call with the native version.
  bool useNative(CallInst *CI);
};

} // end namespace llvm

template <typename IRB>
static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
                              const Twine &Name = "") {
  CallInst *R = B.CreateCall(Callee, Arg, Name);
  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
  return R;
}

template <typename IRB>
static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
                               Value *Arg2, const Twine &Name = "") {
  CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
  return R;
}

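// Build the pown signature from a pow-style signature: the exponent parameter
// becomes i32 (or a vector of i32 with the element count of the return type),
// e.g. (<2 x float>, <2 x float>) maps to (<2 x float>, <2 x i32>).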
static FunctionType *getPownType(FunctionType *FT) {
  Type *PowNExpTy = Type::getInt32Ty(FT->getContext());
  if (VectorType *VecTy = dyn_cast<VectorType>(FT->getReturnType()))
    PowNExpTy = VectorType::get(PowNExpTy, VecTy->getElementCount());

  return FunctionType::get(FT->getReturnType(),
                           {FT->getParamType(0), PowNExpTy}, false);
}

// Data structures for table-driven optimizations.
// FuncTbl works for both f32 and f64 functions with 1 input argument.

struct TableEntry {
  double result;
  double input;
};

/* a list of {result, input} */
static const TableEntry tbl_acos[] = {
  {MATH_PI / 2.0, 0.0},
  {MATH_PI / 2.0, -0.0},
  {0.0, 1.0},
  {MATH_PI, -1.0}
};
static const TableEntry tbl_acosh[] = {
  {0.0, 1.0}
};
static const TableEntry tbl_acospi[] = {
  {0.5, 0.0},
  {0.5, -0.0},
  {0.0, 1.0},
  {1.0, -1.0}
};
static const TableEntry tbl_asin[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {MATH_PI / 2.0, 1.0},
  {-MATH_PI / 2.0, -1.0}
};
static const TableEntry tbl_asinh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_asinpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {0.5, 1.0},
  {-0.5, -1.0}
};
static const TableEntry tbl_atan[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {MATH_PI / 4.0, 1.0},
  {-MATH_PI / 4.0, -1.0}
};
static const TableEntry tbl_atanh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_atanpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {0.25, 1.0},
  {-0.25, -1.0}
};
static const TableEntry tbl_cbrt[] = {
  {0.0, 0.0},
  {-0.0, -0.0},
  {1.0, 1.0},
  {-1.0, -1.0},
};
static const TableEntry tbl_cos[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_cosh[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_cospi[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_erfc[] = {
  {1.0, 0.0},
  {1.0, -0.0}
};
static const TableEntry tbl_erf[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_exp[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {MATH_E, 1.0}
};
static const TableEntry tbl_exp2[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {2.0, 1.0}
};
static const TableEntry tbl_exp10[] = {
  {1.0, 0.0},
  {1.0, -0.0},
  {10.0, 1.0}
};
static const TableEntry tbl_expm1[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_log[] = {
  {0.0, 1.0},
  {1.0, MATH_E}
};
static const TableEntry tbl_log2[] = {
  {0.0, 1.0},
  {1.0, 2.0}
};
static const TableEntry tbl_log10[] = {
  {0.0, 1.0},
  {1.0, 10.0}
};
static const TableEntry tbl_rsqrt[] = {
  {1.0, 1.0},
  {MATH_SQRT1_2, 2.0}
};
static const TableEntry tbl_sin[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sinh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sinpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_sqrt[] = {
  {0.0, 0.0},
  {1.0, 1.0},
  {MATH_SQRT2, 2.0}
};
static const TableEntry tbl_tan[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tanh[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tanpi[] = {
  {0.0, 0.0},
  {-0.0, -0.0}
};
static const TableEntry tbl_tgamma[] = {
  {1.0, 1.0},
  {1.0, 2.0},
  {2.0, 3.0},
  {6.0, 4.0}
};

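// Return true if the device library provides a native_* variant of this
// function; these are the only candidates for the -amdgpu-use-native
// replacement below.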
static bool HasNative(AMDGPULibFunc::EFuncId id) {
  switch(id) {
  case AMDGPULibFunc::EI_DIVIDE:
  case AMDGPULibFunc::EI_COS:
  case AMDGPULibFunc::EI_EXP:
  case AMDGPULibFunc::EI_EXP2:
  case AMDGPULibFunc::EI_EXP10:
  case AMDGPULibFunc::EI_LOG:
  case AMDGPULibFunc::EI_LOG2:
  case AMDGPULibFunc::EI_LOG10:
  case AMDGPULibFunc::EI_POWR:
  case AMDGPULibFunc::EI_RECIP:
  case AMDGPULibFunc::EI_RSQRT:
  case AMDGPULibFunc::EI_SIN:
  case AMDGPULibFunc::EI_SINCOS:
  case AMDGPULibFunc::EI_SQRT:
  case AMDGPULibFunc::EI_TAN:
    return true;
  default:;
  }
  return false;
}

using TableRef = ArrayRef<TableEntry>;

static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
  switch(id) {
  case AMDGPULibFunc::EI_ACOS: return TableRef(tbl_acos);
  case AMDGPULibFunc::EI_ACOSH: return TableRef(tbl_acosh);
  case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi);
  case AMDGPULibFunc::EI_ASIN: return TableRef(tbl_asin);
  case AMDGPULibFunc::EI_ASINH: return TableRef(tbl_asinh);
  case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi);
  case AMDGPULibFunc::EI_ATAN: return TableRef(tbl_atan);
  case AMDGPULibFunc::EI_ATANH: return TableRef(tbl_atanh);
  case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi);
  case AMDGPULibFunc::EI_CBRT: return TableRef(tbl_cbrt);
  case AMDGPULibFunc::EI_NCOS:
  case AMDGPULibFunc::EI_COS: return TableRef(tbl_cos);
  case AMDGPULibFunc::EI_COSH: return TableRef(tbl_cosh);
  case AMDGPULibFunc::EI_COSPI: return TableRef(tbl_cospi);
  case AMDGPULibFunc::EI_ERFC: return TableRef(tbl_erfc);
  case AMDGPULibFunc::EI_ERF: return TableRef(tbl_erf);
  case AMDGPULibFunc::EI_EXP: return TableRef(tbl_exp);
  case AMDGPULibFunc::EI_NEXP2:
  case AMDGPULibFunc::EI_EXP2: return TableRef(tbl_exp2);
  case AMDGPULibFunc::EI_EXP10: return TableRef(tbl_exp10);
  case AMDGPULibFunc::EI_EXPM1: return TableRef(tbl_expm1);
  case AMDGPULibFunc::EI_LOG: return TableRef(tbl_log);
  case AMDGPULibFunc::EI_NLOG2:
  case AMDGPULibFunc::EI_LOG2: return TableRef(tbl_log2);
  case AMDGPULibFunc::EI_LOG10: return TableRef(tbl_log10);
  case AMDGPULibFunc::EI_NRSQRT:
  case AMDGPULibFunc::EI_RSQRT: return TableRef(tbl_rsqrt);
  case AMDGPULibFunc::EI_NSIN:
  case AMDGPULibFunc::EI_SIN: return TableRef(tbl_sin);
  case AMDGPULibFunc::EI_SINH: return TableRef(tbl_sinh);
  case AMDGPULibFunc::EI_SINPI: return TableRef(tbl_sinpi);
  case AMDGPULibFunc::EI_NSQRT:
  case AMDGPULibFunc::EI_SQRT: return TableRef(tbl_sqrt);
  case AMDGPULibFunc::EI_TAN: return TableRef(tbl_tan);
  case AMDGPULibFunc::EI_TANH: return TableRef(tbl_tanh);
  case AMDGPULibFunc::EI_TANPI: return TableRef(tbl_tanpi);
  case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma);
  default:;
  }
  return TableRef();
}

static inline int getVecSize(const AMDGPULibFunc& FInfo) {
  return FInfo.getLeads()[0].VectorSize;
}

static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
  return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
}

FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
  // If we are doing PreLinkOpt, the function is external. So it is safe to
  // use getOrInsertFunction() at this stage.

  return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
                       : AMDGPULibFunc::getFunction(M, fInfo);
}

bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
                                       FuncInfo &FInfo) {
  return AMDGPULibFunc::parse(FMangledName, FInfo);
}

bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const {
  return UnsafeFPMath || FPOp->isFast();
}

bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
  return UnsafeFPMath ||
         (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs());
}

bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
    const FPMathOperator *FPOp) const {
  // TODO: Refine to approxFunc or contract
  return isUnsafeMath(FPOp);
}

void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) {
  UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool();
  AC = &FAM.getResult<AssumptionAnalysis>(F);
  TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
  DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
}

bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
  return AllNative || llvm::is_contained(UseNative, F);
}

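// Passing -amdgpu-use-native with no value (the option is ValueOptional), or
// with the single entry "all", enables the native replacement for every
// supported function.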
void AMDGPULibCalls::initNativeFuncs() {
  AllNative = useNativeFunc("all") ||
              (UseNative.getNumOccurrences() && UseNative.size() == 1 &&
               UseNative.begin()->empty());
}

bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
  bool native_sin = useNativeFunc("sin");
  bool native_cos = useNativeFunc("cos");

  if (native_sin && native_cos) {
    Module *M = aCI->getModule();
    Value *opr0 = aCI->getArgOperand(0);

    AMDGPULibFunc nf;
    nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
    nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_SIN);
    FunctionCallee sinExpr = getFunction(M, nf);

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_COS);
    FunctionCallee cosExpr = getFunction(M, nf);
    if (sinExpr && cosExpr) {
      Value *sinval =
          CallInst::Create(sinExpr, opr0, "splitsin", aCI->getIterator());
      Value *cosval =
          CallInst::Create(cosExpr, opr0, "splitcos", aCI->getIterator());
      new StoreInst(cosval, aCI->getArgOperand(1), aCI->getIterator());

      DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                          << " with native version of sin/cos");

      replaceCall(aCI, sinval);
      return true;
    }
  }
  return false;
}

bool AMDGPULibCalls::useNative(CallInst *aCI) {
  Function *Callee = aCI->getCalledFunction();
  if (!Callee || aCI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(Callee->getName(), FInfo) || !FInfo.isMangled() ||
      FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
      getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()) ||
      !(AllNative || useNativeFunc(FInfo.getName()))) {
    return false;
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
    return sincosUseNative(aCI, FInfo);

  FInfo.setPrefix(AMDGPULibFunc::NATIVE);
  FunctionCallee F = getFunction(aCI->getModule(), FInfo);
  if (!F)
    return false;

  aCI->setCalledFunction(F);
  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                      << " with native version");
  return true;
}

// Clang emits calls of __read_pipe_2 or __read_pipe_4 for the OpenCL read_pipe
// builtin, with appended type size and alignment arguments, where 2 or 4
// indicates the original number of arguments. The library has an optimized
// version of __read_pipe_2/__read_pipe_4 for the case where the type size and
// the alignment are the same power-of-2 value. This function transforms
// __read_pipe_2 to __read_pipe_2_N for such cases, where N is the size in
// bytes of the type (N = 1, 2, 4, 8, ..., 128). The same applies to
// __read_pipe_4, __write_pipe_2, and __write_pipe_4.
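// For example, with a packet type of size 4 and alignment 4:
//   __read_pipe_2(pipe, ptr, 4, 4)  -->  __read_pipe_2_4(pipe, ptr)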
bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                                          const FuncInfo &FInfo) {
  auto *Callee = CI->getCalledFunction();
  if (!Callee->isDeclaration())
    return false;

  assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
  auto *M = Callee->getParent();
  std::string Name = std::string(Callee->getName());
  auto NumArg = CI->arg_size();
  if (NumArg != 4 && NumArg != 6)
    return false;
  ConstantInt *PacketSize =
      dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 2));
  ConstantInt *PacketAlign =
      dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 1));
  if (!PacketSize || !PacketAlign)
    return false;

  unsigned Size = PacketSize->getZExtValue();
  Align Alignment = PacketAlign->getAlignValue();
  if (Alignment != Size)
    return false;

  unsigned PtrArgLoc = CI->arg_size() - 3;
  Value *PtrArg = CI->getArgOperand(PtrArgLoc);
  Type *PtrTy = PtrArg->getType();

  SmallVector<llvm::Type *, 6> ArgTys;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    ArgTys.push_back(CI->getArgOperand(I)->getType());
  ArgTys.push_back(PtrTy);

  Name = Name + "_" + std::to_string(Size);
  auto *FTy = FunctionType::get(Callee->getReturnType(),
                                ArrayRef<Type *>(ArgTys), false);
  AMDGPULibFunc NewLibFunc(Name, FTy);
  FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc);
  if (!F)
    return false;

  SmallVector<Value *, 6> Args;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    Args.push_back(CI->getArgOperand(I));
  Args.push_back(PtrArg);

  auto *NCI = B.CreateCall(F, Args);
  NCI->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(NCI);
  CI->dropAllReferences();
  CI->eraseFromParent();

  return true;
}

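// Return true if V is known to have an integral floating-point value (every
// element, for vectors). Poison counts as integral; undef does not.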
static bool isKnownIntegral(const Value *V, const DataLayout &DL,
                            FastMathFlags FMF) {
  if (isa<PoisonValue>(V))
    return true;
  if (isa<UndefValue>(V))
    return false;

  if (const ConstantFP *CF = dyn_cast<ConstantFP>(V))
    return CF->getValueAPF().isInteger();

  auto *VFVTy = dyn_cast<FixedVectorType>(V->getType());
  const Constant *CV = dyn_cast<Constant>(V);
  if (VFVTy && CV) {
    unsigned NumElts = VFVTy->getNumElements();
    for (unsigned i = 0; i != NumElts; ++i) {
      Constant *Elt = CV->getAggregateElement(i);
      if (!Elt)
        return false;
      if (isa<PoisonValue>(Elt))
        continue;

      const ConstantFP *CFP = dyn_cast<ConstantFP>(Elt);
      if (!CFP || !CFP->getValue().isInteger())
        return false;
    }

    return true;
  }

  const Instruction *I = dyn_cast<Instruction>(V);
  if (!I)
    return false;

  switch (I->getOpcode()) {
  case Instruction::SIToFP:
  case Instruction::UIToFP:
    // TODO: Could check nofpclass(inf) on incoming argument
    if (FMF.noInfs())
      return true;

    // Need to check the int size cannot produce infinity, which
    // computeKnownFPClass knows how to do already.
    return isKnownNeverInfinity(I, SimplifyQuery(DL));
  case Instruction::Call: {
    const CallInst *CI = cast<CallInst>(I);
    switch (CI->getIntrinsicID()) {
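    // These rounding intrinsics produce an integral result for any finite
    // input; only an inf or nan input escapes, so ruling those out suffices.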
    case Intrinsic::trunc:
    case Intrinsic::floor:
    case Intrinsic::ceil:
    case Intrinsic::rint:
    case Intrinsic::nearbyint:
    case Intrinsic::round:
    case Intrinsic::roundeven:
      return (FMF.noInfs() && FMF.noNaNs()) ||
             isKnownNeverInfOrNaN(I, SimplifyQuery(DL));
    default:
      break;
    }

    break;
  }
  default:
    break;
  }

  return false;
}

// Returns true if a change was made to the call; false otherwise.
bool AMDGPULibCalls::fold(CallInst *CI) {
  Function *Callee = CI->getCalledFunction();
  // Ignore indirect calls.
  if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(Callee->getName(), FInfo))
    return false;

  // Further check the number of arguments to see if they match.
  // TODO: Check calling convention matches too
  if (!FInfo.isCompatibleSignature(*Callee->getParent(), CI->getFunctionType()))
    return false;

  LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');

  if (TDOFold(CI, FInfo))
    return true;

  IRBuilder<> B(CI);
  if (CI->isStrictFP())
    B.setIsFPConstrained(true);

  if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) {
    // Under unsafe-math, evaluate calls if possible.
    // According to Brian Sumner, we can do this for all f32 function calls
    // using host's double function calls.
    if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(CI, FInfo))
      return true;

    // Copy fast flags from the original call.
    FastMathFlags FMF = FPOp->getFastMathFlags();
    B.setFastMathFlags(FMF);

    // Specialized optimizations for each function call.
    //
    // TODO: Handle native functions
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_EXP:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_EXP2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp2,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log2,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG10:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log10,
                                                  FMF.approxFunc());
    case AMDGPULibFunc::EI_FMIN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::minnum,
                                                  true, true);
    case AMDGPULibFunc::EI_FMAX:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::maxnum,
                                                  true, true);
    case AMDGPULibFunc::EI_FMA:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fma, true,
                                                  true);
    case AMDGPULibFunc::EI_MAD:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fmuladd,
                                                  true, true);
    case AMDGPULibFunc::EI_FABS:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fabs, true,
                                                  true, true);
    case AMDGPULibFunc::EI_COPYSIGN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::copysign,
                                                  true, true, true);
    case AMDGPULibFunc::EI_FLOOR:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::floor, true,
                                                  true);
    case AMDGPULibFunc::EI_CEIL:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::ceil, true,
                                                  true);
    case AMDGPULibFunc::EI_TRUNC:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::trunc, true,
                                                  true);
    case AMDGPULibFunc::EI_RINT:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::rint, true,
                                                  true);
    case AMDGPULibFunc::EI_ROUND:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::round, true,
                                                  true);
    case AMDGPULibFunc::EI_LDEXP: {
      if (!shouldReplaceLibcallWithIntrinsic(CI, true, true))
        return false;

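      // The llvm.ldexp intrinsic expects a vector exponent when the result
      // type is a vector, while the library call may pass a scalar exponent;
      // splat it first.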
      Value *Arg1 = CI->getArgOperand(1);
      if (VectorType *VecTy = dyn_cast<VectorType>(CI->getType());
          VecTy && !isa<VectorType>(Arg1->getType())) {
        Value *SplatArg1 = B.CreateVectorSplat(VecTy->getElementCount(), Arg1);
        CI->setArgOperand(1, SplatArg1);
      }

      CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
          CI->getModule(), Intrinsic::ldexp,
          {CI->getType(), CI->getArgOperand(1)->getType()}));
      return true;
    }
    case AMDGPULibFunc::EI_POW: {
      Module *M = Callee->getParent();
      AMDGPULibFunc PowrInfo(AMDGPULibFunc::EI_POWR, FInfo);
      FunctionCallee PowrFunc = getFunction(M, PowrInfo);
      CallInst *Call = cast<CallInst>(FPOp);

      // pow(x, y) -> powr(x, y) for x >= -0.0
      // TODO: Account for flags on current call
      if (PowrFunc &&
          cannotBeOrderedLessThanZero(
              FPOp->getOperand(0),
              SimplifyQuery(M->getDataLayout(), TLInfo, DT, AC, Call))) {
        Call->setCalledFunction(PowrFunc);
        return fold_pow(FPOp, B, PowrInfo) || true;
      }

      // pow(x, y) -> pown(x, y) for known integral y
      if (isKnownIntegral(FPOp->getOperand(1), M->getDataLayout(),
                          FPOp->getFastMathFlags())) {
        FunctionType *PownType = getPownType(CI->getFunctionType());
        AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true);
        FunctionCallee PownFunc = getFunction(M, PownInfo);
        if (PownFunc) {
          // TODO: If the incoming integral value is an sitofp/uitofp, it won't
          // fold out without a known range. We can probably take the source
          // value directly.
          Value *CastedArg =
              B.CreateFPToSI(FPOp->getOperand(1), PownType->getParamType(1));
          // Have to drop any nofpclass attributes on the original call site.
          Call->removeParamAttrs(
              1, AttributeFuncs::typeIncompatible(CastedArg->getType(),
                                                  Call->getParamAttributes(1)));
          Call->setCalledFunction(PownFunc);
          Call->setArgOperand(1, CastedArg);
          return fold_pow(FPOp, B, PownInfo) || true;
        }
      }

      return fold_pow(FPOp, B, FInfo);
    }
    case AMDGPULibFunc::EI_POWR:
    case AMDGPULibFunc::EI_POWN:
      return fold_pow(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_ROOTN:
      return fold_rootn(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_SQRT:
      // TODO: Allow with strictfp + constrained intrinsic
      return tryReplaceLibcallWithSimpleIntrinsic(
          B, CI, Intrinsic::sqrt, true, true, /*AllowStrictFP=*/false);
    case AMDGPULibFunc::EI_COS:
    case AMDGPULibFunc::EI_SIN:
      return fold_sincos(FPOp, B, FInfo);
    default:
      break;
    }
  } else {
    // Specialized optimizations for each function call
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_READ_PIPE_2:
    case AMDGPULibFunc::EI_READ_PIPE_4:
    case AMDGPULibFunc::EI_WRITE_PIPE_2:
    case AMDGPULibFunc::EI_WRITE_PIPE_4:
      return fold_read_write_pipe(CI, B, FInfo);
    default:
      break;
    }
  }

  return false;
}

bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
  // Table-driven optimization.
  const TableRef tr = getOptTable(FInfo.getId());
  if (tr.empty())
    return false;

  int const sz = (int)tr.size();
  Value *opr0 = CI->getArgOperand(0);

  if (getVecSize(FInfo) > 1) {
    if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(opr0)) {
      SmallVector<double, 0> DVal;
      for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
        ConstantFP *eltval = dyn_cast<ConstantFP>(
            CV->getElementAsConstant((unsigned)eltNo));
        assert(eltval && "Non-FP arguments in math function!");
        bool found = false;
        for (int i = 0; i < sz; ++i) {
          if (eltval->isExactlyValue(tr[i].input)) {
            DVal.push_back(tr[i].result);
            found = true;
            break;
          }
        }
        if (!found) {
          // This vector constant is not handled yet.
          return false;
        }
      }
      LLVMContext &context = CI->getParent()->getParent()->getContext();
      Constant *nval;
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (double D : DVal)
          FVal.push_back((float)D);
        ArrayRef<float> tmp(FVal);
        nval = ConstantDataVector::get(context, tmp);
      } else { // F64
        ArrayRef<double> tmp(DVal);
        nval = ConstantDataVector::get(context, tmp);
      }
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
      replaceCall(CI, nval);
      return true;
    }
  } else {
    // Scalar version.
    if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
      for (int i = 0; i < sz; ++i) {
        if (CF->isExactlyValue(tr[i].input)) {
          Value *nval = ConstantFP::get(CF->getType(), tr[i].result);
          LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
          replaceCall(CI, nval);
          return true;
        }
      }
    }
  }

  return false;
}
889
890namespace llvm {
891static double log2(double V) {
892#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
893 return ::log2(x: V);
894#else
895 return log(V) / numbers::ln2;
896#endif
897}
898} // namespace llvm
899
900bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
901 const FuncInfo &FInfo) {
902 assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
903 FInfo.getId() == AMDGPULibFunc::EI_POWR ||
904 FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
905 "fold_pow: encounter a wrong function call");
906
907 Module *M = B.GetInsertBlock()->getModule();
908 Type *eltType = FPOp->getType()->getScalarType();
909 Value *opr0 = FPOp->getOperand(i: 0);
910 Value *opr1 = FPOp->getOperand(i: 1);
911
912 const APFloat *CF = nullptr;
913 const APInt *CINT = nullptr;
914 if (!match(V: opr1, P: m_APFloatAllowPoison(Res&: CF)))
915 match(V: opr1, P: m_APIntAllowPoison(Res&: CINT));
916
917 // 0x1111111 means that we don't do anything for this call.
918 int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);
919
920 if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
921 // pow/powr/pown(x, 0) == 1
922 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
923 Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0);
924 if (getVecSize(FInfo) > 1) {
925 cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
926 }
927 replaceCall(I: FPOp, With: cnval);
928 return true;
929 }
930 if ((CF && CF->isExactlyValue(V: 1.0)) || (CINT && ci_opr1 == 1)) {
931 // pow/powr/pown(x, 1.0) = x
932 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
933 replaceCall(I: FPOp, With: opr0);
934 return true;
935 }
936 if ((CF && CF->isExactlyValue(V: 2.0)) || (CINT && ci_opr1 == 2)) {
937 // pow/powr/pown(x, 2.0) = x*x
938 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
939 << *opr0 << "\n");
940 Value *nval = B.CreateFMul(L: opr0, R: opr0, Name: "__pow2");
941 replaceCall(I: FPOp, With: nval);
942 return true;
943 }
944 if ((CF && CF->isExactlyValue(V: -1.0)) || (CINT && ci_opr1 == -1)) {
945 // pow/powr/pown(x, -1.0) = 1.0/x
946 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n");
947 Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0);
948 if (getVecSize(FInfo) > 1) {
949 cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
950 }
951 Value *nval = B.CreateFDiv(L: cnval, R: opr0, Name: "__powrecip");
952 replaceCall(I: FPOp, With: nval);
953 return true;
954 }
955
956 if (CF && (CF->isExactlyValue(V: 0.5) || CF->isExactlyValue(V: -0.5))) {
957 // pow[r](x, [-]0.5) = sqrt(x)
958 bool issqrt = CF->isExactlyValue(V: 0.5);
959 if (FunctionCallee FPExpr =
960 getFunction(M, fInfo: AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
961 : AMDGPULibFunc::EI_RSQRT,
962 FInfo))) {
963 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
964 << '(' << *opr0 << ")\n");
965 Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: issqrt ? "__pow2sqrt"
966 : "__pow2rsqrt");
967 replaceCall(I: FPOp, With: nval);
968 return true;
969 }
970 }
971
972 if (!isUnsafeFiniteOnlyMath(FPOp))
973 return false;
974
975 // Unsafe Math optimization
976
977 // Remember that ci_opr1 is set if opr1 is integral
978 if (CF) {
979 double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
980 ? (double)CF->convertToFloat()
981 : CF->convertToDouble();
982 int ival = (int)dval;
983 if ((double)ival == dval) {
984 ci_opr1 = ival;
985 } else
986 ci_opr1 = 0x11111111;
987 }
988
989 // pow/powr/pown(x, c) = [1/](x*x*..x); where
990 // trunc(c) == c && the number of x == c && |c| <= 12
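  // The expansion below is square-and-multiply over the bits of |c|:
  // e.g. for |c| == 11 (0b1011) it emits x * x^2 * x^8.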
  unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
  if (abs_opr1 <= 12) {
    Constant *cnval;
    Value *nval;
    if (abs_opr1 == 0) {
      cnval = ConstantFP::get(eltType, 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
      }
      nval = cnval;
    } else {
      Value *valx2 = nullptr;
      nval = nullptr;
      while (abs_opr1 > 0) {
        valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0;
        if (abs_opr1 & 1) {
          nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2;
        }
        abs_opr1 >>= 1;
      }
    }

    if (ci_opr1 < 0) {
      cnval = ConstantFP::get(eltType, 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
      }
      nval = B.CreateFDiv(cnval, nval, "__1powprod");
    }
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                      << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
                      << ")\n");
    replaceCall(FPOp, nval);
    return true;
  }

  // If we should use the generic intrinsic instead of emitting a libcall
  const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();

  // powr ---> exp2(y * log2(x))
  // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
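  // For pown/pow with negative x, powr(|x|, y) gives the magnitude; the result
  // is negative exactly when x is negative and the integral exponent y is odd.
  // The bit logic further below shifts the low bit of y into the sign position
  // and ANDs it with the sign of x to recover that sign.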
  FunctionCallee ExpExpr;
  if (ShouldUseIntrinsic)
    ExpExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::exp2,
                                                {FPOp->getType()});
  else {
    ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
    if (!ExpExpr)
      return false;
  }

  bool needlog = false;
  bool needabs = false;
  bool needcopysign = false;
  Constant *cnval = nullptr;
  if (getVecSize(FInfo) == 1) {
    CF = nullptr;
    match(opr0, m_APFloatAllowPoison(CF));

    if (CF) {
      double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
                     ? (double)CF->convertToFloat()
                     : CF->convertToDouble();

      V = log2(std::abs(V));
      cnval = ConstantFP::get(eltType, V);
      needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) &&
                     CF->isNegative();
    } else {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    }
  } else {
    ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);

    if (!CDV) {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    } else {
      assert((int)CDV->getNumElements() == getVecSize(FInfo) &&
             "Wrong vector size detected");

      SmallVector<double, 0> DVal;
      for (int i = 0; i < getVecSize(FInfo); ++i) {
        double V = CDV->getElementAsAPFloat(i).convertToDouble();
        if (V < 0.0) needcopysign = true;
        V = log2(std::abs(V));
        DVal.push_back(V);
      }
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (double D : DVal)
          FVal.push_back((float)D);
        ArrayRef<float> tmp(FVal);
        cnval = ConstantDataVector::get(M->getContext(), tmp);
      } else {
        ArrayRef<double> tmp(DVal);
        cnval = ConstantDataVector::get(M->getContext(), tmp);
      }
    }
  }

  if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
    // We cannot handle corner cases for a general pow() function, give up
    // unless y is a constant integral value. Then proceed as if it were pown.
    if (!isKnownIntegral(opr1, M->getDataLayout(), FPOp->getFastMathFlags()))
      return false;
  }

  Value *nval;
  if (needabs) {
    nval = B.CreateUnaryIntrinsic(Intrinsic::fabs, opr0, nullptr, "__fabs");
  } else {
    nval = cnval ? cnval : opr0;
  }
  if (needlog) {
    FunctionCallee LogExpr;
    if (ShouldUseIntrinsic) {
      LogExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::log2,
                                                  {FPOp->getType()});
    } else {
      LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
      if (!LogExpr)
        return false;
    }

    nval = CreateCallEx(B, LogExpr, nval, "__log2");
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_POWN) {
    // convert int(32) to fp(f32 or f64)
    opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
  }
  nval = B.CreateFMul(opr1, nval, "__ylogx");
  nval = CreateCallEx(B, ExpExpr, nval, "__exp2");

  if (needcopysign) {
    Type *nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
    Type *nTy = FPOp->getType()->getWithNewType(nTyS);
    unsigned size = nTy->getScalarSizeInBits();
    Value *opr_n = FPOp->getOperand(1);
    if (opr_n->getType()->getScalarType()->isIntegerTy())
      opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
    else
      opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");

    Value *sign = B.CreateShl(opr_n, size - 1, "__yeven");
    sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
    nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
    nval = B.CreateBitCast(nval, opr0->getType());
  }

  LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                    << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
  replaceCall(FPOp, nval);

  return true;
}

bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
  Value *opr0 = FPOp->getOperand(0);
  Value *opr1 = FPOp->getOperand(1);

  const APInt *CINT = nullptr;
  if (!match(opr1, m_APIntAllowPoison(CINT)))
    return false;

  Function *Parent = B.GetInsertBlock()->getParent();

  int ci_opr1 = (int)CINT->getSExtValue();
  if (ci_opr1 == 1 && !Parent->hasFnAttribute(Attribute::StrictFP)) {
    // rootn(x, 1) = x
    //
    // TODO: Insert constrained canonicalize for strictfp case.
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << '\n');
    replaceCall(FPOp, opr0);
    return true;
  }

  Module *M = B.GetInsertBlock()->getModule();

  CallInst *CI = cast<CallInst>(FPOp);
  if (ci_opr1 == 2 &&
      shouldReplaceLibcallWithIntrinsic(CI,
                                        /*AllowMinSizeF32=*/true,
                                        /*AllowF64=*/true)) {
    // rootn(x, 2) = sqrt(x)
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0 << ")\n");

    CallInst *NewCall = B.CreateUnaryIntrinsic(Intrinsic::sqrt, opr0, CI);
    NewCall->takeName(CI);

    // OpenCL rootn has a looser ulp of 2 requirement than sqrt, so add some
    // metadata.
    MDBuilder MDHelper(M->getContext());
    MDNode *FPMD = MDHelper.createFPMath(std::max(FPOp->getFPAccuracy(), 2.0f));
    NewCall->setMetadata(LLVMContext::MD_fpmath, FPMD);

    replaceCall(CI, NewCall);
    return true;
  }

  if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
    if (FunctionCallee FPExpr =
            getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0
                        << ")\n");
      Value *nval = CreateCallEx(B, FPExpr, opr0, "__rootn2cbrt");
      replaceCall(FPOp, nval);
      return true;
    }
  } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n");
    Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0),
                               opr0,
                               "__rootn2div");
    replaceCall(FPOp, nval);
    return true;
  }

  if (ci_opr1 == -2 &&
      shouldReplaceLibcallWithIntrinsic(CI,
                                        /*AllowMinSizeF32=*/true,
                                        /*AllowF64=*/true)) {
    // rootn(x, -2) = rsqrt(x)

    // The original rootn had looser ulp requirements than the resultant sqrt
    // and fdiv.
    MDBuilder MDHelper(M->getContext());
    MDNode *FPMD = MDHelper.createFPMath(std::max(FPOp->getFPAccuracy(), 2.0f));

    // TODO: Could handle strictfp but need to fix strict sqrt emission
    FastMathFlags FMF = FPOp->getFastMathFlags();
    FMF.setAllowContract(true);

    CallInst *Sqrt = B.CreateUnaryIntrinsic(Intrinsic::sqrt, opr0, CI);
    Instruction *RSqrt = cast<Instruction>(
        B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0), Sqrt));
    Sqrt->setFastMathFlags(FMF);
    RSqrt->setFastMathFlags(FMF);
    RSqrt->setMetadata(LLVMContext::MD_fpmath, FPMD);

    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
                      << ")\n");
    replaceCall(CI, RSqrt);
    return true;
  }

  return false;
}

// Get a scalar native builtin single-argument FP function.
FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
                                                 const FuncInfo &FInfo) {
  if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()))
    return nullptr;
  FuncInfo nf = FInfo;
  nf.setPrefix(AMDGPULibFunc::NATIVE);
  return getFunction(M, nf);
}

// Some library calls are just wrappers around llvm intrinsics, but compiled
// conservatively. Preserve the flags from the original call site by
// substituting them with direct calls with all the flags.
bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                                       bool AllowMinSizeF32,
                                                       bool AllowF64,
                                                       bool AllowStrictFP) {
  Type *FltTy = CI->getType()->getScalarType();
  const bool IsF32 = FltTy->isFloatTy();

  // f64 intrinsics aren't implemented for most operations.
  if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy()))
    return false;

  // We're implicitly inlining by replacing the libcall with the intrinsic, so
  // don't do it for noinline call sites.
  if (CI->isNoInline())
    return false;

  const Function *ParentF = CI->getFunction();
  // TODO: Handle strictfp
  if (!AllowStrictFP && ParentF->hasFnAttribute(Attribute::StrictFP))
    return false;

  if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
    return false;
  return true;
}

void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
                                                       CallInst *CI,
                                                       Intrinsic::ID IntrID) {
  if (CI->arg_size() == 2) {
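    // The libcall may mix a vector operand with a scalar one; the intrinsics
    // require matching operand types, so splat the scalar side to the other
    // side's element count.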
    Value *Arg0 = CI->getArgOperand(0);
    Value *Arg1 = CI->getArgOperand(1);
    VectorType *Arg0VecTy = dyn_cast<VectorType>(Arg0->getType());
    VectorType *Arg1VecTy = dyn_cast<VectorType>(Arg1->getType());
    if (Arg0VecTy && !Arg1VecTy) {
      Value *SplatRHS = B.CreateVectorSplat(Arg0VecTy->getElementCount(), Arg1);
      CI->setArgOperand(1, SplatRHS);
    } else if (!Arg0VecTy && Arg1VecTy) {
      Value *SplatLHS = B.CreateVectorSplat(Arg1VecTy->getElementCount(), Arg0);
      CI->setArgOperand(0, SplatLHS);
    }
  }

  CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
      CI->getModule(), IntrID, {CI->getType()}));
}

bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
    IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32,
    bool AllowF64, bool AllowStrictFP) {
  if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64,
                                         AllowStrictFP))
    return false;
  replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
  return true;
}

std::tuple<Value *, Value *, Value *>
AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
                             FunctionCallee Fsincos) {
  DebugLoc DL = B.getCurrentDebugLocation();
  Function *F = B.GetInsertBlock()->getParent();
  B.SetInsertPointPastAllocas(F);

  AllocaInst *Alloc = B.CreateAlloca(Arg->getType(), nullptr, "__sincos_");

  if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
    // If the argument is an instruction, it must dominate all uses so put our
    // sincos call there. Otherwise, right after the allocas works well enough
    // if it's an argument or constant.

    B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());

    // SetInsertPoint unwelcomely always tries to set the debug loc.
    B.SetCurrentDebugLocation(DL);
  }

  Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(1);

  // The alloca allocates memory in the private address space, which needs to
  // be addrspacecast to the address space of the cos pointer type. In
  // OpenCL 2.0 that is generic, while in 1.2 it is private.
  Value *CastAlloc = B.CreateAddrSpaceCast(Alloc, CosPtrTy);

  CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, CastAlloc);

  // TODO: Is it worth trying to preserve the location of the cos call for the
  // load?

  LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
  return {SinCos, LoadCos, SinCos};
}

// fold sin, cos -> sincos.
bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
                                 const FuncInfo &fInfo) {
  assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
         fInfo.getId() == AMDGPULibFunc::EI_COS);

  if ((getArgType(fInfo) != AMDGPULibFunc::F32 &&
       getArgType(fInfo) != AMDGPULibFunc::F64) ||
      fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
    return false;

  bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;

  Value *CArgVal = FPOp->getOperand(0);

  // TODO: Constant fold the call
  if (isa<ConstantData>(CArgVal))
    return false;

  CallInst *CI = cast<CallInst>(FPOp);

  Function *F = B.GetInsertBlock()->getParent();
  Module *M = F->getParent();

  // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
  // implementation. Prefer the private form if available.
  AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncPrivate.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::PRIVATE_ADDRESS);

  AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncGeneric.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);

  FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate);
  FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric);
  FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
  if (!FSinCos)
    return false;

  SmallVector<CallInst *> SinCalls;
  SmallVector<CallInst *> CosCalls;
  SmallVector<CallInst *> SinCosCalls;
  FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
                       fInfo);
  const std::string PairName = PartnerInfo.mangle();

  StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
  StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
  const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
  const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();

  // Intersect the two sets of flags.
  FastMathFlags FMF = FPOp->getFastMathFlags();
  MDNode *FPMath = CI->getMetadata(LLVMContext::MD_fpmath);

  SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};

  for (User *U : CArgVal->users()) {
    CallInst *XI = dyn_cast<CallInst>(U);
    if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
      continue;

    Function *UCallee = XI->getCalledFunction();
    if (!UCallee)
      continue;

    bool Handled = true;

    if (UCallee->getName() == SinName)
      SinCalls.push_back(XI);
    else if (UCallee->getName() == CosName)
      CosCalls.push_back(XI);
    else if (UCallee->getName() == SinCosPrivateName ||
             UCallee->getName() == SinCosGenericName)
      SinCosCalls.push_back(XI);
    else
      Handled = false;

    if (Handled) {
      MergeDbgLocs.push_back(XI->getDebugLoc());
      auto *OtherOp = cast<FPMathOperator>(XI);
      FMF &= OtherOp->getFastMathFlags();
      FPMath = MDNode::getMostGenericFPMath(
          FPMath, XI->getMetadata(LLVMContext::MD_fpmath));
    }
  }

  if (SinCalls.empty() || CosCalls.empty())
    return false;

  B.setFastMathFlags(FMF);
  B.setDefaultFPMathTag(FPMath);
  DILocation *DbgLoc = DILocation::getMergedLocations(MergeDbgLocs);
  B.SetCurrentDebugLocation(DbgLoc);

  auto [Sin, Cos, SinCos] = insertSinCos(CArgVal, FMF, B, FSinCos);

  auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
    for (CallInst *C : Calls)
      C->replaceAllUsesWith(Res);

    // Leave the other dead instructions to avoid clobbering iterators.
  };

  replaceTrigInsts(SinCalls, Sin);
  replaceTrigInsts(CosCalls, Cos);
  replaceTrigInsts(SinCosCalls, SinCos);

  // It's safe to delete the original now.
  CI->eraseFromParent();
  return true;
}

bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0,
                                            double &Res1, Constant *copr0,
                                            Constant *copr1) {
  // By default, opr0/opr1 hold values of float/double type.
  // If they are not float/double, each function has to convert its operand
  // separately.
  double opr0 = 0.0, opr1 = 0.0;
  ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
  ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
  if (fpopr0) {
    opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
               ? fpopr0->getValueAPF().convertToDouble()
               : (double)fpopr0->getValueAPF().convertToFloat();
  }

  if (fpopr1) {
    opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
               ? fpopr1->getValueAPF().convertToDouble()
               : (double)fpopr1->getValueAPF().convertToFloat();
  }

  switch (FInfo.getId()) {
  default : return false;

  case AMDGPULibFunc::EI_ACOS:
    Res0 = acos(opr0);
    return true;

  case AMDGPULibFunc::EI_ACOSH:
    // acosh(x) == log(x + sqrt(x*x - 1))
    Res0 = log(opr0 + sqrt(opr0*opr0 - 1.0));
    return true;

  case AMDGPULibFunc::EI_ACOSPI:
    Res0 = acos(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_ASIN:
    Res0 = asin(opr0);
    return true;

  case AMDGPULibFunc::EI_ASINH:
    // asinh(x) == log(x + sqrt(x*x + 1))
    Res0 = log(opr0 + sqrt(opr0*opr0 + 1.0));
    return true;

  case AMDGPULibFunc::EI_ASINPI:
    Res0 = asin(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_ATAN:
    Res0 = atan(opr0);
    return true;

  case AMDGPULibFunc::EI_ATANH:
    // atanh(x) == (log(x+1) - log(x-1))/2
    Res0 = (log(opr0 + 1.0) - log(opr0 - 1.0))/2.0;
    return true;

  case AMDGPULibFunc::EI_ATANPI:
    Res0 = atan(opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_CBRT:
    Res0 = (opr0 < 0.0) ? -pow(-opr0, 1.0/3.0) : pow(opr0, 1.0/3.0);
    return true;

  case AMDGPULibFunc::EI_COS:
    Res0 = cos(opr0);
    return true;

  case AMDGPULibFunc::EI_COSH:
    Res0 = cosh(opr0);
    return true;

  case AMDGPULibFunc::EI_COSPI:
    Res0 = cos(MATH_PI * opr0);
    return true;

  case AMDGPULibFunc::EI_EXP:
    Res0 = exp(opr0);
    return true;

  case AMDGPULibFunc::EI_EXP2:
    Res0 = pow(2.0, opr0);
    return true;

  case AMDGPULibFunc::EI_EXP10:
    Res0 = pow(10.0, opr0);
    return true;

  case AMDGPULibFunc::EI_LOG:
    Res0 = log(opr0);
    return true;

  case AMDGPULibFunc::EI_LOG2:
    Res0 = log(opr0) / log(2.0);
    return true;

  case AMDGPULibFunc::EI_LOG10:
    Res0 = log(opr0) / log(10.0);
    return true;

  case AMDGPULibFunc::EI_RSQRT:
    Res0 = 1.0 / sqrt(opr0);
    return true;

  case AMDGPULibFunc::EI_SIN:
    Res0 = sin(opr0);
    return true;

  case AMDGPULibFunc::EI_SINH:
    Res0 = sinh(opr0);
    return true;

  case AMDGPULibFunc::EI_SINPI:
    Res0 = sin(MATH_PI * opr0);
    return true;

  case AMDGPULibFunc::EI_TAN:
    Res0 = tan(opr0);
    return true;

  case AMDGPULibFunc::EI_TANH:
    Res0 = tanh(opr0);
    return true;

  case AMDGPULibFunc::EI_TANPI:
    Res0 = tan(MATH_PI * opr0);
    return true;

  // two-arg functions
  case AMDGPULibFunc::EI_POW:
  case AMDGPULibFunc::EI_POWR:
    Res0 = pow(opr0, opr1);
    return true;

  case AMDGPULibFunc::EI_POWN: {
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(opr0, val);
      return true;
    }
    return false;
  }

  case AMDGPULibFunc::EI_ROOTN: {
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(opr0, 1.0 / val);
      return true;
    }
    return false;
  }

  // with ptr arg
  case AMDGPULibFunc::EI_SINCOS:
    Res0 = sin(opr0);
    Res1 = cos(opr0);
    return true;
  }

  return false;
}

bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
  int numArgs = (int)aCI->arg_size();
  if (numArgs > 3)
    return false;

  Constant *copr0 = nullptr;
  Constant *copr1 = nullptr;
  if (numArgs > 0) {
    if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr)
      return false;
  }

  if (numArgs > 1) {
    if ((copr1 = dyn_cast<Constant>(aCI->getArgOperand(1))) == nullptr) {
      if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
        return false;
    }
  }

  // At this point, all arguments to aCI are constants.

  // max vector size is 16, and sincos will generate two results.
  double DVal0[16], DVal1[16];
  int FuncVecSize = getVecSize(FInfo);
  bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
  if (FuncVecSize == 1) {
    if (!evaluateScalarMathFunc(FInfo, DVal0[0], DVal1[0], copr0, copr1)) {
      return false;
    }
  } else {
    ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0);
    ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1);
    for (int i = 0; i < FuncVecSize; ++i) {
      Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
      Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
      if (!evaluateScalarMathFunc(FInfo, DVal0[i], DVal1[i], celt0, celt1)) {
        return false;
      }
    }
  }

  LLVMContext &context = aCI->getContext();
  Constant *nval0, *nval1;
  if (FuncVecSize == 1) {
    nval0 = ConstantFP::get(aCI->getType(), DVal0[0]);
    if (hasTwoResults)
      nval1 = ConstantFP::get(aCI->getType(), DVal1[0]);
  } else {
    if (getArgType(FInfo) == AMDGPULibFunc::F32) {
      SmallVector<float, 0> FVal0, FVal1;
      for (int i = 0; i < FuncVecSize; ++i)
        FVal0.push_back((float)DVal0[i]);
      ArrayRef<float> tmp0(FVal0);
      nval0 = ConstantDataVector::get(context, tmp0);
      if (hasTwoResults) {
        for (int i = 0; i < FuncVecSize; ++i)
          FVal1.push_back((float)DVal1[i]);
        ArrayRef<float> tmp1(FVal1);
        nval1 = ConstantDataVector::get(context, tmp1);
      }
    } else {
      ArrayRef<double> tmp0(DVal0);
      nval0 = ConstantDataVector::get(context, tmp0);
      if (hasTwoResults) {
        ArrayRef<double> tmp1(DVal1);
        nval1 = ConstantDataVector::get(context, tmp1);
      }
    }
  }

  if (hasTwoResults) {
    // sincos
    assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
           "math function with ptr arg not supported yet");
    new StoreInst(nval1, aCI->getArgOperand(1), aCI->getIterator());
  }

  replaceCall(aCI, nval0);
  return true;
}

PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
  AMDGPULibCalls Simplifier;
  Simplifier.initNativeFuncs();
  Simplifier.initFunction(F, AM);

  bool Changed = false;

  LLVM_DEBUG(dbgs() << "AMDIC: process function ";
             F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);

  for (auto &BB : F) {
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
      // Ignore non-calls.
      CallInst *CI = dyn_cast<CallInst>(I);
      ++I;

      if (CI) {
        if (Simplifier.fold(CI))
          Changed = true;
      }
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}

PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
  if (UseNative.empty())
    return PreservedAnalyses::all();

  AMDGPULibCalls Simplifier;
  Simplifier.initNativeFuncs();
  Simplifier.initFunction(F, AM);

  bool Changed = false;
  for (auto &BB : F) {
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
      // Ignore non-calls.
      CallInst *CI = dyn_cast<CallInst>(I);
      ++I;
      if (CI && Simplifier.useNative(CI))
        Changed = true;
    }
  }
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}