1//===- AMDGPULibCalls.cpp -------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file does AMD library function optimizations.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "AMDGPULibFunc.h"
16#include "llvm/Analysis/AssumptionCache.h"
17#include "llvm/Analysis/TargetLibraryInfo.h"
18#include "llvm/Analysis/ValueTracking.h"
19#include "llvm/IR/AttributeMask.h"
20#include "llvm/IR/Dominators.h"
21#include "llvm/IR/IRBuilder.h"
22#include "llvm/IR/IntrinsicsAMDGPU.h"
23#include "llvm/IR/MDBuilder.h"
24#include "llvm/IR/PatternMatch.h"
25#include <cmath>
26
27#define DEBUG_TYPE "amdgpu-simplifylib"
28
29using namespace llvm;
30using namespace llvm::PatternMatch;
31
// When set, this pass runs in pre-link mode: library functions that are not
// yet present in the module may be declared (see getFunction()).
static cl::opt<bool> EnablePreLink("amdgpu-prelink",
  cl::desc("Enable pre-link mode optimizations"),
  cl::init(Val: false),
  cl::Hidden);

// Names listed here are rewritten to their native_* counterparts by
// useNative(); an empty value or "all" selects every eligible function.
static cl::list<std::string> UseNative("amdgpu-use-native",
  cl::desc("Comma separated list of functions to replace with native, or all"),
  cl::CommaSeparated, cl::ValueOptional,
  cl::Hidden);

// Math constants used by the constant-folding tables below.
#define MATH_PI      numbers::pi
#define MATH_E       numbers::e
#define MATH_SQRT2   numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2

// Which pow-family call is being expanded by expandFastPow().
enum class PowKind { Pow, PowR, PowN, RootN };
48
49namespace llvm {
50
/// Folds and simplifies calls to the AMD device math library: table-driven
/// constant folding, replacement with LLVM intrinsics, pow/rootn expansions,
/// sincos splitting, pipe specialization, and -fuse-native rewriting.
class AMDGPULibCalls {
private:
  // Query context (data layout, TLI, optional DT, AC) for simplification.
  SimplifyQuery SQ;

  using FuncInfo = llvm::AMDGPULibFunc;

  // -fuse-native.
  bool AllNative = false;

  bool useNativeFunc(const StringRef F) const;

  // Return a pointer (pointer expr) to the function if function definition with
  // "FuncName" exists. It may create a new function prototype in pre-link mode.
  FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);

  /// Wrapper around getFunction which tries to use a faster variant if
  /// available, and falls back to a less fast option.
  ///
  /// Return a replacement function for \p fInfo that has float-typed fast
  /// variants. \p NewFunc is a base replacement function to use. \p
  /// NewFuncFastVariant is a faster version to use if the calling context knows
  /// it's legal. If there is no fast variant to use, \p NewFuncFastVariant
  /// should be EI_NONE.
  FunctionCallee getFloatFastVariant(Module *M, const FuncInfo &fInfo,
                                     FuncInfo &newInfo,
                                     AMDGPULibFunc::EFuncId NewFunc,
                                     AMDGPULibFunc::EFuncId NewFuncFastVariant);

  // Parse a mangled library-call name into \p FInfo; false if unrecognized.
  bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);

  // Table-driven constant folding of known exact input/result pairs.
  bool TDOFold(CallInst *CI, const FuncInfo &FInfo);

  /* Specialized optimizations */

  // pow/powr/pown
  bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  /// Peform a fast math expansion of pow, powr, pown or rootn.
  bool expandFastPow(FPMathOperator *FPOp, IRBuilder<> &B, PowKind Kind);

  bool tryOptimizePow(FPMathOperator *FPOp, IRBuilder<> &B,
                      const FuncInfo &FInfo);

  // rootn
  bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // -fuse-native for sincos
  bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);

  // evaluate calls if calls' arguments are constants.
  bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1,
                              Constant *copr0, Constant *copr1);
  bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);

  /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value
  /// of cos, sincos call).
  std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
                                                     FastMathFlags FMF,
                                                     IRBuilder<> &B,
                                                     FunctionCallee Fsincos);

  // sin/cos
  bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // __read_pipe/__write_pipe
  bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                            const FuncInfo &FInfo);

  // Get a scalar native builtin single argument FP function
  FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);

  /// Substitute a call to a known libcall with an intrinsic call. If \p
  /// AllowMinSize is true, allow the replacement in a minsize function.
  bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                         bool AllowMinSizeF32 = false,
                                         bool AllowF64 = false,
                                         bool AllowStrictFP = false);
  void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                         Intrinsic::ID IntrID);

  bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                            Intrinsic::ID IntrID,
                                            bool AllowMinSizeF32 = false,
                                            bool AllowF64 = false,
                                            bool AllowStrictFP = false);

protected:
  // True when the call has afn + nnan + ninf, permitting unsafe finite-math
  // rewrites.
  bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;

  bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;

  // RAUW + erase; the two overloads differ only in the accepted pointer type.
  static void replaceCall(Instruction *I, Value *With) {
    I->replaceAllUsesWith(V: With);
    I->eraseFromParent();
  }

  static void replaceCall(FPMathOperator *I, Value *With) {
    replaceCall(I: cast<Instruction>(Val: I), With);
  }

public:
  AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM);

  // Entry point: try to simplify one call; true if the IR changed.
  bool fold(CallInst *CI);

  void initNativeFuncs();

  // Replace a normal math function call with that native version
  bool useNative(CallInst *CI);
};
161
162} // end namespace llvm
163
164template <typename IRB>
165static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
166 const Twine &Name = "") {
167 CallInst *R = B.CreateCall(Callee, Arg, Name);
168 if (Function *F = dyn_cast<Function>(Val: Callee.getCallee()))
169 R->setCallingConv(F->getCallingConv());
170 return R;
171}
172
173template <typename IRB>
174static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
175 Value *Arg2, const Twine &Name = "") {
176 CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
177 if (Function *F = dyn_cast<Function>(Val: Callee.getCallee()))
178 R->setCallingConv(F->getCallingConv());
179 return R;
180}
181
182static FunctionType *getPownType(FunctionType *FT) {
183 Type *PowNExpTy = Type::getInt32Ty(C&: FT->getContext());
184 if (VectorType *VecTy = dyn_cast<VectorType>(Val: FT->getReturnType()))
185 PowNExpTy = VectorType::get(ElementType: PowNExpTy, EC: VecTy->getElementCount());
186
187 return FunctionType::get(Result: FT->getReturnType(),
188 Params: {FT->getParamType(i: 0), PowNExpTy}, isVarArg: false);
189}
190
191// Data structures for table-driven optimizations.
192// FuncTbl works for both f32 and f64 functions with 1 input argument
193
// One row of a constant-folding table: the library function evaluates to
// exactly \c result when called with \c input. Used for both f32 and f64
// single-argument functions (see TDOFold).
struct TableEntry {
  double result;
  double input;
};
198
199/* a list of {result, input} */
200static const TableEntry tbl_acos[] = {
201 {MATH_PI / 2.0, .input: 0.0},
202 {MATH_PI / 2.0, .input: -0.0},
203 {.result: 0.0, .input: 1.0},
204 {MATH_PI, .input: -1.0}
205};
206static const TableEntry tbl_acosh[] = {
207 {.result: 0.0, .input: 1.0}
208};
209static const TableEntry tbl_acospi[] = {
210 {.result: 0.5, .input: 0.0},
211 {.result: 0.5, .input: -0.0},
212 {.result: 0.0, .input: 1.0},
213 {.result: 1.0, .input: -1.0}
214};
215static const TableEntry tbl_asin[] = {
216 {.result: 0.0, .input: 0.0},
217 {.result: -0.0, .input: -0.0},
218 {MATH_PI / 2.0, .input: 1.0},
219 {.result: -MATH_PI / 2.0, .input: -1.0}
220};
221static const TableEntry tbl_asinh[] = {
222 {.result: 0.0, .input: 0.0},
223 {.result: -0.0, .input: -0.0}
224};
225static const TableEntry tbl_asinpi[] = {
226 {.result: 0.0, .input: 0.0},
227 {.result: -0.0, .input: -0.0},
228 {.result: 0.5, .input: 1.0},
229 {.result: -0.5, .input: -1.0}
230};
231static const TableEntry tbl_atan[] = {
232 {.result: 0.0, .input: 0.0},
233 {.result: -0.0, .input: -0.0},
234 {MATH_PI / 4.0, .input: 1.0},
235 {.result: -MATH_PI / 4.0, .input: -1.0}
236};
237static const TableEntry tbl_atanh[] = {
238 {.result: 0.0, .input: 0.0},
239 {.result: -0.0, .input: -0.0}
240};
241static const TableEntry tbl_atanpi[] = {
242 {.result: 0.0, .input: 0.0},
243 {.result: -0.0, .input: -0.0},
244 {.result: 0.25, .input: 1.0},
245 {.result: -0.25, .input: -1.0}
246};
247static const TableEntry tbl_cbrt[] = {
248 {.result: 0.0, .input: 0.0},
249 {.result: -0.0, .input: -0.0},
250 {.result: 1.0, .input: 1.0},
251 {.result: -1.0, .input: -1.0},
252};
253static const TableEntry tbl_cos[] = {
254 {.result: 1.0, .input: 0.0},
255 {.result: 1.0, .input: -0.0}
256};
257static const TableEntry tbl_cosh[] = {
258 {.result: 1.0, .input: 0.0},
259 {.result: 1.0, .input: -0.0}
260};
261static const TableEntry tbl_cospi[] = {
262 {.result: 1.0, .input: 0.0},
263 {.result: 1.0, .input: -0.0}
264};
265static const TableEntry tbl_erfc[] = {
266 {.result: 1.0, .input: 0.0},
267 {.result: 1.0, .input: -0.0}
268};
269static const TableEntry tbl_erf[] = {
270 {.result: 0.0, .input: 0.0},
271 {.result: -0.0, .input: -0.0}
272};
273static const TableEntry tbl_exp[] = {
274 {.result: 1.0, .input: 0.0},
275 {.result: 1.0, .input: -0.0},
276 {MATH_E, .input: 1.0}
277};
278static const TableEntry tbl_exp2[] = {
279 {.result: 1.0, .input: 0.0},
280 {.result: 1.0, .input: -0.0},
281 {.result: 2.0, .input: 1.0}
282};
283static const TableEntry tbl_exp10[] = {
284 {.result: 1.0, .input: 0.0},
285 {.result: 1.0, .input: -0.0},
286 {.result: 10.0, .input: 1.0}
287};
288static const TableEntry tbl_expm1[] = {
289 {.result: 0.0, .input: 0.0},
290 {.result: -0.0, .input: -0.0}
291};
292static const TableEntry tbl_log[] = {
293 {.result: 0.0, .input: 1.0},
294 {.result: 1.0, MATH_E}
295};
296static const TableEntry tbl_log2[] = {
297 {.result: 0.0, .input: 1.0},
298 {.result: 1.0, .input: 2.0}
299};
300static const TableEntry tbl_log10[] = {
301 {.result: 0.0, .input: 1.0},
302 {.result: 1.0, .input: 10.0}
303};
304static const TableEntry tbl_rsqrt[] = {
305 {.result: 1.0, .input: 1.0},
306 {MATH_SQRT1_2, .input: 2.0}
307};
308static const TableEntry tbl_sin[] = {
309 {.result: 0.0, .input: 0.0},
310 {.result: -0.0, .input: -0.0}
311};
312static const TableEntry tbl_sinh[] = {
313 {.result: 0.0, .input: 0.0},
314 {.result: -0.0, .input: -0.0}
315};
316static const TableEntry tbl_sinpi[] = {
317 {.result: 0.0, .input: 0.0},
318 {.result: -0.0, .input: -0.0}
319};
320static const TableEntry tbl_sqrt[] = {
321 {.result: 0.0, .input: 0.0},
322 {.result: 1.0, .input: 1.0},
323 {MATH_SQRT2, .input: 2.0}
324};
325static const TableEntry tbl_tan[] = {
326 {.result: 0.0, .input: 0.0},
327 {.result: -0.0, .input: -0.0}
328};
329static const TableEntry tbl_tanh[] = {
330 {.result: 0.0, .input: 0.0},
331 {.result: -0.0, .input: -0.0}
332};
333static const TableEntry tbl_tanpi[] = {
334 {.result: 0.0, .input: 0.0},
335 {.result: -0.0, .input: -0.0}
336};
337static const TableEntry tbl_tgamma[] = {
338 {.result: 1.0, .input: 1.0},
339 {.result: 1.0, .input: 2.0},
340 {.result: 2.0, .input: 3.0},
341 {.result: 6.0, .input: 4.0}
342};
343
344static bool HasNative(AMDGPULibFunc::EFuncId id) {
345 switch(id) {
346 case AMDGPULibFunc::EI_DIVIDE:
347 case AMDGPULibFunc::EI_COS:
348 case AMDGPULibFunc::EI_EXP:
349 case AMDGPULibFunc::EI_EXP2:
350 case AMDGPULibFunc::EI_EXP10:
351 case AMDGPULibFunc::EI_LOG:
352 case AMDGPULibFunc::EI_LOG2:
353 case AMDGPULibFunc::EI_LOG10:
354 case AMDGPULibFunc::EI_POWR:
355 case AMDGPULibFunc::EI_RECIP:
356 case AMDGPULibFunc::EI_RSQRT:
357 case AMDGPULibFunc::EI_SIN:
358 case AMDGPULibFunc::EI_SINCOS:
359 case AMDGPULibFunc::EI_SQRT:
360 case AMDGPULibFunc::EI_TAN:
361 return true;
362 default:;
363 }
364 return false;
365}
366
367using TableRef = ArrayRef<TableEntry>;
368
369static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
370 switch(id) {
371 case AMDGPULibFunc::EI_ACOS: return TableRef(tbl_acos);
372 case AMDGPULibFunc::EI_ACOSH: return TableRef(tbl_acosh);
373 case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi);
374 case AMDGPULibFunc::EI_ASIN: return TableRef(tbl_asin);
375 case AMDGPULibFunc::EI_ASINH: return TableRef(tbl_asinh);
376 case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi);
377 case AMDGPULibFunc::EI_ATAN: return TableRef(tbl_atan);
378 case AMDGPULibFunc::EI_ATANH: return TableRef(tbl_atanh);
379 case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi);
380 case AMDGPULibFunc::EI_CBRT: return TableRef(tbl_cbrt);
381 case AMDGPULibFunc::EI_NCOS:
382 case AMDGPULibFunc::EI_COS: return TableRef(tbl_cos);
383 case AMDGPULibFunc::EI_COSH: return TableRef(tbl_cosh);
384 case AMDGPULibFunc::EI_COSPI: return TableRef(tbl_cospi);
385 case AMDGPULibFunc::EI_ERFC: return TableRef(tbl_erfc);
386 case AMDGPULibFunc::EI_ERF: return TableRef(tbl_erf);
387 case AMDGPULibFunc::EI_EXP: return TableRef(tbl_exp);
388 case AMDGPULibFunc::EI_NEXP2:
389 case AMDGPULibFunc::EI_EXP2: return TableRef(tbl_exp2);
390 case AMDGPULibFunc::EI_EXP10: return TableRef(tbl_exp10);
391 case AMDGPULibFunc::EI_EXPM1: return TableRef(tbl_expm1);
392 case AMDGPULibFunc::EI_LOG: return TableRef(tbl_log);
393 case AMDGPULibFunc::EI_NLOG2:
394 case AMDGPULibFunc::EI_LOG2: return TableRef(tbl_log2);
395 case AMDGPULibFunc::EI_LOG10: return TableRef(tbl_log10);
396 case AMDGPULibFunc::EI_NRSQRT:
397 case AMDGPULibFunc::EI_RSQRT: return TableRef(tbl_rsqrt);
398 case AMDGPULibFunc::EI_NSIN:
399 case AMDGPULibFunc::EI_SIN: return TableRef(tbl_sin);
400 case AMDGPULibFunc::EI_SINH: return TableRef(tbl_sinh);
401 case AMDGPULibFunc::EI_SINPI: return TableRef(tbl_sinpi);
402 case AMDGPULibFunc::EI_NSQRT:
403 case AMDGPULibFunc::EI_SQRT: return TableRef(tbl_sqrt);
404 case AMDGPULibFunc::EI_TAN: return TableRef(tbl_tan);
405 case AMDGPULibFunc::EI_TANH: return TableRef(tbl_tanh);
406 case AMDGPULibFunc::EI_TANPI: return TableRef(tbl_tanpi);
407 case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma);
408 default:;
409 }
410 return TableRef();
411}
412
413static inline int getVecSize(const AMDGPULibFunc& FInfo) {
414 return FInfo.getLeads()[0].VectorSize;
415}
416
417static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
418 return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
419}
420
421FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
422 // If we are doing PreLinkOpt, the function is external. So it is safe to
423 // use getOrInsertFunction() at this stage.
424
425 return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
426 : AMDGPULibFunc::getFunction(M, fInfo);
427}
428
429FunctionCallee AMDGPULibCalls::getFloatFastVariant(
430 Module *M, const FuncInfo &fInfo, FuncInfo &newInfo,
431 AMDGPULibFunc::EFuncId NewFunc, AMDGPULibFunc::EFuncId FastVariant) {
432 assert(NewFunc != FastVariant);
433
434 if (FastVariant != AMDGPULibFunc::EI_NONE &&
435 getArgType(FInfo: fInfo) == AMDGPULibFunc::F32) {
436 newInfo = AMDGPULibFunc(FastVariant, fInfo);
437 if (FunctionCallee NewCallee = getFunction(M, fInfo: newInfo))
438 return NewCallee;
439 }
440
441 newInfo = AMDGPULibFunc(NewFunc, fInfo);
442 return getFunction(M, fInfo: newInfo);
443}
444
// Parse a mangled device-library function name into \p FInfo. Returns true
// when the name is a recognized library call.
bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
                                       FuncInfo &FInfo) {
  return AMDGPULibFunc::parse(MangledName: FMangledName, Ptr&: FInfo);
}
449
450bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
451 return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs();
452}
453
/// Whether the call may be constant folded at a higher precision than its
/// source type (e.g. evaluating an f32 libcall with host doubles, see
/// evaluateCall). Currently requires the full 'fast' flag set.
bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
    const FPMathOperator *FPOp) const {
  // TODO: Refine to approxFunc or contract
  return FPOp->isFast();
}
459
// Build the SimplifyQuery from the pass-manager analyses. TLI and AC are
// required results; the dominator tree is only used if already cached
// (getCachedResult may return null).
AMDGPULibCalls::AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM)
    : SQ(F.getParent()->getDataLayout(),
         &FAM.getResult<TargetLibraryAnalysis>(IR&: F),
         FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F),
         &FAM.getResult<AssumptionAnalysis>(IR&: F)) {}
465
466bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
467 return AllNative || llvm::is_contained(Range&: UseNative, Element: F);
468}
469
470void AMDGPULibCalls::initNativeFuncs() {
471 AllNative = useNativeFunc(F: "all") ||
472 (UseNative.getNumOccurrences() && UseNative.size() == 1 &&
473 UseNative.begin()->empty());
474}
475
476bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
477 bool native_sin = useNativeFunc(F: "sin");
478 bool native_cos = useNativeFunc(F: "cos");
479
480 if (native_sin && native_cos) {
481 Module *M = aCI->getModule();
482 Value *opr0 = aCI->getArgOperand(i: 0);
483
484 AMDGPULibFunc nf;
485 nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
486 nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;
487
488 nf.setPrefix(AMDGPULibFunc::NATIVE);
489 nf.setId(AMDGPULibFunc::EI_SIN);
490 FunctionCallee sinExpr = getFunction(M, fInfo: nf);
491
492 nf.setPrefix(AMDGPULibFunc::NATIVE);
493 nf.setId(AMDGPULibFunc::EI_COS);
494 FunctionCallee cosExpr = getFunction(M, fInfo: nf);
495 if (sinExpr && cosExpr) {
496 Value *sinval =
497 CallInst::Create(Func: sinExpr, Args: opr0, NameStr: "splitsin", InsertBefore: aCI->getIterator());
498 Value *cosval =
499 CallInst::Create(Func: cosExpr, Args: opr0, NameStr: "splitcos", InsertBefore: aCI->getIterator());
500 new StoreInst(cosval, aCI->getArgOperand(i: 1), aCI->getIterator());
501
502 DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
503 << " with native version of sin/cos");
504
505 replaceCall(I: aCI, With: sinval);
506 return true;
507 }
508 }
509 return false;
510}
511
512bool AMDGPULibCalls::useNative(CallInst *aCI) {
513 Function *Callee = aCI->getCalledFunction();
514 if (!Callee || aCI->isNoBuiltin())
515 return false;
516
517 FuncInfo FInfo;
518 if (!parseFunctionName(FMangledName: Callee->getName(), FInfo) || !FInfo.isMangled() ||
519 FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
520 getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(id: FInfo.getId()) ||
521 !(AllNative || useNativeFunc(F: FInfo.getName()))) {
522 return false;
523 }
524
525 if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
526 return sincosUseNative(aCI, FInfo);
527
528 FInfo.setPrefix(AMDGPULibFunc::NATIVE);
529 FunctionCallee F = getFunction(M: aCI->getModule(), fInfo: FInfo);
530 if (!F)
531 return false;
532
533 aCI->setCalledFunction(F);
534 DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
535 << " with native version");
536 return true;
537}
538
539// Clang emits call of __read_pipe_2 or __read_pipe_4 for OpenCL read_pipe
540// builtin, with appended type size and alignment arguments, where 2 or 4
541// indicates the original number of arguments. The library has optimized version
542// of __read_pipe_2/__read_pipe_4 when the type size and alignment has the same
543// power of 2 value. This function transforms __read_pipe_2 to __read_pipe_2_N
544// for such cases where N is the size in bytes of the type (N = 1, 2, 4, 8, ...,
// 128). The same for __read_pipe_4, __write_pipe_2, and __write_pipe_4.
546bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
547 const FuncInfo &FInfo) {
548 auto *Callee = CI->getCalledFunction();
549 if (!Callee->isDeclaration())
550 return false;
551
552 assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
553 auto *M = Callee->getParent();
554 std::string Name = std::string(Callee->getName());
555 auto NumArg = CI->arg_size();
556 if (NumArg != 4 && NumArg != 6)
557 return false;
558 ConstantInt *PacketSize =
559 dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - 2));
560 ConstantInt *PacketAlign =
561 dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - 1));
562 if (!PacketSize || !PacketAlign)
563 return false;
564
565 unsigned Size = PacketSize->getZExtValue();
566 Align Alignment = PacketAlign->getAlignValue();
567 if (Alignment != Size)
568 return false;
569
570 unsigned PtrArgLoc = CI->arg_size() - 3;
571 Value *PtrArg = CI->getArgOperand(i: PtrArgLoc);
572 Type *PtrTy = PtrArg->getType();
573
574 SmallVector<llvm::Type *, 6> ArgTys;
575 for (unsigned I = 0; I != PtrArgLoc; ++I)
576 ArgTys.push_back(Elt: CI->getArgOperand(i: I)->getType());
577 ArgTys.push_back(Elt: PtrTy);
578
579 Name = Name + "_" + std::to_string(val: Size);
580 auto *FTy = FunctionType::get(Result: Callee->getReturnType(),
581 Params: ArrayRef<Type *>(ArgTys), isVarArg: false);
582 AMDGPULibFunc NewLibFunc(Name, FTy);
583 FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, fInfo: NewLibFunc);
584 if (!F)
585 return false;
586
587 SmallVector<Value *, 6> Args;
588 for (unsigned I = 0; I != PtrArgLoc; ++I)
589 Args.push_back(Elt: CI->getArgOperand(i: I));
590 Args.push_back(Elt: PtrArg);
591
592 auto *NCI = B.CreateCall(Callee: F, Args);
593 NCI->setAttributes(CI->getAttributes());
594 CI->replaceAllUsesWith(V: NCI);
595 CI->dropAllReferences();
596 CI->eraseFromParent();
597
598 return true;
599}
600
// Main dispatch: try to simplify the libcall \p CI. Returns false if no
// change was made; returns true otherwise.
bool AMDGPULibCalls::fold(CallInst *CI) {
  Function *Callee = CI->getCalledFunction();
  // Ignore indirect calls.
  if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(FMangledName: Callee->getName(), FInfo))
    return false;

  // Further check the number of arguments to see if they match.
  // TODO: Check calling convention matches too
  if (!FInfo.isCompatibleSignature(M: *Callee->getParent(), FuncTy: CI->getFunctionType()))
    return false;

  LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');

  // Exact table-driven constant folding needs no fast-math flags.
  if (TDOFold(CI, FInfo))
    return true;

  IRBuilder<> B(CI);
  if (CI->isStrictFP())
    B.setIsFPConstrained(true);

  if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: CI)) {
    // Under unsafe-math, evaluate calls if possible.
    // According to Brian Sumner, we can do this for all f32 function calls
    // using host's double function calls.
    if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(aCI: CI, FInfo))
      return true;

    // Copy fast flags from the original call.
    FastMathFlags FMF = FPOp->getFastMathFlags();
    B.setFastMathFlags(FMF);

    // Specialized optimizations for each function call.
    //
    // TODO: Handle native functions
    switch (FInfo.getId()) {
    // The transcendental replacements below require at least one relaxed-FP
    // flag; minsize functions only take the replacement under approxFunc.
    case AMDGPULibFunc::EI_EXP:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::exp,
                                                  AllowMinSizeF32: FMF.approxFunc());
    case AMDGPULibFunc::EI_EXP2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::exp2,
                                                  AllowMinSizeF32: FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log,
                                                  AllowMinSizeF32: FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log2,
                                                  AllowMinSizeF32: FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG10:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log10,
                                                  AllowMinSizeF32: FMF.approxFunc());
    // The following map directly onto intrinsics for f32 and f64, even at
    // minsize; fabs/copysign are additionally safe under strictfp.
    case AMDGPULibFunc::EI_FMIN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::minnum,
                                                  AllowMinSizeF32: true, AllowF64: true);
    case AMDGPULibFunc::EI_FMAX:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::maxnum,
                                                  AllowMinSizeF32: true, AllowF64: true);
    case AMDGPULibFunc::EI_FMA:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fma, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_MAD:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fmuladd,
                                                  AllowMinSizeF32: true, AllowF64: true);
    case AMDGPULibFunc::EI_FABS:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fabs, AllowMinSizeF32: true,
                                                  AllowF64: true, AllowStrictFP: true);
    case AMDGPULibFunc::EI_COPYSIGN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::copysign,
                                                  AllowMinSizeF32: true, AllowF64: true, AllowStrictFP: true);
    case AMDGPULibFunc::EI_FLOOR:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::floor, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_CEIL:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::ceil, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_TRUNC:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::trunc, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_RINT:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::rint, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_ROUND:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::round, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_LDEXP: {
      if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32: true, AllowF64: true))
        return false;

      // The library accepts a scalar exponent with a vector first operand;
      // the intrinsic requires matching vector widths, so splat it first.
      Value *Arg1 = CI->getArgOperand(i: 1);
      if (VectorType *VecTy = dyn_cast<VectorType>(Val: CI->getType());
          VecTy && !isa<VectorType>(Val: Arg1->getType())) {
        Value *SplatArg1 = B.CreateVectorSplat(EC: VecTy->getElementCount(), V: Arg1);
        CI->setArgOperand(i: 1, v: SplatArg1);
      }

      CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
          M: CI->getModule(), id: Intrinsic::ldexp,
          Tys: {CI->getType(), CI->getArgOperand(i: 1)->getType()}));
      return true;
    }
    case AMDGPULibFunc::EI_POW:
    case AMDGPULibFunc::EI_POW_FAST:
      return tryOptimizePow(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_POWR:
    case AMDGPULibFunc::EI_POWR_FAST: {
      if (fold_pow(FPOp, B, FInfo))
        return true;
      if (!FMF.approxFunc())
        return false;

      // Prefer the library's _fast f32 variant when available.
      // NOTE(review): the FMF.approxFunc() conjunct below is redundant — it
      // was already required by the early return above.
      if (FInfo.getId() == AMDGPULibFunc::EI_POWR && FMF.approxFunc() &&
          getArgType(FInfo) == AMDGPULibFunc::F32) {
        Module *M = Callee->getParent();
        AMDGPULibFunc PowrFastInfo(AMDGPULibFunc::EI_POWR_FAST, FInfo);
        if (FunctionCallee PowrFastFunc = getFunction(M, fInfo: PowrFastInfo)) {
          CI->setCalledFunction(PowrFastFunc);
          return true;
        }
      }

      if (!shouldReplaceLibcallWithIntrinsic(CI))
        return false;
      return expandFastPow(FPOp, B, Kind: PowKind::PowR);
    }
    case AMDGPULibFunc::EI_POWN:
    case AMDGPULibFunc::EI_POWN_FAST: {
      if (fold_pow(FPOp, B, FInfo))
        return true;
      if (!FMF.approxFunc())
        return false;

      // Prefer the library's _fast f32 variant when available.
      if (FInfo.getId() == AMDGPULibFunc::EI_POWN &&
          getArgType(FInfo) == AMDGPULibFunc::F32) {
        Module *M = Callee->getParent();
        AMDGPULibFunc PownFastInfo(AMDGPULibFunc::EI_POWN_FAST, FInfo);
        if (FunctionCallee PownFastFunc = getFunction(M, fInfo: PownFastInfo)) {
          CI->setCalledFunction(PownFastFunc);
          return true;
        }
      }

      if (!shouldReplaceLibcallWithIntrinsic(CI))
        return false;
      return expandFastPow(FPOp, B, Kind: PowKind::PowN);
    }
    case AMDGPULibFunc::EI_ROOTN:
    case AMDGPULibFunc::EI_ROOTN_FAST: {
      if (fold_rootn(FPOp, B, FInfo))
        return true;
      if (!FMF.approxFunc())
        return false;

      // Prefer the library's _fast f32 variant when available.
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        Module *M = Callee->getParent();
        AMDGPULibFunc RootnFastInfo(AMDGPULibFunc::EI_ROOTN_FAST, FInfo);
        if (FunctionCallee RootnFastFunc = getFunction(M, fInfo: RootnFastInfo)) {
          CI->setCalledFunction(RootnFastFunc);
          return true;
        }
      }

      return expandFastPow(FPOp, B, Kind: PowKind::RootN);
    }
    case AMDGPULibFunc::EI_SQRT:
      // TODO: Allow with strictfp + constrained intrinsic
      return tryReplaceLibcallWithSimpleIntrinsic(
          B, CI, IntrID: Intrinsic::sqrt, AllowMinSizeF32: true, AllowF64: true, /*AllowStrictFP=*/false);
    case AMDGPULibFunc::EI_COS:
    case AMDGPULibFunc::EI_SIN:
      return fold_sincos(FPOp, B, FInfo);
    default:
      break;
    }
  } else {
    // Specialized optimizations for each function call
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_READ_PIPE_2:
    case AMDGPULibFunc::EI_READ_PIPE_4:
    case AMDGPULibFunc::EI_WRITE_PIPE_2:
    case AMDGPULibFunc::EI_WRITE_PIPE_4:
      return fold_read_write_pipe(CI, B, FInfo);
    default:
      break;
    }
  }

  return false;
}
803
804bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
805 // Table-Driven optimization
806 const TableRef tr = getOptTable(id: FInfo.getId());
807 if (tr.empty())
808 return false;
809
810 int const sz = (int)tr.size();
811 Value *opr0 = CI->getArgOperand(i: 0);
812
813 if (getVecSize(FInfo) > 1) {
814 if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(Val: opr0)) {
815 SmallVector<double, 0> DVal;
816 for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
817 ConstantFP *eltval = dyn_cast<ConstantFP>(
818 Val: CV->getElementAsConstant(i: (unsigned)eltNo));
819 assert(eltval && "Non-FP arguments in math function!");
820 bool found = false;
821 for (int i=0; i < sz; ++i) {
822 if (eltval->isExactlyValue(V: tr[i].input)) {
823 DVal.push_back(Elt: tr[i].result);
824 found = true;
825 break;
826 }
827 }
828 if (!found) {
829 // This vector constants not handled yet.
830 return false;
831 }
832 }
833 LLVMContext &context = CI->getContext();
834 Constant *nval;
835 if (getArgType(FInfo) == AMDGPULibFunc::F32) {
836 SmallVector<float, 0> FVal;
837 for (double D : DVal)
838 FVal.push_back(Elt: (float)D);
839 ArrayRef<float> tmp(FVal);
840 nval = ConstantDataVector::get(Context&: context, Elts: tmp);
841 } else { // F64
842 ArrayRef<double> tmp(DVal);
843 nval = ConstantDataVector::get(Context&: context, Elts: tmp);
844 }
845 LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
846 replaceCall(I: CI, With: nval);
847 return true;
848 }
849 } else {
850 // Scalar version
851 if (ConstantFP *CF = dyn_cast<ConstantFP>(Val: opr0)) {
852 for (int i = 0; i < sz; ++i) {
853 if (CF->isExactlyValue(V: tr[i].input)) {
854 Value *nval = ConstantFP::get(Ty: CF->getType(), V: tr[i].result);
855 LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
856 replaceCall(I: CI, With: nval);
857 return true;
858 }
859 }
860 }
861 }
862
863 return false;
864}
865
namespace llvm {
// Host-side log2 shim: ::log2 is only guaranteed under the feature-test
// macros checked below, so otherwise fall back to log(V)/ln(2).
static double log2(double V) {
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
  return ::log2(x: V);
#else
  return log(V) / numbers::ln2;
#endif
}
} // namespace llvm
875
// Fold calls to pow/powr/pown (and their *_FAST variants).
//
// Constant-exponent special cases (0, 1, 2, -1, +/-0.5, |n| <= 12) are
// handled first and do not require relaxed math.  The generic rewrite into
// exp2(y * log2(|x|)) plus a sign fixup is only performed under unsafe
// finite-only math.  Returns true iff the call was replaced.
bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
                              const FuncInfo &FInfo) {
  assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
          FInfo.getId() == AMDGPULibFunc::EI_POW_FAST ||
          FInfo.getId() == AMDGPULibFunc::EI_POWR ||
          FInfo.getId() == AMDGPULibFunc::EI_POWR_FAST ||
          FInfo.getId() == AMDGPULibFunc::EI_POWN ||
          FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) &&
         "fold_pow: encounter a wrong function call");

  Module *M = B.GetInsertBlock()->getModule();
  Type *eltType = FPOp->getType()->getScalarType();
  Value *opr0 = FPOp->getOperand(i: 0); // base x
  Value *opr1 = FPOp->getOperand(i: 1); // exponent y (fp, or integer for pown)

  // Capture a constant exponent: CF for floating-point, CINT for integer.
  const APFloat *CF = nullptr;
  const APInt *CINT = nullptr;
  if (!match(V: opr1, P: m_APFloatAllowPoison(Res&: CF)))
    match(V: opr1, P: m_APIntAllowPoison(Res&: CINT));

  // 0x1111111 means that we don't do anything for this call.
  int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);

  if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
    // pow/powr/pown(x, 0) == 1
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
    Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
    }
    replaceCall(I: FPOp, With: cnval);
    return true;
  }
  if ((CF && CF->isExactlyValue(V: 1.0)) || (CINT && ci_opr1 == 1)) {
    // pow/powr/pown(x, 1.0) = x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
    replaceCall(I: FPOp, With: opr0);
    return true;
  }
  if ((CF && CF->isExactlyValue(V: 2.0)) || (CINT && ci_opr1 == 2)) {
    // pow/powr/pown(x, 2.0) = x*x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
                      << *opr0 << "\n");
    Value *nval = B.CreateFMul(L: opr0, R: opr0, Name: "__pow2");
    replaceCall(I: FPOp, With: nval);
    return true;
  }
  if ((CF && CF->isExactlyValue(V: -1.0)) || (CINT && ci_opr1 == -1)) {
    // pow/powr/pown(x, -1.0) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n");
    Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
    }
    Value *nval = B.CreateFDiv(L: cnval, R: opr0, Name: "__powrecip");
    replaceCall(I: FPOp, With: nval);
    return true;
  }

  if (CF && (CF->isExactlyValue(V: 0.5) || CF->isExactlyValue(V: -0.5))) {
    // pow[r](x, [-]0.5) = sqrt(x)
    bool issqrt = CF->isExactlyValue(V: 0.5);
    if (FunctionCallee FPExpr =
            getFunction(M, fInfo: AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
                                                     : AMDGPULibFunc::EI_RSQRT,
                                             FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
                        << '(' << *opr0 << ")\n");
      Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: issqrt ? "__pow2sqrt"
                                                                 : "__pow2rsqrt");
      replaceCall(I: FPOp, With: nval);
      return true;
    }
  }

  // Everything below changes special-case/accuracy behavior, so it is only
  // legal with approximate, finite-only math.
  if (!isUnsafeFiniteOnlyMath(FPOp))
    return false;

  // Unsafe Math optimization

  // Remember that ci_opr1 is set if opr1 is integral
  if (CF) {
    // A floating-point exponent that is exactly integral is treated as the
    // integer case below.
    double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
                      ? (double)CF->convertToFloat()
                      : CF->convertToDouble();
    int ival = (int)dval;
    if ((double)ival == dval) {
      ci_opr1 = ival;
    } else
      // NOTE(review): this sentinel differs from the 0x1111111 used earlier;
      // harmless since both exceed the |c| <= 12 bound below, but inconsistent.
      ci_opr1 = 0x11111111;
  }

  // pow/powr/pown(x, c) = [1/](x*x*..x); where
  // trunc(c) == c && the number of x == c && |c| <= 12
  unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
  if (abs_opr1 <= 12) {
    Constant *cnval;
    Value *nval;
    if (abs_opr1 == 0) {
      cnval = ConstantFP::get(Ty: eltType, V: 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
      }
      nval = cnval;
    } else {
      // Square-and-multiply over the bits of the exponent: valx2 holds
      // x^(2^k), nval accumulates the product for the set bits.
      Value *valx2 = nullptr;
      nval = nullptr;
      while (abs_opr1 > 0) {
        valx2 = valx2 ? B.CreateFMul(L: valx2, R: valx2, Name: "__powx2") : opr0;
        if (abs_opr1 & 1) {
          nval = nval ? B.CreateFMul(L: nval, R: valx2, Name: "__powprod") : valx2;
        }
        abs_opr1 >>= 1;
      }
    }

    if (ci_opr1 < 0) {
      // Negative exponent: take the reciprocal of the accumulated product.
      cnval = ConstantFP::get(Ty: eltType, V: 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
      }
      nval = B.CreateFDiv(L: cnval, R: nval, Name: "__1powprod");
    }
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                      << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
                      << ")\n");
    replaceCall(I: FPOp, With: nval);
    return true;
  }

  // If we should use the generic intrinsic instead of emitting a libcall
  const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();

  // powr ---> exp2(y * log2(x))
  // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
  FunctionCallee ExpExpr;
  if (ShouldUseIntrinsic)
    ExpExpr = Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::exp2,
                                                Tys: {FPOp->getType()});
  else {
    ExpExpr = getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
    if (!ExpExpr)
      return false;
  }

  // Which pieces of the expansion are needed:
  //   needlog      - log2 must be computed at runtime (x is not constant)
  //   needabs      - |x| is required before taking the log
  //   needcopysign - the result's sign must be recovered from x afterwards
  bool needlog = false;
  bool needabs = false;
  bool needcopysign = false;
  Constant *cnval = nullptr;
  if (getVecSize(FInfo) == 1) {
    // Scalar: try to constant-fold log2(|x|) directly.
    CF = nullptr;
    match(V: opr0, P: m_APFloatAllowPoison(Res&: CF));

    if (CF) {
      double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
                     ? (double)CF->convertToFloat()
                     : CF->convertToDouble();

      V = log2(V: std::abs(x: V));
      cnval = ConstantFP::get(Ty: eltType, V);
      needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR &&
                      FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST) &&
                     CF->isNegative();
    } else {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
                               FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
    }
  } else {
    // Vector: only constant-fold when every lane of x is constant.
    ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(Val: opr0);

    if (!CDV) {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
                               FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
    } else {
      assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
              "Wrong vector size detected");

      SmallVector<double, 0> DVal;
      for (int i=0; i < getVecSize(FInfo); ++i) {
        double V = CDV->getElementAsAPFloat(i).convertToDouble();
        if (V < 0.0) needcopysign = true;
        V = log2(V: std::abs(x: V));
        DVal.push_back(Elt: V);
      }
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (double D : DVal)
          FVal.push_back(Elt: (float)D);
        ArrayRef<float> tmp(FVal);
        cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
      } else {
        ArrayRef<double> tmp(DVal);
        cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
      }
    }
  }

  if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW ||
                       FInfo.getId() == AMDGPULibFunc::EI_POW_FAST)) {
    // We cannot handle corner cases for a general pow() function, give up
    // unless y is a constant integral value. Then proceed as if it were pown.
    if (!isKnownIntegral(V: opr1, SQ: SQ.getWithInstruction(I: cast<Instruction>(Val: FPOp)),
                         FMF: FPOp->getFastMathFlags()))
      return false;
  }

  Value *nval;
  if (needabs) {
    nval = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: opr0, FMFSource: nullptr, Name: "__fabs");
  } else {
    // Either the pre-folded log2 constant, or x itself.
    nval = cnval ? cnval : opr0;
  }
  if (needlog) {
    FunctionCallee LogExpr;
    if (ShouldUseIntrinsic) {
      LogExpr = Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::log2,
                                                  Tys: {FPOp->getType()});
    } else {
      LogExpr = getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
      if (!LogExpr)
        return false;
    }

    nval = CreateCallEx(B,Callee: LogExpr, Arg: nval, Name: "__log2");
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_POWN ||
      FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) {
    // convert int(32) to fp(f32 or f64)
    opr1 = B.CreateSIToFP(V: opr1, DestTy: nval->getType(), Name: "pownI2F");
  }
  nval = B.CreateFMul(L: opr1, R: nval, Name: "__ylogx");

  CallInst *Exp2Call = CreateCallEx(B, Callee: ExpExpr, Arg: nval, Name: "__exp2");

  // TODO: Generalized fpclass logic for pow
  // The exp2 result is known non-negative; also record nnan when the
  // original call promises no NaNs.
  FPClassTest KnownNot = FPClassTest::fcNegative;
  if (FPOp->hasNoNaNs())
    KnownNot |= FPClassTest::fcNan;

  Exp2Call->addRetAttr(
      Attr: Attribute::getWithNoFPClass(Context&: Exp2Call->getContext(), Mask: KnownNot));
  nval = Exp2Call;

  if (needcopysign) {
    // Recover the sign for pow/pown with negative base: shift y's low bit
    // into the sign-bit position, AND it with x's sign bit, and copysign
    // the combined bit onto the exp2 result (odd y and negative x flips
    // the sign).
    Type* nTyS = B.getIntNTy(N: eltType->getPrimitiveSizeInBits());
    Type *nTy = FPOp->getType()->getWithNewType(EltTy: nTyS);
    Value *opr_n = FPOp->getOperand(i: 1); // original (unconverted) exponent
    if (opr_n->getType()->getScalarType()->isIntegerTy())
      opr_n = B.CreateZExtOrTrunc(V: opr_n, DestTy: nTy, Name: "__ytou");
    else
      opr_n = B.CreateFPToSI(V: opr1, DestTy: nTy, Name: "__ytou");

    unsigned size = nTy->getScalarSizeInBits();
    Value *sign = B.CreateShl(LHS: opr_n, RHS: size-1, Name: "__yeven");
    sign = B.CreateAnd(LHS: B.CreateBitCast(V: opr0, DestTy: nTy), RHS: sign, Name: "__pow_sign");

    nval = B.CreateCopySign(LHS: nval, RHS: B.CreateBitCast(V: sign, DestTy: nval->getType()),
                            FMFSource: nullptr, Name: "__pow_sign");
  }

  LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                    << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
  replaceCall(I: FPOp, With: nval);

  return true;
}
1145
// Fold rootn(x, n) for small constant n:
//   n == 1  -> x          n == 2  -> sqrt(x)
//   n == 3  -> cbrt(x)    n == -1 -> 1.0/x
//   n == -2 -> 1.0/sqrt(x), with !fpmath metadata reflecting rootn's looser
//              2-ulp accuracy requirement.
// Returns true iff the call was replaced.
bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
  Value *opr0 = FPOp->getOperand(i: 0);
  Value *opr1 = FPOp->getOperand(i: 1);

  // Only constant integer exponents are handled.
  const APInt *CINT = nullptr;
  if (!match(V: opr1, P: m_APIntAllowPoison(Res&: CINT)))
    return false;

  Function *Parent = B.GetInsertBlock()->getParent();

  int ci_opr1 = (int)CINT->getSExtValue();
  if (ci_opr1 == 1 && !Parent->hasFnAttribute(Kind: Attribute::StrictFP)) {
    // rootn(x, 1) = x
    //
    // TODO: Insert constrained canonicalize for strictfp case.
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << '\n');
    replaceCall(I: FPOp, With: opr0);
    return true;
  }

  Module *M = B.GetInsertBlock()->getModule();

  CallInst *CI = cast<CallInst>(Val: FPOp);
  if (ci_opr1 == 2 &&
      shouldReplaceLibcallWithIntrinsic(CI,
                                        /*AllowMinSizeF32=*/true,
                                        /*AllowF64=*/true)) {
    // rootn(x, 2) = sqrt(x)
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0 << ")\n");

    CallInst *NewCall = B.CreateUnaryIntrinsic(ID: Intrinsic::sqrt, V: opr0, FMFSource: CI);
    NewCall->takeName(V: CI);

    // OpenCL rootn has a looser ulp of 2 requirement than sqrt, so add some
    // metadata.
    MDBuilder MDHelper(M->getContext());
    MDNode *FPMD = MDHelper.createFPMath(Accuracy: std::max(a: FPOp->getFPAccuracy(), b: 2.0f));
    NewCall->setMetadata(KindID: LLVMContext::MD_fpmath, Node: FPMD);

    replaceCall(I: CI, With: NewCall);
    return true;
  }

  if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
    // Only folded when a cbrt library declaration is available.
    if (FunctionCallee FPExpr =
            getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0
                        << ")\n");
      Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: "__rootn2cbrt");
      replaceCall(I: FPOp, With: nval);
      return true;
    }
  } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n");
    Value *nval = B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: 1.0),
                               R: opr0,
                               Name: "__rootn2div");
    replaceCall(I: FPOp, With: nval);
    return true;
  }

  if (ci_opr1 == -2 &&
      shouldReplaceLibcallWithIntrinsic(CI,
                                        /*AllowMinSizeF32=*/true,
                                        /*AllowF64=*/true)) {
    // rootn(x, -2) = rsqrt(x)

    // The original rootn had looser ulp requirements than the resultant sqrt
    // and fdiv.
    MDBuilder MDHelper(M->getContext());
    MDNode *FPMD = MDHelper.createFPMath(Accuracy: std::max(a: FPOp->getFPAccuracy(), b: 2.0f));

    // TODO: Could handle strictfp but need to fix strict sqrt emission
    FastMathFlags FMF = FPOp->getFastMathFlags();
    FMF.setAllowContract(true);

    CallInst *Sqrt = B.CreateUnaryIntrinsic(ID: Intrinsic::sqrt, V: opr0, FMFSource: CI);
    Instruction *RSqrt = cast<Instruction>(
        Val: B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: 1.0), R: Sqrt));
    Sqrt->setFastMathFlags(FMF);
    RSqrt->setFastMathFlags(FMF);
    RSqrt->setMetadata(KindID: LLVMContext::MD_fpmath, Node: FPMD);

    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
                      << ")\n");
    replaceCall(I: CI, With: RSqrt);
    return true;
  }

  return false;
}
1238
1239// is_integer(y) => trunc(y) == y
1240static Value *emitIsInteger(IRBuilder<> &B, Value *Y) {
1241 Value *TruncY = B.CreateUnaryIntrinsic(ID: Intrinsic::trunc, V: Y);
1242 return B.CreateFCmpOEQ(LHS: TruncY, RHS: Y);
1243}
1244
1245static Value *emitIsEvenInteger(IRBuilder<> &B, Value *Y) {
1246 // Even integers are still integers after division by 2.
1247 auto *HalfY = B.CreateFMul(L: Y, R: ConstantFP::get(Ty: Y->getType(), V: 0.5));
1248 return emitIsInteger(B, Y: HalfY);
1249}
1250
1251// is_odd_integer(y) => is_integer(y) && !is_even_integer(y)
1252static Value *emitIsOddInteger(IRBuilder<> &B, Value *Y) {
1253 Value *IsIntY = emitIsInteger(B, Y);
1254 Value *IsEvenY = emitIsEvenInteger(B, Y);
1255 Value *NotEvenY = B.CreateNot(V: IsEvenY);
1256 return B.CreateAnd(LHS: IsIntY, RHS: NotEvenY);
1257}
1258
1259// isinf(val) => fabs(val) == +inf
1260static Value *emitIsInf(IRBuilder<> &B, Value *val) {
1261 auto *fabsVal = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: val);
1262 return B.CreateFCmpOEQ(LHS: fabsVal, RHS: ConstantFP::getInfinity(Ty: val->getType()));
1263}
1264
1265// y * log2(fabs(x))
1266static Value *emitFastExpYLnx(IRBuilder<> &B, Value *X, Value *Y) {
1267 Value *AbsX = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: X);
1268 Value *LogAbsX = B.CreateUnaryIntrinsic(ID: Intrinsic::log2, V: AbsX);
1269 Value *YTimesLogX = B.CreateFMul(L: Y, R: LogAbsX);
1270 return B.CreateUnaryIntrinsic(ID: Intrinsic::exp2, V: YTimesLogX);
1271}
1272
/// Emit special case management epilog code for fast pow, powr, pown, and rootn
/// expansions. \p x and \p y should be the arguments to the library call
/// (possibly with some values clamped). \p expylnx should be the result to use
/// in normal circumstances.
///
/// Each case implements the edge-case table (zero/inf/NaN bases and
/// exponents, sign of the result for odd integer exponents) as a chain of
/// selects over \p ExpYLnX.
static Value *emitPowFixup(IRBuilder<> &B, Value *X, Value *Y, Value *ExpYLnX,
                           PowKind Kind) {
  Constant *Zero = ConstantFP::getZero(Ty: X->getType());
  Constant *One = ConstantFP::get(Ty: X->getType(), V: 1.0);
  Constant *QNaN = ConstantFP::getQNaN(Ty: X->getType());
  Constant *PInf = ConstantFP::getInfinity(Ty: X->getType());

  switch (Kind) {
  case PowKind::Pow: {
    // pow: y is floating-point; integer-ness/oddness of y is computed with
    // fp compares.

    // is_odd_integer(y)
    Value *IsOddY = emitIsOddInteger(B, Y);

    // ret = copysign(expylnx, is_odd_y ? x : 1.0f)
    Value *SelSign = B.CreateSelect(C: IsOddY, True: X, False: One);
    Value *Ret = B.CreateCopySign(LHS: ExpYLnX, RHS: SelSign);

    // if (x < 0 && !is_integer(y)) ret = QNAN
    Value *IsIntY = emitIsInteger(B, Y);
    Value *condNegX = B.CreateFCmpOLT(LHS: X, RHS: Zero);
    Value *condNotIntY = B.CreateNot(V: IsIntY);
    Value *condNaN = B.CreateAnd(LHS: condNegX, RHS: condNotIntY);
    Ret = B.CreateSelect(C: condNaN, True: QNaN, False: Ret);

    // if (isinf(ay)) { ... }

    // FIXME: Missing backend optimization to save on materialization cost of
    // mixed sign constant infinities.
    Value *YIsInf = emitIsInf(B, val: Y);

    // y != |y| is an unordered-compare way of testing y == -inf here.
    Value *AY = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: Y);
    Value *YIsNegInf = B.CreateFCmpUNE(LHS: Y, RHS: AY);

    // |x| == 1 with infinite y -> 1; otherwise 0 or +inf depending on
    // whether |x| < 1 crossed with the sign of the infinite y.
    Value *AX = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: X);
    Value *AxEqOne = B.CreateFCmpOEQ(LHS: AX, RHS: One);
    Value *AxLtOne = B.CreateFCmpOLT(LHS: AX, RHS: One);
    Value *XorCond = B.CreateXor(LHS: AxLtOne, RHS: YIsNegInf);
    Value *SelInf =
        B.CreateSelect(C: AxEqOne, True: AX, False: B.CreateSelect(C: XorCond, True: Zero, False: AY));
    Ret = B.CreateSelect(C: YIsInf, True: SelInf, False: Ret);

    // if (isinf(ax) || x == 0.0f) { ... }
    Value *XIsInf = emitIsInf(B, val: X);
    Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
    Value *AxInfOrZero = B.CreateOr(LHS: XIsInf, RHS: XEqZero);
    Value *YLtZero = B.CreateFCmpOLT(LHS: Y, RHS: Zero);
    Value *XorZeroInf = B.CreateXor(LHS: XEqZero, RHS: YLtZero);
    Value *SelVal = B.CreateSelect(C: XorZeroInf, True: Zero, False: PInf);
    Value *SelSign2 = B.CreateSelect(C: IsOddY, True: X, False: Zero);
    Value *Copysign = B.CreateCopySign(LHS: SelVal, RHS: SelSign2);
    Ret = B.CreateSelect(C: AxInfOrZero, True: Copysign, False: Ret);

    // if (isunordered(x, y)) ret = QNAN
    Value *isUnordered = B.CreateFCmpUNO(LHS: X, RHS: Y);
    return B.CreateSelect(C: isUnordered, True: QNaN, False: Ret);
  }
  case PowKind::PowR: {
    // powr: domain is x >= 0, so no sign recovery is needed; only the
    // zero/inf base and exponent combinations are patched.
    Value *YIsNeg = B.CreateFCmpOLT(LHS: Y, RHS: Zero);
    Value *IZ = B.CreateSelect(C: YIsNeg, True: PInf, False: Zero);
    Value *ZI = B.CreateSelect(C: YIsNeg, True: Zero, False: PInf);

    // x == 0: NaN for y == 0, else 0^y by sign of y.
    Value *YEqZero = B.CreateFCmpOEQ(LHS: Y, RHS: Zero);
    Value *SelZeroCase = B.CreateSelect(C: YEqZero, True: QNaN, False: IZ);
    Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
    Value *Ret = B.CreateSelect(C: XEqZero, True: SelZeroCase, False: ExpYLnX);

    // x == +inf with nonzero y.
    Value *XEqInf = B.CreateFCmpOEQ(LHS: X, RHS: PInf);
    Value *YNeZero = B.CreateFCmpUNE(LHS: Y, RHS: Zero);
    Value *CondInfCase = B.CreateAnd(LHS: XEqInf, RHS: YNeZero);
    Ret = B.CreateSelect(C: CondInfCase, True: ZI, False: Ret);

    // infinite y with x != 1: result picked by x < 1 crossed with sign of y.
    Value *IsInfY = emitIsInf(B, val: Y);
    Value *XNeOne = B.CreateFCmpUNE(LHS: X, RHS: One);
    Value *CondInfY = B.CreateAnd(LHS: IsInfY, RHS: XNeOne);
    Value *XLtOne = B.CreateFCmpOLT(LHS: X, RHS: One);
    Value *SelInfYCase = B.CreateSelect(C: XLtOne, True: IZ, False: ZI);
    Ret = B.CreateSelect(C: CondInfY, True: SelInfYCase, False: Ret);

    // Any NaN operand poisons the result.
    Value *IsUnordered = B.CreateFCmpUNO(LHS: X, RHS: Y);
    return B.CreateSelect(C: IsUnordered, True: QNaN, False: Ret);
  }
  case PowKind::PowN: {
    // pown: y is an integer, so oddness comes from its low bit.
    Constant *ZeroI = ConstantInt::get(Ty: Y->getType(), V: 0);

    // is_odd_y = (ny & 1) != 0
    Value *OneI = ConstantInt::get(Ty: Y->getType(), V: 1);
    Value *YAnd1 = B.CreateAnd(LHS: Y, RHS: OneI);
    Value *IsOddY = B.CreateICmpNE(LHS: YAnd1, RHS: ZeroI);

    // ret = copysign(expylnx, is_odd_y ? x : 1.0f)
    Value *SelSign = B.CreateSelect(C: IsOddY, True: X, False: One);
    Value *Ret = B.CreateCopySign(LHS: ExpYLnX, RHS: SelSign);

    // if (isinf(x) || x == 0.0f)
    Value *FabsX = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: X);
    Value *XIsInf = B.CreateFCmpOEQ(LHS: FabsX, RHS: PInf);
    Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
    Value *InfOrZero = B.CreateOr(LHS: XIsInf, RHS: XEqZero);

    // (x == 0.0f) ^ (ny < 0) ? 0.0f : +inf
    Value *YLtZero = B.CreateICmpSLT(LHS: Y, RHS: ZeroI);
    Value *XorZeroInf = B.CreateXor(LHS: XEqZero, RHS: YLtZero);
    Value *SelVal = B.CreateSelect(C: XorZeroInf, True: Zero, False: PInf);

    // copysign(selVal, is_odd_y ? x : 0.0f)
    Value *SelSign2 = B.CreateSelect(C: IsOddY, True: X, False: Zero);
    Value *Copysign = B.CreateCopySign(LHS: SelVal, RHS: SelSign2);

    return B.CreateSelect(C: InfOrZero, True: Copysign, False: Ret);
  }
  case PowKind::RootN: {
    // rootn: like pown, plus a QNaN result for even roots of negative x and
    // for n == 0.
    Constant *ZeroI = ConstantInt::get(Ty: Y->getType(), V: 0);

    // is_odd_y = (ny & 1) != 0
    Value *YAnd1 = B.CreateAnd(LHS: Y, RHS: ConstantInt::get(Ty: Y->getType(), V: 1));
    Value *IsOddY = B.CreateICmpNE(LHS: YAnd1, RHS: ZeroI);

    // ret = copysign(expylnx, is_odd_y ? x : 1.0f)
    Value *SelSign = B.CreateSelect(C: IsOddY, True: X, False: One);
    Value *Ret = B.CreateCopySign(LHS: ExpYLnX, RHS: SelSign);

    // if (isinf(x) || x == 0.0f)
    Value *FabsX = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: X);
    Value *IsInfX = B.CreateFCmpOEQ(LHS: FabsX, RHS: PInf);
    Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
    Value *CondInfOrZero = B.CreateOr(LHS: IsInfX, RHS: XEqZero);

    // (x == 0.0f) ^ (ny < 0) ? 0.0f : +inf
    Value *YLtZero = B.CreateICmpSLT(LHS: Y, RHS: ZeroI);
    Value *XorZeroInf = B.CreateXor(LHS: XEqZero, RHS: YLtZero);
    Value *SelVal = B.CreateSelect(C: XorZeroInf, True: Zero, False: PInf);

    // copysign(selVal, is_odd_y ? x : 0.0f)
    Value *SelSign2 = B.CreateSelect(C: IsOddY, True: X, False: Zero);
    Value *Copysign = B.CreateCopySign(LHS: SelVal, RHS: SelSign2);

    Ret = B.CreateSelect(C: CondInfOrZero, True: Copysign, False: Ret);

    // if ((x < 0.0f && !is_odd_y) || ny == 0) ret = QNAN
    Value *XIsNeg = B.CreateFCmpOLT(LHS: X, RHS: Zero);
    Value *NotOddY = B.CreateNot(V: IsOddY);
    Value *CondNegAndNotOdd = B.CreateAnd(LHS: XIsNeg, RHS: NotOddY);
    Value *YEqZero = B.CreateICmpEQ(LHS: Y, RHS: ZeroI);
    Value *CondBad = B.CreateOr(LHS: CondNegAndNotOdd, RHS: YEqZero);
    return B.CreateSelect(C: CondBad, True: QNaN, False: Ret);
  }
  }

  llvm_unreachable("covered switch");
}
1426
// TODO: Move the fold_pow folding to sqrt/fdiv here
// Inline-expand a pow-family call as exp2(y * log2(|x|)) followed by the
// special-case fixup from emitPowFixup.  Only f32 (scalar or vector
// element) is handled.  Returns true iff the call was replaced.
bool AMDGPULibCalls::expandFastPow(FPMathOperator *FPOp, IRBuilder<> &B,
                                   PowKind Kind) {
  Type *Ty = FPOp->getType();

  // There's currently no reason to do this for half. The correct path is
  // promote to float and use the fast float expansion.
  //
  // TODO: We could move this expansion to lowering to get half pow to work.
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  // TODO: Verify optimization for double and bfloat.
  Value *X = FPOp->getOperand(i: 0);
  Value *Y = FPOp->getOperand(i: 1);

  switch (Kind) {
  case PowKind::Pow: {
    Constant *One = ConstantFP::get(Ty: X->getType(), V: 1.0);

    // Pre-clamp the operands so the exp2/log2 core produces the right value
    // for the pow(1, y) == 1 and pow(x, 0) == 1 identities.
    // if (x == 1.0f) y = 1.0f;
    Value *XEqOne = B.CreateFCmpOEQ(LHS: X, RHS: One);
    Y = B.CreateSelect(C: XEqOne, True: One, False: Y);

    // if (y == 0.0f) x = 1.0f;
    Value *YEqZero = B.CreateFCmpOEQ(LHS: Y, RHS: ConstantFP::getZero(Ty: X->getType()));
    X = B.CreateSelect(C: YEqZero, True: One, False: X);

    Value *ExpYLnX = emitFastExpYLnx(B, X, Y);
    Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
    replaceCall(I: FPOp, With: Fixed);
    return true;
  }
  case PowKind::PowR: {
    // powr's domain is x >= 0; negative bases are clamped to QNaN up front.
    Value *NegX = B.CreateFCmpOLT(LHS: X, RHS: ConstantFP::getZero(Ty: X->getType()));
    X = B.CreateSelect(C: NegX, True: ConstantFP::getQNaN(Ty: X->getType()), False: X);

    Value *ExpYLnX = emitFastExpYLnx(B, X, Y);
    Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
    replaceCall(I: FPOp, With: Fixed);
    return true;
  }
  case PowKind::PowN: {
    // ny == 0
    Value *YEqZero = B.CreateICmpEQ(LHS: Y, RHS: ConstantInt::get(Ty: Y->getType(), V: 0));

    // x = (ny == 0 ? 1.0f : x)
    X = B.CreateSelect(C: YEqZero, True: ConstantFP::get(Ty: X->getType(), V: 1.0), False: X);

    // The fp core uses a converted exponent; the fixup keeps the original
    // integer y for its bit tests.
    Value *CastY = B.CreateSIToFP(V: Y, DestTy: X->getType());
    Value *ExpYLnX = emitFastExpYLnx(B, X, Y: CastY);
    Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
    replaceCall(I: FPOp, With: Fixed);
    return true;
  }
  case PowKind::RootN: {
    // rootn(x, n) uses the reciprocal exponent 1/n via amdgcn.rcp.
    Value *CastY = B.CreateSIToFP(V: Y, DestTy: X->getType());
    Value *RcpY = B.CreateUnaryIntrinsic(ID: Intrinsic::amdgcn_rcp, V: CastY);
    Value *ExpYLnX = emitFastExpYLnx(B, X, Y: RcpY);
    Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
    replaceCall(I: FPOp, With: Fixed);
    return true;
  }
  }
  llvm_unreachable("Unhandled PowKind enum");
}
1493
1494bool AMDGPULibCalls::tryOptimizePow(FPMathOperator *FPOp, IRBuilder<> &B,
1495 const FuncInfo &FInfo) {
1496 FastMathFlags FMF = FPOp->getFastMathFlags();
1497 CallInst *Call = cast<CallInst>(Val: FPOp);
1498 Module *M = Call->getModule();
1499
1500 FuncInfo PowrInfo;
1501 AMDGPULibFunc::EFuncId FastPowrFuncId =
1502 FMF.approxFunc() || FInfo.getId() == AMDGPULibFunc::EI_POW_FAST
1503 ? AMDGPULibFunc::EI_POWR_FAST
1504 : AMDGPULibFunc::EI_NONE;
1505 FunctionCallee PowrFunc = getFloatFastVariant(
1506 M, fInfo: FInfo, newInfo&: PowrInfo, NewFunc: AMDGPULibFunc::EI_POWR, FastVariant: FastPowrFuncId);
1507
1508 // TODO: Prefer fast pown to fast powr, but slow powr to slow pown.
1509
1510 // pow(x, y) -> powr(x, y) for x >= -0.0
1511 // TODO: Account for flags on current call
1512 if (PowrFunc && cannotBeOrderedLessThanZero(V: FPOp->getOperand(i: 0),
1513 SQ: SQ.getWithInstruction(I: Call))) {
1514 Call->setCalledFunction(PowrFunc);
1515 return fold_pow(FPOp, B, FInfo: PowrInfo) || true;
1516 }
1517
1518 // pow(x, y) -> pown(x, y) for known integral y
1519 if (isKnownIntegral(V: FPOp->getOperand(i: 1), SQ: SQ.getWithInstruction(I: Call),
1520 FMF: FPOp->getFastMathFlags())) {
1521 FunctionType *PownType = getPownType(FT: Call->getFunctionType());
1522
1523 FuncInfo PownInfo;
1524 AMDGPULibFunc::EFuncId FastPownFuncId =
1525 FMF.approxFunc() || FInfo.getId() == AMDGPULibFunc::EI_POW_FAST
1526 ? AMDGPULibFunc::EI_POWN_FAST
1527 : AMDGPULibFunc::EI_NONE;
1528 FunctionCallee PownFunc = getFloatFastVariant(
1529 M, fInfo: FInfo, newInfo&: PownInfo, NewFunc: AMDGPULibFunc::EI_POWN, FastVariant: FastPownFuncId);
1530
1531 if (PownFunc) {
1532 // TODO: If the incoming integral value is an sitofp/uitofp, it won't
1533 // fold out without a known range. We can probably take the source
1534 // value directly.
1535 Value *CastedArg =
1536 B.CreateFPToSI(V: FPOp->getOperand(i: 1), DestTy: PownType->getParamType(i: 1));
1537 // Have to drop any nofpclass attributes on the original call site.
1538 Call->removeParamAttrs(
1539 ArgNo: 1, AttrsToRemove: AttributeFuncs::typeIncompatible(Ty: CastedArg->getType(),
1540 AS: Call->getParamAttributes(ArgNo: 1)));
1541 Call->setCalledFunction(PownFunc);
1542 Call->setArgOperand(i: 1, v: CastedArg);
1543 return fold_pow(FPOp, B, FInfo: PownInfo) || true;
1544 }
1545 }
1546
1547 if (fold_pow(FPOp, B, FInfo))
1548 return true;
1549
1550 if (!FMF.approxFunc())
1551 return false;
1552
1553 if (FInfo.getId() == AMDGPULibFunc::EI_POW && FMF.approxFunc() &&
1554 getArgType(FInfo) == AMDGPULibFunc::F32) {
1555 AMDGPULibFunc PowFastInfo(AMDGPULibFunc::EI_POW_FAST, FInfo);
1556 if (FunctionCallee PowFastFunc = getFunction(M, fInfo: PowFastInfo)) {
1557 Call->setCalledFunction(PowFastFunc);
1558 return fold_pow(FPOp, B, FInfo: PowFastInfo) || true;
1559 }
1560 }
1561
1562 return expandFastPow(FPOp, B, Kind: PowKind::Pow);
1563}
1564
1565// Get a scalar native builtin single argument FP function
1566FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
1567 const FuncInfo &FInfo) {
1568 if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(id: FInfo.getId()))
1569 return nullptr;
1570 FuncInfo nf = FInfo;
1571 nf.setPrefix(AMDGPULibFunc::NATIVE);
1572 return getFunction(M, fInfo: nf);
1573}
1574
1575// Some library calls are just wrappers around llvm intrinsics, but compiled
1576// conservatively. Preserve the flags from the original call site by
1577// substituting them with direct calls with all the flags.
1578bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
1579 bool AllowMinSizeF32,
1580 bool AllowF64,
1581 bool AllowStrictFP) {
1582 Type *FltTy = CI->getType()->getScalarType();
1583 const bool IsF32 = FltTy->isFloatTy();
1584
1585 // f64 intrinsics aren't implemented for most operations.
1586 if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy()))
1587 return false;
1588
1589 // We're implicitly inlining by replacing the libcall with the intrinsic, so
1590 // don't do it for noinline call sites.
1591 if (CI->isNoInline())
1592 return false;
1593
1594 const Function *ParentF = CI->getFunction();
1595 // TODO: Handle strictfp
1596 if (!AllowStrictFP && ParentF->hasFnAttribute(Kind: Attribute::StrictFP))
1597 return false;
1598
1599 if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
1600 return false;
1601 return true;
1602}
1603
1604void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
1605 CallInst *CI,
1606 Intrinsic::ID IntrID) {
1607 if (CI->arg_size() == 2) {
1608 Value *Arg0 = CI->getArgOperand(i: 0);
1609 Value *Arg1 = CI->getArgOperand(i: 1);
1610 VectorType *Arg0VecTy = dyn_cast<VectorType>(Val: Arg0->getType());
1611 VectorType *Arg1VecTy = dyn_cast<VectorType>(Val: Arg1->getType());
1612 if (Arg0VecTy && !Arg1VecTy) {
1613 Value *SplatRHS = B.CreateVectorSplat(EC: Arg0VecTy->getElementCount(), V: Arg1);
1614 CI->setArgOperand(i: 1, v: SplatRHS);
1615 } else if (!Arg0VecTy && Arg1VecTy) {
1616 Value *SplatLHS = B.CreateVectorSplat(EC: Arg1VecTy->getElementCount(), V: Arg0);
1617 CI->setArgOperand(i: 0, v: SplatLHS);
1618 }
1619 }
1620
1621 CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
1622 M: CI->getModule(), id: IntrID, Tys: {CI->getType()}));
1623}
1624
1625bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
1626 IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32,
1627 bool AllowF64, bool AllowStrictFP) {
1628 if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64,
1629 AllowStrictFP))
1630 return false;
1631 replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
1632 return true;
1633}
1634
// Emit one combined sincos call for \p Arg.
//
// The cosine is returned through a private-memory stack slot, addrspace-cast
// to the pointer type the sincos callee expects.  Returns
// {call result, loaded cos, the sincos call} -- the call's direct return
// value is presumably the sine (matches the sincos library convention;
// verify against the callee's definition).
// NOTE(review): the FMF parameter is currently unused in this body.
std::tuple<Value *, Value *, Value *>
AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
                             FunctionCallee Fsincos) {
  DebugLoc DL = B.getCurrentDebugLocation();
  Function *F = B.GetInsertBlock()->getParent();
  // Place the alloca with the function's other allocas in the entry block.
  B.SetInsertPointPastAllocas(F);

  AllocaInst *Alloc = B.CreateAlloca(Ty: Arg->getType(), ArraySize: nullptr, Name: "__sincos_");

  if (Instruction *ArgInst = dyn_cast<Instruction>(Val: Arg)) {
    // If the argument is an instruction, it must dominate all uses so put our
    // sincos call there. Otherwise, right after the allocas works well enough
    // if it's an argument or constant.

    B.SetInsertPoint(TheBB: ArgInst->getParent(), IP: ++ArgInst->getIterator());

    // SetInsertPoint unwelcomely always tries to set the debug loc.
    B.SetCurrentDebugLocation(DL);
  }

  Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(i: 1);

  // The allocaInst allocates the memory in private address space. This need
  // to be addrspacecasted to point to the address space of cos pointer type.
  // In OpenCL 2.0 this is generic, while in 1.2 that is private.
  Value *CastAlloc = B.CreateAddrSpaceCast(V: Alloc, DestTy: CosPtrTy);

  CallInst *SinCos = CreateCallEx2(B, Callee: Fsincos, Arg1: Arg, Arg2: CastAlloc);

  // TODO: Is it worth trying to preserve the location for the cos calls for the
  // load?

  // Read the cosine back out of the stack slot.
  LoadInst *LoadCos = B.CreateLoad(Ty: Arg->getType(), Ptr: Alloc);
  return {SinCos, LoadCos, SinCos};
}
1670
// fold sin, cos -> sincos.
//
// If the same argument is fed to both a sin call and a cos call (or an
// existing sincos call) within one function, replace them all with a single
// call to the library sincos, which produces both results at once. Fast-math
// flags and !fpmath metadata are intersected across all merged calls, and
// their debug locations are merged.
bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
                                 const FuncInfo &fInfo) {
  assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
         fInfo.getId() == AMDGPULibFunc::EI_COS);

  // Only handle f32/f64 with no mangled prefix (NOPFX); other variants are
  // left alone.
  if ((getArgType(FInfo: fInfo) != AMDGPULibFunc::F32 &&
       getArgType(FInfo: fInfo) != AMDGPULibFunc::F64) ||
      fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
    return false;

  bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;

  Value *CArgVal = FPOp->getOperand(i: 0);

  // TODO: Constant fold the call
  if (isa<ConstantData>(Val: CArgVal))
    return false;

  CallInst *CI = cast<CallInst>(Val: FPOp);

  Function *F = B.GetInsertBlock()->getParent();
  Module *M = F->getParent();

  // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
  // implementation. Prefer the private form if available.
  AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncPrivate.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::PRIVATE_ADDRESS);

  AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncGeneric.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::FLAT_ADDRESS);

  FunctionCallee FSinCosPrivate = getFunction(M, fInfo: SinCosLibFuncPrivate);
  FunctionCallee FSinCosGeneric = getFunction(M, fInfo: SinCosLibFuncGeneric);
  FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
  if (!FSinCos)
    return false;

  // Buckets of mergeable calls found among the argument's users. Note the
  // original call CI itself is one of CArgVal's users, so it lands in the
  // appropriate bucket during the scan below.
  SmallVector<CallInst *> SinCalls;
  SmallVector<CallInst *> CosCalls;
  SmallVector<CallInst *> SinCosCalls;
  FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
                       fInfo);
  const std::string PairName = PartnerInfo.mangle();

  StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
  StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
  const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
  const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();

  // Intersect the two sets of flags.
  FastMathFlags FMF = FPOp->getFastMathFlags();
  MDNode *FPMath = CI->getMetadata(KindID: LLVMContext::MD_fpmath);

  SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};

  // Scan all users of the argument for sin/cos/sincos calls in the same
  // function. Only collect here; replacement happens after the scan so the
  // use list is not modified while being iterated.
  for (User* U : CArgVal->users()) {
    CallInst *XI = dyn_cast<CallInst>(Val: U);
    if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
      continue;

    Function *UCallee = XI->getCalledFunction();
    if (!UCallee)
      continue;

    bool Handled = true;

    if (UCallee->getName() == SinName)
      SinCalls.push_back(Elt: XI);
    else if (UCallee->getName() == CosName)
      CosCalls.push_back(Elt: XI);
    else if (UCallee->getName() == SinCosPrivateName ||
             UCallee->getName() == SinCosGenericName)
      SinCosCalls.push_back(Elt: XI);
    else
      Handled = false;

    if (Handled) {
      // Merge the new call's debug location, fast-math flags and !fpmath
      // metadata into the combined state for the merged sincos call.
      MergeDbgLocs.push_back(Elt: XI->getDebugLoc());
      auto *OtherOp = cast<FPMathOperator>(Val: XI);
      FMF &= OtherOp->getFastMathFlags();
      FPMath = MDNode::getMostGenericFPMath(
          A: FPMath, B: XI->getMetadata(KindID: LLVMContext::MD_fpmath));
    }
  }

  // Merging only pays off when both a sin and a cos of this argument exist.
  if (SinCalls.empty() || CosCalls.empty())
    return false;

  B.setFastMathFlags(FMF);
  B.setDefaultFPMathTag(FPMath);
  DILocation *DbgLoc = DILocation::getMergedLocations(Locs: MergeDbgLocs);
  B.SetCurrentDebugLocation(DbgLoc);

  auto [Sin, Cos, SinCos] = insertSinCos(Arg: CArgVal, FMF, B, Fsincos: FSinCos);

  auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
    for (CallInst *C : Calls)
      C->replaceAllUsesWith(V: Res);

    // Leave the other dead instructions to avoid clobbering iterators.
  };

  replaceTrigInsts(SinCalls, Sin);
  replaceTrigInsts(CosCalls, Cos);
  replaceTrigInsts(SinCosCalls, SinCos);

  // It's safe to delete the original now.
  CI->eraseFromParent();
  return true;
}
1784
// Constant-evaluate a single scalar element of a supported math library
// function. copr0/copr1 are the (possibly null) constant operands. On success
// the result is stored in Res0 (and Res1 as well for sincos) and true is
// returned; unknown functions, or pown/rootn without a constant integer
// exponent, return false.
bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0,
                                            double &Res1, Constant *copr0,
                                            Constant *copr1) {
  // opr0/opr1 hold the float/double operand values widened to double.
  // Operands that are not float/double (the integer exponent of pown/rootn)
  // are handled separately in the relevant case below.
  double opr0 = 0.0, opr1 = 0.0;
  ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(Val: copr0);
  ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(Val: copr1);
  if (fpopr0) {
    opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
               ? fpopr0->getValueAPF().convertToDouble()
               : (double)fpopr0->getValueAPF().convertToFloat();
  }

  if (fpopr1) {
    opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
               ? fpopr1->getValueAPF().convertToDouble()
               : (double)fpopr1->getValueAPF().convertToFloat();
  }

  switch (FInfo.getId()) {
  default : return false;

  case AMDGPULibFunc::EI_ACOS:
    Res0 = acos(x: opr0);
    return true;

  case AMDGPULibFunc::EI_ACOSH:
    // acosh(x) == log(x + sqrt(x*x - 1))
    Res0 = log(x: opr0 + sqrt(x: opr0*opr0 - 1.0));
    return true;

  case AMDGPULibFunc::EI_ACOSPI:
    Res0 = acos(x: opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_ASIN:
    Res0 = asin(x: opr0);
    return true;

  case AMDGPULibFunc::EI_ASINH:
    // asinh(x) == log(x + sqrt(x*x + 1))
    Res0 = log(x: opr0 + sqrt(x: opr0*opr0 + 1.0));
    return true;

  case AMDGPULibFunc::EI_ASINPI:
    Res0 = asin(x: opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_ATAN:
    Res0 = atan(x: opr0);
    return true;

  case AMDGPULibFunc::EI_ATANH:
    // atanh(x) == (log(x+1) - log(x-1))/2;
    Res0 = (log(x: opr0 + 1.0) - log(x: opr0 - 1.0))/2.0;
    return true;

  case AMDGPULibFunc::EI_ATANPI:
    Res0 = atan(x: opr0) / MATH_PI;
    return true;

  case AMDGPULibFunc::EI_CBRT:
    // cbrt emulated via pow; negate around pow since pow of a negative base
    // with fractional exponent is a domain error.
    Res0 = (opr0 < 0.0) ? -pow(x: -opr0, y: 1.0/3.0) : pow(x: opr0, y: 1.0/3.0);
    return true;

  case AMDGPULibFunc::EI_COS:
    Res0 = cos(x: opr0);
    return true;

  case AMDGPULibFunc::EI_COSH:
    Res0 = cosh(x: opr0);
    return true;

  case AMDGPULibFunc::EI_COSPI:
    Res0 = cos(MATH_PI * opr0);
    return true;

  case AMDGPULibFunc::EI_EXP:
    Res0 = exp(x: opr0);
    return true;

  case AMDGPULibFunc::EI_EXP2:
    Res0 = pow(x: 2.0, y: opr0);
    return true;

  case AMDGPULibFunc::EI_EXP10:
    Res0 = pow(x: 10.0, y: opr0);
    return true;

  case AMDGPULibFunc::EI_LOG:
    Res0 = log(x: opr0);
    return true;

  case AMDGPULibFunc::EI_LOG2:
    Res0 = log(x: opr0) / log(x: 2.0);
    return true;

  case AMDGPULibFunc::EI_LOG10:
    Res0 = log(x: opr0) / log(x: 10.0);
    return true;

  case AMDGPULibFunc::EI_RSQRT:
    Res0 = 1.0 / sqrt(x: opr0);
    return true;

  case AMDGPULibFunc::EI_SIN:
    Res0 = sin(x: opr0);
    return true;

  case AMDGPULibFunc::EI_SINH:
    Res0 = sinh(x: opr0);
    return true;

  case AMDGPULibFunc::EI_SINPI:
    Res0 = sin(MATH_PI * opr0);
    return true;

  case AMDGPULibFunc::EI_TAN:
    Res0 = tan(x: opr0);
    return true;

  case AMDGPULibFunc::EI_TANH:
    Res0 = tanh(x: opr0);
    return true;

  case AMDGPULibFunc::EI_TANPI:
    Res0 = tan(MATH_PI * opr0);
    return true;

  // two-arg functions
  case AMDGPULibFunc::EI_POW:
  case AMDGPULibFunc::EI_POWR:
    Res0 = pow(x: opr0, y: opr1);
    return true;

  case AMDGPULibFunc::EI_POWN: {
    // pown takes an integer exponent; require it to be a constant int.
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(x: opr0, y: val);
      return true;
    }
    return false;
  }

  case AMDGPULibFunc::EI_ROOTN: {
    // rootn(x, n) == x^(1/n); n must be a constant int.
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(x: opr0, y: 1.0 / val);
      return true;
    }
    return false;
  }

  // with ptr arg
  case AMDGPULibFunc::EI_SINCOS:
    Res0 = sin(x: opr0);
    Res1 = cos(x: opr0);
    return true;
  }

  return false;
}
1949
1950bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
1951 int numArgs = (int)aCI->arg_size();
1952 if (numArgs > 3)
1953 return false;
1954
1955 Constant *copr0 = nullptr;
1956 Constant *copr1 = nullptr;
1957 if (numArgs > 0) {
1958 if ((copr0 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: 0))) == nullptr)
1959 return false;
1960 }
1961
1962 if (numArgs > 1) {
1963 if ((copr1 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: 1))) == nullptr) {
1964 if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
1965 return false;
1966 }
1967 }
1968
1969 // At this point, all arguments to aCI are constants.
1970
1971 // max vector size is 16, and sincos will generate two results.
1972 double DVal0[16], DVal1[16];
1973 int FuncVecSize = getVecSize(FInfo);
1974 bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
1975 if (FuncVecSize == 1) {
1976 if (!evaluateScalarMathFunc(FInfo, Res0&: DVal0[0], Res1&: DVal1[0], copr0, copr1)) {
1977 return false;
1978 }
1979 } else {
1980 ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(Val: copr0);
1981 ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(Val: copr1);
1982 for (int i = 0; i < FuncVecSize; ++i) {
1983 Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
1984 Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
1985 if (!evaluateScalarMathFunc(FInfo, Res0&: DVal0[i], Res1&: DVal1[i], copr0: celt0, copr1: celt1)) {
1986 return false;
1987 }
1988 }
1989 }
1990
1991 LLVMContext &context = aCI->getContext();
1992 Constant *nval0, *nval1;
1993 if (FuncVecSize == 1) {
1994 nval0 = ConstantFP::get(Ty: aCI->getType(), V: DVal0[0]);
1995 if (hasTwoResults)
1996 nval1 = ConstantFP::get(Ty: aCI->getType(), V: DVal1[0]);
1997 } else {
1998 if (getArgType(FInfo) == AMDGPULibFunc::F32) {
1999 SmallVector <float, 0> FVal0, FVal1;
2000 for (int i = 0; i < FuncVecSize; ++i)
2001 FVal0.push_back(Elt: (float)DVal0[i]);
2002 ArrayRef<float> tmp0(FVal0);
2003 nval0 = ConstantDataVector::get(Context&: context, Elts: tmp0);
2004 if (hasTwoResults) {
2005 for (int i = 0; i < FuncVecSize; ++i)
2006 FVal1.push_back(Elt: (float)DVal1[i]);
2007 ArrayRef<float> tmp1(FVal1);
2008 nval1 = ConstantDataVector::get(Context&: context, Elts: tmp1);
2009 }
2010 } else {
2011 ArrayRef<double> tmp0(DVal0);
2012 nval0 = ConstantDataVector::get(Context&: context, Elts: tmp0);
2013 if (hasTwoResults) {
2014 ArrayRef<double> tmp1(DVal1);
2015 nval1 = ConstantDataVector::get(Context&: context, Elts: tmp1);
2016 }
2017 }
2018 }
2019
2020 if (hasTwoResults) {
2021 // sincos
2022 assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
2023 "math function with ptr arg not supported yet");
2024 new StoreInst(nval1, aCI->getArgOperand(i: 1), aCI->getIterator());
2025 }
2026
2027 replaceCall(I: aCI, With: nval0);
2028 return true;
2029}
2030
2031PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
2032 FunctionAnalysisManager &AM) {
2033 AMDGPULibCalls Simplifier(F, AM);
2034 Simplifier.initNativeFuncs();
2035
2036 bool Changed = false;
2037
2038 LLVM_DEBUG(dbgs() << "AMDIC: process function ";
2039 F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
2040
2041 for (auto &BB : F) {
2042 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
2043 // Ignore non-calls.
2044 CallInst *CI = dyn_cast<CallInst>(Val&: I);
2045 ++I;
2046
2047 if (CI) {
2048 if (Simplifier.fold(CI))
2049 Changed = true;
2050 }
2051 }
2052 }
2053 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
2054}
2055
2056PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
2057 FunctionAnalysisManager &AM) {
2058 if (UseNative.empty())
2059 return PreservedAnalyses::all();
2060
2061 AMDGPULibCalls Simplifier(F, AM);
2062 Simplifier.initNativeFuncs();
2063
2064 bool Changed = false;
2065 for (auto &BB : F) {
2066 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
2067 // Ignore non-calls.
2068 CallInst *CI = dyn_cast<CallInst>(Val&: I);
2069 ++I;
2070 if (CI && Simplifier.useNative(aCI: CI))
2071 Changed = true;
2072 }
2073 }
2074 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
2075}
2076