1//===- AMDGPULibCalls.cpp -------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file does AMD library function optimizations.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "AMDGPULibFunc.h"
16#include "llvm/Analysis/AssumptionCache.h"
17#include "llvm/Analysis/TargetLibraryInfo.h"
18#include "llvm/Analysis/ValueTracking.h"
19#include "llvm/IR/AttributeMask.h"
20#include "llvm/IR/Dominators.h"
21#include "llvm/IR/IRBuilder.h"
22#include "llvm/IR/IntrinsicsAMDGPU.h"
23#include "llvm/IR/MDBuilder.h"
24#include "llvm/IR/PatternMatch.h"
25#include <cmath>
26
27#define DEBUG_TYPE "amdgpu-simplifylib"
28
29using namespace llvm;
30using namespace llvm::PatternMatch;
31
32static cl::opt<bool> EnablePreLink("amdgpu-prelink",
33 cl::desc("Enable pre-link mode optimizations"),
34 cl::init(Val: false),
35 cl::Hidden);
36
37static cl::list<std::string> UseNative("amdgpu-use-native",
38 cl::desc("Comma separated list of functions to replace with native, or all"),
39 cl::CommaSeparated, cl::ValueOptional,
40 cl::Hidden);
41
42#define MATH_PI numbers::pi
43#define MATH_E numbers::e
44#define MATH_SQRT2 numbers::sqrt2
45#define MATH_SQRT1_2 numbers::inv_sqrt2
46
/// Identifies which member of the pow family a transformation is handling.
enum class PowKind {
  Pow,  // pow
  PowR, // powr
  PowN, // pown
  RootN // rootn
};
48
49namespace llvm {
50
51class AMDGPULibCalls {
52private:
53 SimplifyQuery SQ;
54
55 using FuncInfo = llvm::AMDGPULibFunc;
56
57 // -fuse-native.
58 bool AllNative = false;
59
60 bool useNativeFunc(const StringRef F) const;
61
62 // Return a pointer (pointer expr) to the function if function definition with
63 // "FuncName" exists. It may create a new function prototype in pre-link mode.
64 FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);
65
66 /// Wrapper around getFunction which tries to use a faster variant if
67 /// available, and falls back to a less fast option.
68 ///
69 /// Return a replacement function for \p fInfo that has float-typed fast
70 /// variants. \p NewFunc is a base replacement function to use. \p
71 /// NewFuncFastVariant is a faster version to use if the calling context knows
72 /// it's legal. If there is no fast variant to use, \p NewFuncFastVariant
73 /// should be EI_NONE.
74 FunctionCallee getFloatFastVariant(Module *M, const FuncInfo &fInfo,
75 FuncInfo &newInfo,
76 AMDGPULibFunc::EFuncId NewFunc,
77 AMDGPULibFunc::EFuncId NewFuncFastVariant);
78
79 bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);
80
81 bool TDOFold(CallInst *CI, const FuncInfo &FInfo);
82
83 /* Specialized optimizations */
84
85 // pow/powr/pown
86 bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
87
88 /// Peform a fast math expansion of pow, powr, pown or rootn.
89 bool expandFastPow(FPMathOperator *FPOp, IRBuilder<> &B, PowKind Kind);
90
91 bool tryOptimizePow(FPMathOperator *FPOp, IRBuilder<> &B,
92 const FuncInfo &FInfo);
93
94 // rootn
95 bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
96
97 // -fuse-native for sincos
98 bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);
99
100 // evaluate calls if calls' arguments are constants.
101 bool evaluateScalarMathFunc(const FuncInfo &FInfo, APFloat &Res0,
102 APFloat &Res1, Constant *copr0, Constant *copr1);
103 bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);
104
105 /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value
106 /// of cos, sincos call).
107 std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
108 FastMathFlags FMF,
109 IRBuilder<> &B,
110 FunctionCallee Fsincos);
111
112 // sin/cos
113 bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
114
115 // __read_pipe/__write_pipe
116 bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
117 const FuncInfo &FInfo);
118
119 // Get a scalar native builtin single argument FP function
120 FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
121
122 /// Substitute a call to a known libcall with an intrinsic call. If \p
123 /// AllowMinSize is true, allow the replacement in a minsize function.
124 bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
125 bool AllowMinSizeF32 = false,
126 bool AllowF64 = false,
127 bool AllowStrictFP = false);
128 void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
129 Intrinsic::ID IntrID);
130
131 bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
132 Intrinsic::ID IntrID,
133 bool AllowMinSizeF32 = false,
134 bool AllowF64 = false,
135 bool AllowStrictFP = false);
136
137protected:
138 bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;
139
140 bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;
141
142 static void replaceCall(Instruction *I, Value *With) {
143 I->replaceAllUsesWith(V: With);
144 I->eraseFromParent();
145 }
146
147 static void replaceCall(FPMathOperator *I, Value *With) {
148 replaceCall(I: cast<Instruction>(Val: I), With);
149 }
150
151public:
152 AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM);
153
154 bool fold(CallInst *CI);
155
156 void initNativeFuncs();
157
158 // Replace a normal math function call with that native version
159 bool useNative(CallInst *CI);
160};
161
162} // end namespace llvm
163
164template <typename IRB>
165static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
166 const Twine &Name = "") {
167 CallInst *R = B.CreateCall(Callee, Arg, Name);
168 if (Function *F = dyn_cast<Function>(Val: Callee.getCallee()))
169 R->setCallingConv(F->getCallingConv());
170 return R;
171}
172
173template <typename IRB>
174static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
175 Value *Arg2, const Twine &Name = "") {
176 CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
177 if (Function *F = dyn_cast<Function>(Val: Callee.getCallee()))
178 R->setCallingConv(F->getCallingConv());
179 return R;
180}
181
182static FunctionType *getPownType(FunctionType *FT) {
183 Type *PowNExpTy = Type::getInt32Ty(C&: FT->getContext());
184 if (VectorType *VecTy = dyn_cast<VectorType>(Val: FT->getReturnType()))
185 PowNExpTy = VectorType::get(ElementType: PowNExpTy, EC: VecTy->getElementCount());
186
187 return FunctionType::get(Result: FT->getReturnType(),
188 Params: {FT->getParamType(i: 0), PowNExpTy}, isVarArg: false);
189}
190
// Data structures for table-driven optimizations.
// The tables serve both f32 and f64 functions that take one input argument.

/// One row of a folding table: calling the function on `input` yields
/// `result`.
struct TableEntry {
  double result; // value the call folds to
  double input;  // argument value that must match exactly
};
198
199/* a list of {result, input} */
200static const TableEntry tbl_acos[] = {
201 {MATH_PI / 2.0, .input: 0.0},
202 {MATH_PI / 2.0, .input: -0.0},
203 {.result: 0.0, .input: 1.0},
204 {MATH_PI, .input: -1.0}
205};
206static const TableEntry tbl_acosh[] = {
207 {.result: 0.0, .input: 1.0}
208};
209static const TableEntry tbl_acospi[] = {
210 {.result: 0.5, .input: 0.0},
211 {.result: 0.5, .input: -0.0},
212 {.result: 0.0, .input: 1.0},
213 {.result: 1.0, .input: -1.0}
214};
215static const TableEntry tbl_asin[] = {
216 {.result: 0.0, .input: 0.0},
217 {.result: -0.0, .input: -0.0},
218 {MATH_PI / 2.0, .input: 1.0},
219 {.result: -MATH_PI / 2.0, .input: -1.0}
220};
221static const TableEntry tbl_asinh[] = {
222 {.result: 0.0, .input: 0.0},
223 {.result: -0.0, .input: -0.0}
224};
225static const TableEntry tbl_asinpi[] = {
226 {.result: 0.0, .input: 0.0},
227 {.result: -0.0, .input: -0.0},
228 {.result: 0.5, .input: 1.0},
229 {.result: -0.5, .input: -1.0}
230};
231static const TableEntry tbl_atan[] = {
232 {.result: 0.0, .input: 0.0},
233 {.result: -0.0, .input: -0.0},
234 {MATH_PI / 4.0, .input: 1.0},
235 {.result: -MATH_PI / 4.0, .input: -1.0}
236};
237static const TableEntry tbl_atanh[] = {
238 {.result: 0.0, .input: 0.0},
239 {.result: -0.0, .input: -0.0}
240};
241static const TableEntry tbl_atanpi[] = {
242 {.result: 0.0, .input: 0.0},
243 {.result: -0.0, .input: -0.0},
244 {.result: 0.25, .input: 1.0},
245 {.result: -0.25, .input: -1.0}
246};
247static const TableEntry tbl_cbrt[] = {
248 {.result: 0.0, .input: 0.0},
249 {.result: -0.0, .input: -0.0},
250 {.result: 1.0, .input: 1.0},
251 {.result: -1.0, .input: -1.0},
252};
253static const TableEntry tbl_cos[] = {
254 {.result: 1.0, .input: 0.0},
255 {.result: 1.0, .input: -0.0}
256};
257static const TableEntry tbl_cosh[] = {
258 {.result: 1.0, .input: 0.0},
259 {.result: 1.0, .input: -0.0}
260};
261static const TableEntry tbl_cospi[] = {
262 {.result: 1.0, .input: 0.0},
263 {.result: 1.0, .input: -0.0}
264};
265static const TableEntry tbl_erfc[] = {
266 {.result: 1.0, .input: 0.0},
267 {.result: 1.0, .input: -0.0}
268};
269static const TableEntry tbl_erf[] = {
270 {.result: 0.0, .input: 0.0},
271 {.result: -0.0, .input: -0.0}
272};
273static const TableEntry tbl_exp[] = {
274 {.result: 1.0, .input: 0.0},
275 {.result: 1.0, .input: -0.0},
276 {MATH_E, .input: 1.0}
277};
278static const TableEntry tbl_exp2[] = {
279 {.result: 1.0, .input: 0.0},
280 {.result: 1.0, .input: -0.0},
281 {.result: 2.0, .input: 1.0}
282};
283static const TableEntry tbl_exp10[] = {
284 {.result: 1.0, .input: 0.0},
285 {.result: 1.0, .input: -0.0},
286 {.result: 10.0, .input: 1.0}
287};
288static const TableEntry tbl_expm1[] = {
289 {.result: 0.0, .input: 0.0},
290 {.result: -0.0, .input: -0.0}
291};
292static const TableEntry tbl_log[] = {
293 {.result: 0.0, .input: 1.0},
294 {.result: 1.0, MATH_E}
295};
296static const TableEntry tbl_log2[] = {
297 {.result: 0.0, .input: 1.0},
298 {.result: 1.0, .input: 2.0}
299};
300static const TableEntry tbl_log10[] = {
301 {.result: 0.0, .input: 1.0},
302 {.result: 1.0, .input: 10.0}
303};
304static const TableEntry tbl_rsqrt[] = {
305 {.result: 1.0, .input: 1.0},
306 {MATH_SQRT1_2, .input: 2.0}
307};
308static const TableEntry tbl_sin[] = {
309 {.result: 0.0, .input: 0.0},
310 {.result: -0.0, .input: -0.0}
311};
312static const TableEntry tbl_sinh[] = {
313 {.result: 0.0, .input: 0.0},
314 {.result: -0.0, .input: -0.0}
315};
316static const TableEntry tbl_sinpi[] = {
317 {.result: 0.0, .input: 0.0},
318 {.result: -0.0, .input: -0.0}
319};
320static const TableEntry tbl_sqrt[] = {
321 {.result: 0.0, .input: 0.0},
322 {.result: 1.0, .input: 1.0},
323 {MATH_SQRT2, .input: 2.0}
324};
325static const TableEntry tbl_tan[] = {
326 {.result: 0.0, .input: 0.0},
327 {.result: -0.0, .input: -0.0}
328};
329static const TableEntry tbl_tanh[] = {
330 {.result: 0.0, .input: 0.0},
331 {.result: -0.0, .input: -0.0}
332};
333static const TableEntry tbl_tanpi[] = {
334 {.result: 0.0, .input: 0.0},
335 {.result: -0.0, .input: -0.0}
336};
337static const TableEntry tbl_tgamma[] = {
338 {.result: 1.0, .input: 1.0},
339 {.result: 1.0, .input: 2.0},
340 {.result: 2.0, .input: 3.0},
341 {.result: 6.0, .input: 4.0}
342};
343
344static bool HasNative(AMDGPULibFunc::EFuncId id) {
345 switch(id) {
346 case AMDGPULibFunc::EI_DIVIDE:
347 case AMDGPULibFunc::EI_COS:
348 case AMDGPULibFunc::EI_EXP:
349 case AMDGPULibFunc::EI_EXP2:
350 case AMDGPULibFunc::EI_EXP10:
351 case AMDGPULibFunc::EI_LOG:
352 case AMDGPULibFunc::EI_LOG2:
353 case AMDGPULibFunc::EI_LOG10:
354 case AMDGPULibFunc::EI_POWR:
355 case AMDGPULibFunc::EI_RECIP:
356 case AMDGPULibFunc::EI_RSQRT:
357 case AMDGPULibFunc::EI_SIN:
358 case AMDGPULibFunc::EI_SINCOS:
359 case AMDGPULibFunc::EI_SQRT:
360 case AMDGPULibFunc::EI_TAN:
361 return true;
362 default:;
363 }
364 return false;
365}
366
367using TableRef = ArrayRef<TableEntry>;
368
369static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
370 switch(id) {
371 case AMDGPULibFunc::EI_ACOS: return TableRef(tbl_acos);
372 case AMDGPULibFunc::EI_ACOSH: return TableRef(tbl_acosh);
373 case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi);
374 case AMDGPULibFunc::EI_ASIN: return TableRef(tbl_asin);
375 case AMDGPULibFunc::EI_ASINH: return TableRef(tbl_asinh);
376 case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi);
377 case AMDGPULibFunc::EI_ATAN: return TableRef(tbl_atan);
378 case AMDGPULibFunc::EI_ATANH: return TableRef(tbl_atanh);
379 case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi);
380 case AMDGPULibFunc::EI_CBRT: return TableRef(tbl_cbrt);
381 case AMDGPULibFunc::EI_NCOS:
382 case AMDGPULibFunc::EI_COS: return TableRef(tbl_cos);
383 case AMDGPULibFunc::EI_COSH: return TableRef(tbl_cosh);
384 case AMDGPULibFunc::EI_COSPI: return TableRef(tbl_cospi);
385 case AMDGPULibFunc::EI_ERFC: return TableRef(tbl_erfc);
386 case AMDGPULibFunc::EI_ERF: return TableRef(tbl_erf);
387 case AMDGPULibFunc::EI_EXP: return TableRef(tbl_exp);
388 case AMDGPULibFunc::EI_NEXP2:
389 case AMDGPULibFunc::EI_EXP2: return TableRef(tbl_exp2);
390 case AMDGPULibFunc::EI_EXP10: return TableRef(tbl_exp10);
391 case AMDGPULibFunc::EI_EXPM1: return TableRef(tbl_expm1);
392 case AMDGPULibFunc::EI_LOG: return TableRef(tbl_log);
393 case AMDGPULibFunc::EI_NLOG2:
394 case AMDGPULibFunc::EI_LOG2: return TableRef(tbl_log2);
395 case AMDGPULibFunc::EI_LOG10: return TableRef(tbl_log10);
396 case AMDGPULibFunc::EI_NRSQRT:
397 case AMDGPULibFunc::EI_RSQRT: return TableRef(tbl_rsqrt);
398 case AMDGPULibFunc::EI_NSIN:
399 case AMDGPULibFunc::EI_SIN: return TableRef(tbl_sin);
400 case AMDGPULibFunc::EI_SINH: return TableRef(tbl_sinh);
401 case AMDGPULibFunc::EI_SINPI: return TableRef(tbl_sinpi);
402 case AMDGPULibFunc::EI_NSQRT:
403 case AMDGPULibFunc::EI_SQRT: return TableRef(tbl_sqrt);
404 case AMDGPULibFunc::EI_TAN: return TableRef(tbl_tan);
405 case AMDGPULibFunc::EI_TANH: return TableRef(tbl_tanh);
406 case AMDGPULibFunc::EI_TANPI: return TableRef(tbl_tanpi);
407 case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma);
408 default:;
409 }
410 return TableRef();
411}
412
413static inline int getVecSize(const AMDGPULibFunc& FInfo) {
414 return FInfo.getLeads()[0].VectorSize;
415}
416
417static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
418 return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
419}
420
421FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
422 // If we are doing PreLinkOpt, the function is external. So it is safe to
423 // use getOrInsertFunction() at this stage.
424
425 return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
426 : AMDGPULibFunc::getFunction(M, fInfo);
427}
428
429FunctionCallee AMDGPULibCalls::getFloatFastVariant(
430 Module *M, const FuncInfo &fInfo, FuncInfo &newInfo,
431 AMDGPULibFunc::EFuncId NewFunc, AMDGPULibFunc::EFuncId FastVariant) {
432 assert(NewFunc != FastVariant);
433
434 if (FastVariant != AMDGPULibFunc::EI_NONE &&
435 getArgType(FInfo: fInfo) == AMDGPULibFunc::F32) {
436 newInfo = AMDGPULibFunc(FastVariant, fInfo);
437 if (FunctionCallee NewCallee = getFunction(M, fInfo: newInfo))
438 return NewCallee;
439 }
440
441 newInfo = AMDGPULibFunc(NewFunc, fInfo);
442 return getFunction(M, fInfo: newInfo);
443}
444
445bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
446 FuncInfo &FInfo) {
447 return AMDGPULibFunc::parse(MangledName: FMangledName, Ptr&: FInfo);
448}
449
450bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
451 return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs();
452}
453
454bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
455 const FPMathOperator *FPOp) const {
456 // TODO: Refine to approxFunc or contract
457 return FPOp->isFast();
458}
459
460AMDGPULibCalls::AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM)
461 : SQ(F.getParent()->getDataLayout(),
462 &FAM.getResult<TargetLibraryAnalysis>(IR&: F),
463 FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F),
464 &FAM.getResult<AssumptionAnalysis>(IR&: F)) {}
465
466bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
467 return AllNative || llvm::is_contained(Range&: UseNative, Element: F);
468}
469
470void AMDGPULibCalls::initNativeFuncs() {
471 AllNative = useNativeFunc(F: "all") ||
472 (UseNative.getNumOccurrences() && UseNative.size() == 1 &&
473 UseNative.begin()->empty());
474}
475
476bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
477 bool native_sin = useNativeFunc(F: "sin");
478 bool native_cos = useNativeFunc(F: "cos");
479
480 if (native_sin && native_cos) {
481 Module *M = aCI->getModule();
482 Value *opr0 = aCI->getArgOperand(i: 0);
483
484 AMDGPULibFunc nf;
485 nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
486 nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;
487
488 nf.setPrefix(AMDGPULibFunc::NATIVE);
489 nf.setId(AMDGPULibFunc::EI_SIN);
490 FunctionCallee sinExpr = getFunction(M, fInfo: nf);
491
492 nf.setPrefix(AMDGPULibFunc::NATIVE);
493 nf.setId(AMDGPULibFunc::EI_COS);
494 FunctionCallee cosExpr = getFunction(M, fInfo: nf);
495 if (sinExpr && cosExpr) {
496 Value *sinval =
497 CallInst::Create(Func: sinExpr, Args: opr0, NameStr: "splitsin", InsertBefore: aCI->getIterator());
498 Value *cosval =
499 CallInst::Create(Func: cosExpr, Args: opr0, NameStr: "splitcos", InsertBefore: aCI->getIterator());
500 new StoreInst(cosval, aCI->getArgOperand(i: 1), aCI->getIterator());
501
502 DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
503 << " with native version of sin/cos");
504
505 replaceCall(I: aCI, With: sinval);
506 return true;
507 }
508 }
509 return false;
510}
511
512bool AMDGPULibCalls::useNative(CallInst *aCI) {
513 Function *Callee = aCI->getCalledFunction();
514 if (!Callee || aCI->isNoBuiltin())
515 return false;
516
517 FuncInfo FInfo;
518 if (!parseFunctionName(FMangledName: Callee->getName(), FInfo) || !FInfo.isMangled() ||
519 FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
520 getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(id: FInfo.getId()) ||
521 !(AllNative || useNativeFunc(F: FInfo.getName()))) {
522 return false;
523 }
524
525 if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
526 return sincosUseNative(aCI, FInfo);
527
528 FInfo.setPrefix(AMDGPULibFunc::NATIVE);
529 FunctionCallee F = getFunction(M: aCI->getModule(), fInfo: FInfo);
530 if (!F)
531 return false;
532
533 aCI->setCalledFunction(F);
534 DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
535 << " with native version");
536 return true;
537}
538
539// Clang emits call of __read_pipe_2 or __read_pipe_4 for OpenCL read_pipe
540// builtin, with appended type size and alignment arguments, where 2 or 4
541// indicates the original number of arguments. The library has optimized version
542// of __read_pipe_2/__read_pipe_4 when the type size and alignment has the same
543// power of 2 value. This function transforms __read_pipe_2 to __read_pipe_2_N
544// for such cases where N is the size in bytes of the type (N = 1, 2, 4, 8, ...,
545// 128). The same for __read_pipe_4, write_pipe_2, and write_pipe_4.
546bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
547 const FuncInfo &FInfo) {
548 auto *Callee = CI->getCalledFunction();
549 if (!Callee->isDeclaration())
550 return false;
551
552 assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
553 auto *M = Callee->getParent();
554 std::string Name = std::string(Callee->getName());
555 auto NumArg = CI->arg_size();
556 if (NumArg != 4 && NumArg != 6)
557 return false;
558 ConstantInt *PacketSize =
559 dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - 2));
560 ConstantInt *PacketAlign =
561 dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - 1));
562 if (!PacketSize || !PacketAlign)
563 return false;
564
565 unsigned Size = PacketSize->getZExtValue();
566 Align Alignment = PacketAlign->getAlignValue();
567 if (Alignment != Size)
568 return false;
569
570 unsigned PtrArgLoc = CI->arg_size() - 3;
571 Value *PtrArg = CI->getArgOperand(i: PtrArgLoc);
572 Type *PtrTy = PtrArg->getType();
573
574 SmallVector<llvm::Type *, 6> ArgTys;
575 for (unsigned I = 0; I != PtrArgLoc; ++I)
576 ArgTys.push_back(Elt: CI->getArgOperand(i: I)->getType());
577 ArgTys.push_back(Elt: PtrTy);
578
579 Name = Name + "_" + std::to_string(val: Size);
580 auto *FTy = FunctionType::get(Result: Callee->getReturnType(),
581 Params: ArrayRef<Type *>(ArgTys), isVarArg: false);
582 AMDGPULibFunc NewLibFunc(Name, FTy);
583 FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, fInfo: NewLibFunc);
584 if (!F)
585 return false;
586
587 SmallVector<Value *, 6> Args;
588 for (unsigned I = 0; I != PtrArgLoc; ++I)
589 Args.push_back(Elt: CI->getArgOperand(i: I));
590 Args.push_back(Elt: PtrArg);
591
592 auto *NCI = B.CreateCall(Callee: F, Args);
593 NCI->setAttributes(CI->getAttributes());
594 CI->replaceAllUsesWith(V: NCI);
595 CI->dropAllReferences();
596 CI->eraseFromParent();
597
598 return true;
599}
600
601// This function returns false if no change; return true otherwise.
602bool AMDGPULibCalls::fold(CallInst *CI) {
603 Function *Callee = CI->getCalledFunction();
604 // Ignore indirect calls.
605 if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
606 return false;
607
608 FuncInfo FInfo;
609 if (!parseFunctionName(FMangledName: Callee->getName(), FInfo))
610 return false;
611
612 // Further check the number of arguments to see if they match.
613 // TODO: Check calling convention matches too
614 if (!FInfo.isCompatibleSignature(M: *Callee->getParent(), FuncTy: CI->getFunctionType()))
615 return false;
616
617 LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');
618
619 if (TDOFold(CI, FInfo))
620 return true;
621
622 IRBuilder<> B(CI);
623 if (CI->isStrictFP())
624 B.setIsFPConstrained(true);
625
626 if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: CI)) {
627 // Under unsafe-math, evaluate calls if possible.
628 // According to Brian Sumner, we can do this for all f32 function calls
629 // using host's double function calls.
630 if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(aCI: CI, FInfo))
631 return true;
632
633 // Copy fast flags from the original call.
634 FastMathFlags FMF = FPOp->getFastMathFlags();
635 B.setFastMathFlags(FMF);
636
637 // Specialized optimizations for each function call.
638 //
639 // TODO: Handle native functions
640 switch (FInfo.getId()) {
641 case AMDGPULibFunc::EI_EXP:
642 if (FMF.none())
643 return false;
644 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::exp,
645 AllowMinSizeF32: FMF.approxFunc());
646 case AMDGPULibFunc::EI_EXP2:
647 if (FMF.none())
648 return false;
649 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::exp2,
650 AllowMinSizeF32: FMF.approxFunc());
651 case AMDGPULibFunc::EI_LOG:
652 if (FMF.none())
653 return false;
654 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log,
655 AllowMinSizeF32: FMF.approxFunc());
656 case AMDGPULibFunc::EI_LOG2:
657 if (FMF.none())
658 return false;
659 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log2,
660 AllowMinSizeF32: FMF.approxFunc());
661 case AMDGPULibFunc::EI_LOG10:
662 if (FMF.none())
663 return false;
664 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log10,
665 AllowMinSizeF32: FMF.approxFunc());
666 case AMDGPULibFunc::EI_FMIN:
667 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::minnum,
668 AllowMinSizeF32: true, AllowF64: true);
669 case AMDGPULibFunc::EI_FMAX:
670 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::maxnum,
671 AllowMinSizeF32: true, AllowF64: true);
672 case AMDGPULibFunc::EI_FMA:
673 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fma, AllowMinSizeF32: true,
674 AllowF64: true);
675 case AMDGPULibFunc::EI_MAD:
676 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fmuladd,
677 AllowMinSizeF32: true, AllowF64: true);
678 case AMDGPULibFunc::EI_FABS:
679 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fabs, AllowMinSizeF32: true,
680 AllowF64: true, AllowStrictFP: true);
681 case AMDGPULibFunc::EI_COPYSIGN:
682 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::copysign,
683 AllowMinSizeF32: true, AllowF64: true, AllowStrictFP: true);
684 case AMDGPULibFunc::EI_FLOOR:
685 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::floor, AllowMinSizeF32: true,
686 AllowF64: true);
687 case AMDGPULibFunc::EI_CEIL:
688 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::ceil, AllowMinSizeF32: true,
689 AllowF64: true);
690 case AMDGPULibFunc::EI_TRUNC:
691 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::trunc, AllowMinSizeF32: true,
692 AllowF64: true);
693 case AMDGPULibFunc::EI_RINT:
694 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::rint, AllowMinSizeF32: true,
695 AllowF64: true);
696 case AMDGPULibFunc::EI_ROUND:
697 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::round, AllowMinSizeF32: true,
698 AllowF64: true);
699 case AMDGPULibFunc::EI_LDEXP: {
700 if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32: true, AllowF64: true))
701 return false;
702
703 Value *Arg1 = CI->getArgOperand(i: 1);
704 if (VectorType *VecTy = dyn_cast<VectorType>(Val: CI->getType());
705 VecTy && !isa<VectorType>(Val: Arg1->getType())) {
706 Value *SplatArg1 = B.CreateVectorSplat(EC: VecTy->getElementCount(), V: Arg1);
707 CI->setArgOperand(i: 1, v: SplatArg1);
708 }
709
710 CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
711 M: CI->getModule(), id: Intrinsic::ldexp,
712 Tys: {CI->getType(), CI->getArgOperand(i: 1)->getType()}));
713 return true;
714 }
715 case AMDGPULibFunc::EI_POW:
716 case AMDGPULibFunc::EI_POW_FAST:
717 return tryOptimizePow(FPOp, B, FInfo);
718 case AMDGPULibFunc::EI_POWR:
719 case AMDGPULibFunc::EI_POWR_FAST: {
720 if (fold_pow(FPOp, B, FInfo))
721 return true;
722 if (!FMF.approxFunc())
723 return false;
724
725 if (FInfo.getId() == AMDGPULibFunc::EI_POWR && FMF.approxFunc() &&
726 getArgType(FInfo) == AMDGPULibFunc::F32) {
727 Module *M = Callee->getParent();
728 AMDGPULibFunc PowrFastInfo(AMDGPULibFunc::EI_POWR_FAST, FInfo);
729 if (FunctionCallee PowrFastFunc = getFunction(M, fInfo: PowrFastInfo)) {
730 CI->setCalledFunction(PowrFastFunc);
731 return true;
732 }
733 }
734
735 if (!shouldReplaceLibcallWithIntrinsic(CI))
736 return false;
737 return expandFastPow(FPOp, B, Kind: PowKind::PowR);
738 }
739 case AMDGPULibFunc::EI_POWN:
740 case AMDGPULibFunc::EI_POWN_FAST: {
741 if (fold_pow(FPOp, B, FInfo))
742 return true;
743 if (!FMF.approxFunc())
744 return false;
745
746 if (FInfo.getId() == AMDGPULibFunc::EI_POWN &&
747 getArgType(FInfo) == AMDGPULibFunc::F32) {
748 Module *M = Callee->getParent();
749 AMDGPULibFunc PownFastInfo(AMDGPULibFunc::EI_POWN_FAST, FInfo);
750 if (FunctionCallee PownFastFunc = getFunction(M, fInfo: PownFastInfo)) {
751 CI->setCalledFunction(PownFastFunc);
752 return true;
753 }
754 }
755
756 if (!shouldReplaceLibcallWithIntrinsic(CI))
757 return false;
758 return expandFastPow(FPOp, B, Kind: PowKind::PowN);
759 }
760 case AMDGPULibFunc::EI_ROOTN:
761 case AMDGPULibFunc::EI_ROOTN_FAST: {
762 if (fold_rootn(FPOp, B, FInfo))
763 return true;
764 if (!FMF.approxFunc())
765 return false;
766
767 if (getArgType(FInfo) == AMDGPULibFunc::F32) {
768 Module *M = Callee->getParent();
769 AMDGPULibFunc RootnFastInfo(AMDGPULibFunc::EI_ROOTN_FAST, FInfo);
770 if (FunctionCallee RootnFastFunc = getFunction(M, fInfo: RootnFastInfo)) {
771 CI->setCalledFunction(RootnFastFunc);
772 return true;
773 }
774 }
775
776 return expandFastPow(FPOp, B, Kind: PowKind::RootN);
777 }
778 case AMDGPULibFunc::EI_SQRT:
779 // TODO: Allow with strictfp + constrained intrinsic
780 return tryReplaceLibcallWithSimpleIntrinsic(
781 B, CI, IntrID: Intrinsic::sqrt, AllowMinSizeF32: true, AllowF64: true, /*AllowStrictFP=*/false);
782 case AMDGPULibFunc::EI_COS:
783 case AMDGPULibFunc::EI_SIN:
784 return fold_sincos(FPOp, B, FInfo);
785 default:
786 break;
787 }
788 } else {
789 // Specialized optimizations for each function call
790 switch (FInfo.getId()) {
791 case AMDGPULibFunc::EI_READ_PIPE_2:
792 case AMDGPULibFunc::EI_READ_PIPE_4:
793 case AMDGPULibFunc::EI_WRITE_PIPE_2:
794 case AMDGPULibFunc::EI_WRITE_PIPE_4:
795 return fold_read_write_pipe(CI, B, FInfo);
796 default:
797 break;
798 }
799 }
800
801 return false;
802}
803
804static Constant *getConstantFloatVector(const ArrayRef<APFloat> Values,
805 const Type *Ty) {
806 Type *ElemTy = Ty->getScalarType();
807 const fltSemantics &FltSem = ElemTy->getFltSemantics();
808
809 SmallVector<Constant *, 4> ConstValues;
810 ConstValues.reserve(N: Values.size());
811 for (APFloat APF : Values) {
812 bool Unused;
813 APF.convert(ToSemantics: FltSem, RM: APFloat::rmNearestTiesToEven, losesInfo: &Unused);
814 ConstValues.push_back(Elt: ConstantFP::get(Ty: ElemTy, V: APF));
815 }
816 return ConstantVector::get(V: ConstValues);
817}
818
819bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
820 // Table-Driven optimization
821 const TableRef tr = getOptTable(id: FInfo.getId());
822 if (tr.empty())
823 return false;
824
825 int const sz = (int)tr.size();
826 Value *opr0 = CI->getArgOperand(i: 0);
827
828 int vecSize = getVecSize(FInfo);
829 if (vecSize > 1) {
830 // Vector version
831 Constant *CV = dyn_cast<Constant>(Val: opr0);
832 if (CV && CV->getType()->isVectorTy()) {
833 SmallVector<APFloat, 4> Values;
834 Values.reserve(N: vecSize);
835 for (int eltNo = 0; eltNo < vecSize; ++eltNo) {
836 ConstantFP *eltval =
837 cast<ConstantFP>(Val: CV->getAggregateElement(Elt: (unsigned)eltNo));
838 auto MatchingRow = llvm::find_if(Range: tr, P: [eltval](const TableEntry &entry) {
839 return eltval->isExactlyValue(V: entry.input);
840 });
841 if (MatchingRow == tr.end())
842 return false;
843 Values.push_back(Elt: APFloat(MatchingRow->result));
844 }
845 Constant *NewValues = getConstantFloatVector(Values, Ty: CI->getType());
846 LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *NewValues << "\n");
847 replaceCall(I: CI, With: NewValues);
848 return true;
849 }
850 } else {
851 // Scalar version
852 if (ConstantFP *CF = dyn_cast<ConstantFP>(Val: opr0)) {
853 for (int i = 0; i < sz; ++i) {
854 if (CF->isExactlyValue(V: tr[i].input)) {
855 Value *nval = ConstantFP::get(Ty: CF->getType(), V: tr[i].result);
856 LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
857 replaceCall(I: CI, With: nval);
858 return true;
859 }
860 }
861 }
862 }
863
864 return false;
865}
866
namespace llvm {
/// Base-2 logarithm of \p V.
///
/// std::log2 is guaranteed by <cmath> since C++11, so it is used directly
/// instead of selecting between ::log2 and log(V) / ln2 with feature-test
/// macros; the latter path was less accurate on platforms that do not define
/// those macros.
static double log2(double V) {
  return std::log2(V);
}
} // namespace llvm
876
877bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
878 const FuncInfo &FInfo) {
879 assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
880 FInfo.getId() == AMDGPULibFunc::EI_POW_FAST ||
881 FInfo.getId() == AMDGPULibFunc::EI_POWR ||
882 FInfo.getId() == AMDGPULibFunc::EI_POWR_FAST ||
883 FInfo.getId() == AMDGPULibFunc::EI_POWN ||
884 FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) &&
885 "fold_pow: encounter a wrong function call");
886
887 Module *M = B.GetInsertBlock()->getModule();
888 Type *eltType = FPOp->getType()->getScalarType();
889 Value *opr0 = FPOp->getOperand(i: 0);
890 Value *opr1 = FPOp->getOperand(i: 1);
891
892 const APFloat *CF = nullptr;
893 const APInt *CINT = nullptr;
894 if (!match(V: opr1, P: m_APFloatAllowPoison(Res&: CF)))
895 match(V: opr1, P: m_APIntAllowPoison(Res&: CINT));
896
897 // 0x1111111 means that we don't do anything for this call.
898 int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);
899
900 if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
901 // pow/powr/pown(x, 0) == 1
902 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
903 Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0);
904 if (getVecSize(FInfo) > 1) {
905 cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
906 }
907 replaceCall(I: FPOp, With: cnval);
908 return true;
909 }
910 if ((CF && CF->isExactlyValue(V: 1.0)) || (CINT && ci_opr1 == 1)) {
911 // pow/powr/pown(x, 1.0) = x
912 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
913 replaceCall(I: FPOp, With: opr0);
914 return true;
915 }
916 if ((CF && CF->isExactlyValue(V: 2.0)) || (CINT && ci_opr1 == 2)) {
917 // pow/powr/pown(x, 2.0) = x*x
918 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
919 << *opr0 << "\n");
920 Value *nval = B.CreateFMul(L: opr0, R: opr0, Name: "__pow2");
921 replaceCall(I: FPOp, With: nval);
922 return true;
923 }
924 if ((CF && CF->isExactlyValue(V: -1.0)) || (CINT && ci_opr1 == -1)) {
925 // pow/powr/pown(x, -1.0) = 1.0/x
926 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n");
927 Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0);
928 if (getVecSize(FInfo) > 1) {
929 cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
930 }
931 Value *nval = B.CreateFDiv(L: cnval, R: opr0, Name: "__powrecip");
932 replaceCall(I: FPOp, With: nval);
933 return true;
934 }
935
936 if (CF && (CF->isExactlyValue(V: 0.5) || CF->isExactlyValue(V: -0.5))) {
937 // pow[r](x, [-]0.5) = sqrt(x)
938 bool issqrt = CF->isExactlyValue(V: 0.5);
939 if (FunctionCallee FPExpr =
940 getFunction(M, fInfo: AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
941 : AMDGPULibFunc::EI_RSQRT,
942 FInfo))) {
943 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
944 << '(' << *opr0 << ")\n");
945 Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: issqrt ? "__pow2sqrt"
946 : "__pow2rsqrt");
947 replaceCall(I: FPOp, With: nval);
948 return true;
949 }
950 }
951
952 if (!isUnsafeFiniteOnlyMath(FPOp))
953 return false;
954
955 // Unsafe Math optimization
956
957 // Remember that ci_opr1 is set if opr1 is integral
958 if (CF) {
959 double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
960 ? (double)CF->convertToFloat()
961 : CF->convertToDouble();
962 int ival = (int)dval;
963 if ((double)ival == dval) {
964 ci_opr1 = ival;
965 } else
966 ci_opr1 = 0x11111111;
967 }
968
969 // pow/powr/pown(x, c) = [1/](x*x*..x); where
970 // trunc(c) == c && the number of x == c && |c| <= 12
971 unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
972 if (abs_opr1 <= 12) {
973 Constant *cnval;
974 Value *nval;
975 if (abs_opr1 == 0) {
976 cnval = ConstantFP::get(Ty: eltType, V: 1.0);
977 if (getVecSize(FInfo) > 1) {
978 cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
979 }
980 nval = cnval;
981 } else {
982 Value *valx2 = nullptr;
983 nval = nullptr;
984 while (abs_opr1 > 0) {
985 valx2 = valx2 ? B.CreateFMul(L: valx2, R: valx2, Name: "__powx2") : opr0;
986 if (abs_opr1 & 1) {
987 nval = nval ? B.CreateFMul(L: nval, R: valx2, Name: "__powprod") : valx2;
988 }
989 abs_opr1 >>= 1;
990 }
991 }
992
993 if (ci_opr1 < 0) {
994 cnval = ConstantFP::get(Ty: eltType, V: 1.0);
995 if (getVecSize(FInfo) > 1) {
996 cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
997 }
998 nval = B.CreateFDiv(L: cnval, R: nval, Name: "__1powprod");
999 }
1000 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
1001 << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
1002 << ")\n");
1003 replaceCall(I: FPOp, With: nval);
1004 return true;
1005 }
1006
1007 // If we should use the generic intrinsic instead of emitting a libcall
1008 const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();
1009
1010 // powr ---> exp2(y * log2(x))
1011 // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
1012 FunctionCallee ExpExpr;
1013 if (ShouldUseIntrinsic)
1014 ExpExpr = Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::exp2,
1015 Tys: {FPOp->getType()});
1016 else {
1017 ExpExpr = getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
1018 if (!ExpExpr)
1019 return false;
1020 }
1021
1022 bool needlog = false;
1023 bool needabs = false;
1024 bool needcopysign = false;
1025 Constant *cnval = nullptr;
1026 if (getVecSize(FInfo) == 1) {
1027 CF = nullptr;
1028 match(V: opr0, P: m_APFloatAllowPoison(Res&: CF));
1029
1030 if (CF) {
1031 double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
1032 ? (double)CF->convertToFloat()
1033 : CF->convertToDouble();
1034
1035 V = log2(V: std::abs(x: V));
1036 cnval = ConstantFP::get(Ty: eltType, V);
1037 needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR &&
1038 FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST) &&
1039 CF->isNegative();
1040 } else {
1041 needlog = true;
1042 needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
1043 FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
1044 }
1045 } else {
1046 ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(Val: opr0);
1047
1048 if (!CDV) {
1049 needlog = true;
1050 needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
1051 FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
1052 } else {
1053 assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
1054 "Wrong vector size detected");
1055
1056 SmallVector<double, 0> DVal;
1057 for (int i=0; i < getVecSize(FInfo); ++i) {
1058 double V = CDV->getElementAsAPFloat(i).convertToDouble();
1059 if (V < 0.0) needcopysign = true;
1060 V = log2(V: std::abs(x: V));
1061 DVal.push_back(Elt: V);
1062 }
1063 if (getArgType(FInfo) == AMDGPULibFunc::F32) {
1064 SmallVector<float, 0> FVal;
1065 for (double D : DVal)
1066 FVal.push_back(Elt: (float)D);
1067 ArrayRef<float> tmp(FVal);
1068 cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
1069 } else {
1070 ArrayRef<double> tmp(DVal);
1071 cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
1072 }
1073 }
1074 }
1075
1076 if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW ||
1077 FInfo.getId() == AMDGPULibFunc::EI_POW_FAST)) {
1078 // We cannot handle corner cases for a general pow() function, give up
1079 // unless y is a constant integral value. Then proceed as if it were pown.
1080 if (!isKnownIntegral(V: opr1, SQ: SQ.getWithInstruction(I: cast<Instruction>(Val: FPOp)),
1081 FMF: FPOp->getFastMathFlags()))
1082 return false;
1083 }
1084
1085 Value *nval;
1086 if (needabs) {
1087 nval = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: opr0, FMFSource: nullptr, Name: "__fabs");
1088 } else {
1089 nval = cnval ? cnval : opr0;
1090 }
1091 if (needlog) {
1092 FunctionCallee LogExpr;
1093 if (ShouldUseIntrinsic) {
1094 LogExpr = Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::log2,
1095 Tys: {FPOp->getType()});
1096 } else {
1097 LogExpr = getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
1098 if (!LogExpr)
1099 return false;
1100 }
1101
1102 nval = CreateCallEx(B,Callee: LogExpr, Arg: nval, Name: "__log2");
1103 }
1104
1105 if (FInfo.getId() == AMDGPULibFunc::EI_POWN ||
1106 FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) {
1107 // convert int(32) to fp(f32 or f64)
1108 opr1 = B.CreateSIToFP(V: opr1, DestTy: nval->getType(), Name: "pownI2F");
1109 }
1110 nval = B.CreateFMul(L: opr1, R: nval, Name: "__ylogx");
1111
1112 CallInst *Exp2Call = CreateCallEx(B, Callee: ExpExpr, Arg: nval, Name: "__exp2");
1113
1114 // TODO: Generalized fpclass logic for pow
1115 FPClassTest KnownNot = FPClassTest::fcNegative;
1116 if (FPOp->hasNoNaNs())
1117 KnownNot |= FPClassTest::fcNan;
1118
1119 Exp2Call->addRetAttr(
1120 Attr: Attribute::getWithNoFPClass(Context&: Exp2Call->getContext(), Mask: KnownNot));
1121 nval = Exp2Call;
1122
1123 if (needcopysign) {
1124 Type* nTyS = B.getIntNTy(N: eltType->getPrimitiveSizeInBits());
1125 Type *nTy = FPOp->getType()->getWithNewType(EltTy: nTyS);
1126 Value *opr_n = FPOp->getOperand(i: 1);
1127 if (opr_n->getType()->getScalarType()->isIntegerTy())
1128 opr_n = B.CreateZExtOrTrunc(V: opr_n, DestTy: nTy, Name: "__ytou");
1129 else
1130 opr_n = B.CreateFPToSI(V: opr1, DestTy: nTy, Name: "__ytou");
1131
1132 unsigned size = nTy->getScalarSizeInBits();
1133 Value *sign = B.CreateShl(LHS: opr_n, RHS: size-1, Name: "__yeven");
1134 sign = B.CreateAnd(LHS: B.CreateBitCast(V: opr0, DestTy: nTy), RHS: sign, Name: "__pow_sign");
1135
1136 nval = B.CreateCopySign(LHS: nval, RHS: B.CreateBitCast(V: sign, DestTy: nval->getType()),
1137 FMFSource: nullptr, Name: "__pow_sign");
1138 }
1139
1140 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
1141 << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
1142 replaceCall(I: FPOp, With: nval);
1143
1144 return true;
1145}
1146
1147bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
1148 const FuncInfo &FInfo) {
1149 Value *opr0 = FPOp->getOperand(i: 0);
1150 Value *opr1 = FPOp->getOperand(i: 1);
1151
1152 const APInt *CINT = nullptr;
1153 if (!match(V: opr1, P: m_APIntAllowPoison(Res&: CINT)))
1154 return false;
1155
1156 Function *Parent = B.GetInsertBlock()->getParent();
1157
1158 int ci_opr1 = (int)CINT->getSExtValue();
1159 if (ci_opr1 == 1 && !Parent->hasFnAttribute(Kind: Attribute::StrictFP)) {
1160 // rootn(x, 1) = x
1161 //
1162 // TODO: Insert constrained canonicalize for strictfp case.
1163 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << '\n');
1164 replaceCall(I: FPOp, With: opr0);
1165 return true;
1166 }
1167
1168 Module *M = B.GetInsertBlock()->getModule();
1169
1170 CallInst *CI = cast<CallInst>(Val: FPOp);
1171 if (ci_opr1 == 2 &&
1172 shouldReplaceLibcallWithIntrinsic(CI,
1173 /*AllowMinSizeF32=*/true,
1174 /*AllowF64=*/true)) {
1175 // rootn(x, 2) = sqrt(x)
1176 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0 << ")\n");
1177
1178 CallInst *NewCall = B.CreateUnaryIntrinsic(ID: Intrinsic::sqrt, V: opr0, FMFSource: CI);
1179 NewCall->takeName(V: CI);
1180
1181 // OpenCL rootn has a looser ulp of 2 requirement than sqrt, so add some
1182 // metadata.
1183 MDBuilder MDHelper(M->getContext());
1184 MDNode *FPMD = MDHelper.createFPMath(Accuracy: std::max(a: FPOp->getFPAccuracy(), b: 2.0f));
1185 NewCall->setMetadata(KindID: LLVMContext::MD_fpmath, Node: FPMD);
1186
1187 replaceCall(I: CI, With: NewCall);
1188 return true;
1189 }
1190
1191 if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
1192 if (FunctionCallee FPExpr =
1193 getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
1194 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0
1195 << ")\n");
1196 Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: "__rootn2cbrt");
1197 replaceCall(I: FPOp, With: nval);
1198 return true;
1199 }
1200 } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
1201 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n");
1202 Value *nval = B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: 1.0),
1203 R: opr0,
1204 Name: "__rootn2div");
1205 replaceCall(I: FPOp, With: nval);
1206 return true;
1207 }
1208
1209 if (ci_opr1 == -2 &&
1210 shouldReplaceLibcallWithIntrinsic(CI,
1211 /*AllowMinSizeF32=*/true,
1212 /*AllowF64=*/true)) {
1213 // rootn(x, -2) = rsqrt(x)
1214
1215 // The original rootn had looser ulp requirements than the resultant sqrt
1216 // and fdiv.
1217 MDBuilder MDHelper(M->getContext());
1218 MDNode *FPMD = MDHelper.createFPMath(Accuracy: std::max(a: FPOp->getFPAccuracy(), b: 2.0f));
1219
1220 // TODO: Could handle strictfp but need to fix strict sqrt emission
1221 FastMathFlags FMF = FPOp->getFastMathFlags();
1222 FMF.setAllowContract(true);
1223
1224 CallInst *Sqrt = B.CreateUnaryIntrinsic(ID: Intrinsic::sqrt, V: opr0, FMFSource: CI);
1225 Instruction *RSqrt = cast<Instruction>(
1226 Val: B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: 1.0), R: Sqrt));
1227 Sqrt->setFastMathFlags(FMF);
1228 RSqrt->setFastMathFlags(FMF);
1229 RSqrt->setMetadata(KindID: LLVMContext::MD_fpmath, Node: FPMD);
1230
1231 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
1232 << ")\n");
1233 replaceCall(I: CI, With: RSqrt);
1234 return true;
1235 }
1236
1237 return false;
1238}
1239
1240// is_integer(y) => trunc(y) == y
1241static Value *emitIsInteger(IRBuilder<> &B, Value *Y) {
1242 Value *TruncY = B.CreateUnaryIntrinsic(ID: Intrinsic::trunc, V: Y);
1243 return B.CreateFCmpOEQ(LHS: TruncY, RHS: Y);
1244}
1245
1246static Value *emitIsEvenInteger(IRBuilder<> &B, Value *Y) {
1247 // Even integers are still integers after division by 2.
1248 auto *HalfY = B.CreateFMul(L: Y, R: ConstantFP::get(Ty: Y->getType(), V: 0.5));
1249 return emitIsInteger(B, Y: HalfY);
1250}
1251
1252// is_odd_integer(y) => is_integer(y) && !is_even_integer(y)
1253static Value *emitIsOddInteger(IRBuilder<> &B, Value *Y) {
1254 Value *IsIntY = emitIsInteger(B, Y);
1255 Value *IsEvenY = emitIsEvenInteger(B, Y);
1256 Value *NotEvenY = B.CreateNot(V: IsEvenY);
1257 return B.CreateAnd(LHS: IsIntY, RHS: NotEvenY);
1258}
1259
1260// isinf(val) => fabs(val) == +inf
1261static Value *emitIsInf(IRBuilder<> &B, Value *val) {
1262 auto *fabsVal = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: val);
1263 return B.CreateFCmpOEQ(LHS: fabsVal, RHS: ConstantFP::getInfinity(Ty: val->getType()));
1264}
1265
1266// y * log2(fabs(x))
1267static Value *emitFastExpYLnx(IRBuilder<> &B, Value *X, Value *Y) {
1268 Value *AbsX = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: X);
1269 Value *LogAbsX = B.CreateUnaryIntrinsic(ID: Intrinsic::log2, V: AbsX);
1270 Value *YTimesLogX = B.CreateFMul(L: Y, R: LogAbsX);
1271 return B.CreateUnaryIntrinsic(ID: Intrinsic::exp2, V: YTimesLogX);
1272}
1273
1274/// Emit special case management epilog code for fast pow, powr, pown, and rootn
1275/// expansions. \p x and \p y should be the arguments to the library call
1276/// (possibly with some values clamped). \p expylnx should be the result to use
1277/// in normal circumstances.
1278static Value *emitPowFixup(IRBuilder<> &B, Value *X, Value *Y, Value *ExpYLnX,
1279 PowKind Kind) {
1280 Constant *Zero = ConstantFP::getZero(Ty: X->getType());
1281 Constant *One = ConstantFP::get(Ty: X->getType(), V: 1.0);
1282 Constant *QNaN = ConstantFP::getQNaN(Ty: X->getType());
1283 Constant *PInf = ConstantFP::getInfinity(Ty: X->getType());
1284
1285 switch (Kind) {
1286 case PowKind::Pow: {
1287 // is_odd_integer(y)
1288 Value *IsOddY = emitIsOddInteger(B, Y);
1289
1290 // ret = copysign(expylnx, is_odd_y ? x : 1.0f)
1291 Value *SelSign = B.CreateSelect(C: IsOddY, True: X, False: One);
1292 Value *Ret = B.CreateCopySign(LHS: ExpYLnX, RHS: SelSign);
1293
1294 // if (x < 0 && !is_integer(y)) ret = QNAN
1295 Value *IsIntY = emitIsInteger(B, Y);
1296 Value *condNegX = B.CreateFCmpOLT(LHS: X, RHS: Zero);
1297 Value *condNotIntY = B.CreateNot(V: IsIntY);
1298 Value *condNaN = B.CreateAnd(LHS: condNegX, RHS: condNotIntY);
1299 Ret = B.CreateSelect(C: condNaN, True: QNaN, False: Ret);
1300
1301 // if (isinf(ay)) { ... }
1302
1303 // FIXME: Missing backend optimization to save on materialization cost of
1304 // mixed sign constant infinities.
1305 Value *YIsInf = emitIsInf(B, val: Y);
1306
1307 Value *AY = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: Y);
1308 Value *YIsNegInf = B.CreateFCmpUNE(LHS: Y, RHS: AY);
1309
1310 Value *AX = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: X);
1311 Value *AxEqOne = B.CreateFCmpOEQ(LHS: AX, RHS: One);
1312 Value *AxLtOne = B.CreateFCmpOLT(LHS: AX, RHS: One);
1313 Value *XorCond = B.CreateXor(LHS: AxLtOne, RHS: YIsNegInf);
1314 Value *SelInf =
1315 B.CreateSelect(C: AxEqOne, True: AX, False: B.CreateSelect(C: XorCond, True: Zero, False: AY));
1316 Ret = B.CreateSelect(C: YIsInf, True: SelInf, False: Ret);
1317
1318 // if (isinf(ax) || x == 0.0f) { ... }
1319 Value *XIsInf = emitIsInf(B, val: X);
1320 Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
1321 Value *AxInfOrZero = B.CreateOr(LHS: XIsInf, RHS: XEqZero);
1322 Value *YLtZero = B.CreateFCmpOLT(LHS: Y, RHS: Zero);
1323 Value *XorZeroInf = B.CreateXor(LHS: XEqZero, RHS: YLtZero);
1324 Value *SelVal = B.CreateSelect(C: XorZeroInf, True: Zero, False: PInf);
1325 Value *SelSign2 = B.CreateSelect(C: IsOddY, True: X, False: Zero);
1326 Value *Copysign = B.CreateCopySign(LHS: SelVal, RHS: SelSign2);
1327 Ret = B.CreateSelect(C: AxInfOrZero, True: Copysign, False: Ret);
1328
1329 // if (isunordered(x, y)) ret = QNAN
1330 Value *isUnordered = B.CreateFCmpUNO(LHS: X, RHS: Y);
1331 return B.CreateSelect(C: isUnordered, True: QNaN, False: Ret);
1332 }
1333 case PowKind::PowR: {
1334 Value *YIsNeg = B.CreateFCmpOLT(LHS: Y, RHS: Zero);
1335 Value *IZ = B.CreateSelect(C: YIsNeg, True: PInf, False: Zero);
1336 Value *ZI = B.CreateSelect(C: YIsNeg, True: Zero, False: PInf);
1337
1338 Value *YEqZero = B.CreateFCmpOEQ(LHS: Y, RHS: Zero);
1339 Value *SelZeroCase = B.CreateSelect(C: YEqZero, True: QNaN, False: IZ);
1340 Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
1341 Value *Ret = B.CreateSelect(C: XEqZero, True: SelZeroCase, False: ExpYLnX);
1342
1343 Value *XEqInf = B.CreateFCmpOEQ(LHS: X, RHS: PInf);
1344 Value *YNeZero = B.CreateFCmpUNE(LHS: Y, RHS: Zero);
1345 Value *CondInfCase = B.CreateAnd(LHS: XEqInf, RHS: YNeZero);
1346 Ret = B.CreateSelect(C: CondInfCase, True: ZI, False: Ret);
1347
1348 Value *IsInfY = emitIsInf(B, val: Y);
1349 Value *XNeOne = B.CreateFCmpUNE(LHS: X, RHS: One);
1350 Value *CondInfY = B.CreateAnd(LHS: IsInfY, RHS: XNeOne);
1351 Value *XLtOne = B.CreateFCmpOLT(LHS: X, RHS: One);
1352 Value *SelInfYCase = B.CreateSelect(C: XLtOne, True: IZ, False: ZI);
1353 Ret = B.CreateSelect(C: CondInfY, True: SelInfYCase, False: Ret);
1354
1355 Value *IsUnordered = B.CreateFCmpUNO(LHS: X, RHS: Y);
1356 return B.CreateSelect(C: IsUnordered, True: QNaN, False: Ret);
1357 }
1358 case PowKind::PowN: {
1359 Constant *ZeroI = ConstantInt::get(Ty: Y->getType(), V: 0);
1360
1361 // is_odd_y = (ny & 1) != 0
1362 Value *OneI = ConstantInt::get(Ty: Y->getType(), V: 1);
1363 Value *YAnd1 = B.CreateAnd(LHS: Y, RHS: OneI);
1364 Value *IsOddY = B.CreateICmpNE(LHS: YAnd1, RHS: ZeroI);
1365
1366 // ret = copysign(expylnx, is_odd_y ? x : 1.0f)
1367 Value *SelSign = B.CreateSelect(C: IsOddY, True: X, False: One);
1368 Value *Ret = B.CreateCopySign(LHS: ExpYLnX, RHS: SelSign);
1369
1370 // if (isinf(x) || x == 0.0f)
1371 Value *FabsX = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: X);
1372 Value *XIsInf = B.CreateFCmpOEQ(LHS: FabsX, RHS: PInf);
1373 Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
1374 Value *InfOrZero = B.CreateOr(LHS: XIsInf, RHS: XEqZero);
1375
1376 // (x == 0.0f) ^ (ny < 0) ? 0.0f : +inf
1377 Value *YLtZero = B.CreateICmpSLT(LHS: Y, RHS: ZeroI);
1378 Value *XorZeroInf = B.CreateXor(LHS: XEqZero, RHS: YLtZero);
1379 Value *SelVal = B.CreateSelect(C: XorZeroInf, True: Zero, False: PInf);
1380
1381 // copysign(selVal, is_odd_y ? x : 0.0f)
1382 Value *SelSign2 = B.CreateSelect(C: IsOddY, True: X, False: Zero);
1383 Value *Copysign = B.CreateCopySign(LHS: SelVal, RHS: SelSign2);
1384
1385 return B.CreateSelect(C: InfOrZero, True: Copysign, False: Ret);
1386 }
1387 case PowKind::RootN: {
1388 Constant *ZeroI = ConstantInt::get(Ty: Y->getType(), V: 0);
1389
1390 // is_odd_y = (ny & 1) != 0
1391 Value *YAnd1 = B.CreateAnd(LHS: Y, RHS: ConstantInt::get(Ty: Y->getType(), V: 1));
1392 Value *IsOddY = B.CreateICmpNE(LHS: YAnd1, RHS: ZeroI);
1393
1394 // ret = copysign(expylnx, is_odd_y ? x : 1.0f)
1395 Value *SelSign = B.CreateSelect(C: IsOddY, True: X, False: One);
1396 Value *Ret = B.CreateCopySign(LHS: ExpYLnX, RHS: SelSign);
1397
1398 // if (isinf(x) || x == 0.0f)
1399 Value *FabsX = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: X);
1400 Value *IsInfX = B.CreateFCmpOEQ(LHS: FabsX, RHS: PInf);
1401 Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
1402 Value *CondInfOrZero = B.CreateOr(LHS: IsInfX, RHS: XEqZero);
1403
1404 // (x == 0.0f) ^ (ny < 0) ? 0.0f : +inf
1405 Value *YLtZero = B.CreateICmpSLT(LHS: Y, RHS: ZeroI);
1406 Value *XorZeroInf = B.CreateXor(LHS: XEqZero, RHS: YLtZero);
1407 Value *SelVal = B.CreateSelect(C: XorZeroInf, True: Zero, False: PInf);
1408
1409 // copysign(selVal, is_odd_y ? x : 0.0f)
1410 Value *SelSign2 = B.CreateSelect(C: IsOddY, True: X, False: Zero);
1411 Value *Copysign = B.CreateCopySign(LHS: SelVal, RHS: SelSign2);
1412
1413 Ret = B.CreateSelect(C: CondInfOrZero, True: Copysign, False: Ret);
1414
1415 // if ((x < 0.0f && !is_odd_y) || ny == 0) ret = QNAN
1416 Value *XIsNeg = B.CreateFCmpOLT(LHS: X, RHS: Zero);
1417 Value *NotOddY = B.CreateNot(V: IsOddY);
1418 Value *CondNegAndNotOdd = B.CreateAnd(LHS: XIsNeg, RHS: NotOddY);
1419 Value *YEqZero = B.CreateICmpEQ(LHS: Y, RHS: ZeroI);
1420 Value *CondBad = B.CreateOr(LHS: CondNegAndNotOdd, RHS: YEqZero);
1421 return B.CreateSelect(C: CondBad, True: QNaN, False: Ret);
1422 }
1423 }
1424
1425 llvm_unreachable("covered switch");
1426}
1427
1428// TODO: Move the fold_pow folding to sqrt/fdiv here
1429bool AMDGPULibCalls::expandFastPow(FPMathOperator *FPOp, IRBuilder<> &B,
1430 PowKind Kind) {
1431 Type *Ty = FPOp->getType();
1432
1433 // There's currently no reason to do this for half. The correct path is
1434 // promote to float and use the fast float expansion.
1435 //
1436 // TODO: We could move this expansion to lowering to get half pow to work.
1437 if (!Ty->getScalarType()->isFloatTy())
1438 return false;
1439
1440 // TODO: Verify optimization for double and bfloat.
1441 Value *X = FPOp->getOperand(i: 0);
1442 Value *Y = FPOp->getOperand(i: 1);
1443
1444 switch (Kind) {
1445 case PowKind::Pow: {
1446 Constant *One = ConstantFP::get(Ty: X->getType(), V: 1.0);
1447
1448 // if (x == 1.0f) y = 1.0f;
1449 Value *XEqOne = B.CreateFCmpOEQ(LHS: X, RHS: One);
1450 Y = B.CreateSelect(C: XEqOne, True: One, False: Y);
1451
1452 // if (y == 0.0f) x = 1.0f;
1453 Value *YEqZero = B.CreateFCmpOEQ(LHS: Y, RHS: ConstantFP::getZero(Ty: X->getType()));
1454 X = B.CreateSelect(C: YEqZero, True: One, False: X);
1455
1456 Value *ExpYLnX = emitFastExpYLnx(B, X, Y);
1457 Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
1458 replaceCall(I: FPOp, With: Fixed);
1459 return true;
1460 }
1461 case PowKind::PowR: {
1462 Value *NegX = B.CreateFCmpOLT(LHS: X, RHS: ConstantFP::getZero(Ty: X->getType()));
1463 X = B.CreateSelect(C: NegX, True: ConstantFP::getQNaN(Ty: X->getType()), False: X);
1464
1465 Value *ExpYLnX = emitFastExpYLnx(B, X, Y);
1466 Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
1467 replaceCall(I: FPOp, With: Fixed);
1468 return true;
1469 }
1470 case PowKind::PowN: {
1471 // ny == 0
1472 Value *YEqZero = B.CreateICmpEQ(LHS: Y, RHS: ConstantInt::get(Ty: Y->getType(), V: 0));
1473
1474 // x = (ny == 0 ? 1.0f : x)
1475 X = B.CreateSelect(C: YEqZero, True: ConstantFP::get(Ty: X->getType(), V: 1.0), False: X);
1476
1477 Value *CastY = B.CreateSIToFP(V: Y, DestTy: X->getType());
1478 Value *ExpYLnX = emitFastExpYLnx(B, X, Y: CastY);
1479 Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
1480 replaceCall(I: FPOp, With: Fixed);
1481 return true;
1482 }
1483 case PowKind::RootN: {
1484 Value *CastY = B.CreateSIToFP(V: Y, DestTy: X->getType());
1485
1486 // This is afn anyway, so we will turn into rcp.
1487 Value *RcpY = B.CreateFDiv(L: ConstantFP::get(Ty: X->getType(), V: 1.0), R: CastY);
1488
1489 Value *ExpYLnX = emitFastExpYLnx(B, X, Y: RcpY);
1490 Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
1491 replaceCall(I: FPOp, With: Fixed);
1492 return true;
1493 }
1494 }
1495 llvm_unreachable("Unhandled PowKind enum");
1496}
1497
1498bool AMDGPULibCalls::tryOptimizePow(FPMathOperator *FPOp, IRBuilder<> &B,
1499 const FuncInfo &FInfo) {
1500 FastMathFlags FMF = FPOp->getFastMathFlags();
1501 CallInst *Call = cast<CallInst>(Val: FPOp);
1502 Module *M = Call->getModule();
1503
1504 FuncInfo PowrInfo;
1505 AMDGPULibFunc::EFuncId FastPowrFuncId =
1506 FMF.approxFunc() || FInfo.getId() == AMDGPULibFunc::EI_POW_FAST
1507 ? AMDGPULibFunc::EI_POWR_FAST
1508 : AMDGPULibFunc::EI_NONE;
1509 FunctionCallee PowrFunc = getFloatFastVariant(
1510 M, fInfo: FInfo, newInfo&: PowrInfo, NewFunc: AMDGPULibFunc::EI_POWR, FastVariant: FastPowrFuncId);
1511
1512 // TODO: Prefer fast pown to fast powr, but slow powr to slow pown.
1513
1514 // pow(x, y) -> powr(x, y) for x >= -0.0
1515 // TODO: Account for flags on current call
1516 if (PowrFunc && cannotBeOrderedLessThanZero(V: FPOp->getOperand(i: 0),
1517 SQ: SQ.getWithInstruction(I: Call))) {
1518 Call->setCalledFunction(PowrFunc);
1519 return fold_pow(FPOp, B, FInfo: PowrInfo) || true;
1520 }
1521
1522 // pow(x, y) -> pown(x, y) for known integral y
1523 if (isKnownIntegral(V: FPOp->getOperand(i: 1), SQ: SQ.getWithInstruction(I: Call),
1524 FMF: FPOp->getFastMathFlags())) {
1525 FunctionType *PownType = getPownType(FT: Call->getFunctionType());
1526
1527 FuncInfo PownInfo;
1528 AMDGPULibFunc::EFuncId FastPownFuncId =
1529 FMF.approxFunc() || FInfo.getId() == AMDGPULibFunc::EI_POW_FAST
1530 ? AMDGPULibFunc::EI_POWN_FAST
1531 : AMDGPULibFunc::EI_NONE;
1532 FunctionCallee PownFunc = getFloatFastVariant(
1533 M, fInfo: FInfo, newInfo&: PownInfo, NewFunc: AMDGPULibFunc::EI_POWN, FastVariant: FastPownFuncId);
1534
1535 if (PownFunc) {
1536 // TODO: If the incoming integral value is an sitofp/uitofp, it won't
1537 // fold out without a known range. We can probably take the source
1538 // value directly.
1539 Value *CastedArg =
1540 B.CreateFPToSI(V: FPOp->getOperand(i: 1), DestTy: PownType->getParamType(i: 1));
1541 // Have to drop any nofpclass attributes on the original call site.
1542 Call->removeParamAttrs(
1543 ArgNo: 1, AttrsToRemove: AttributeFuncs::typeIncompatible(Ty: CastedArg->getType(),
1544 AS: Call->getParamAttributes(ArgNo: 1)));
1545 Call->setCalledFunction(PownFunc);
1546 Call->setArgOperand(i: 1, v: CastedArg);
1547 return fold_pow(FPOp, B, FInfo: PownInfo) || true;
1548 }
1549 }
1550
1551 if (fold_pow(FPOp, B, FInfo))
1552 return true;
1553
1554 if (!FMF.approxFunc())
1555 return false;
1556
1557 if (FInfo.getId() == AMDGPULibFunc::EI_POW && FMF.approxFunc() &&
1558 getArgType(FInfo) == AMDGPULibFunc::F32) {
1559 AMDGPULibFunc PowFastInfo(AMDGPULibFunc::EI_POW_FAST, FInfo);
1560 if (FunctionCallee PowFastFunc = getFunction(M, fInfo: PowFastInfo)) {
1561 Call->setCalledFunction(PowFastFunc);
1562 return fold_pow(FPOp, B, FInfo: PowFastInfo) || true;
1563 }
1564 }
1565
1566 return expandFastPow(FPOp, B, Kind: PowKind::Pow);
1567}
1568
1569// Get a scalar native builtin single argument FP function
1570FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
1571 const FuncInfo &FInfo) {
1572 if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(id: FInfo.getId()))
1573 return nullptr;
1574 FuncInfo nf = FInfo;
1575 nf.setPrefix(AMDGPULibFunc::NATIVE);
1576 return getFunction(M, fInfo: nf);
1577}
1578
1579// Some library calls are just wrappers around llvm intrinsics, but compiled
1580// conservatively. Preserve the flags from the original call site by
1581// substituting them with direct calls with all the flags.
1582bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
1583 bool AllowMinSizeF32,
1584 bool AllowF64,
1585 bool AllowStrictFP) {
1586 Type *FltTy = CI->getType()->getScalarType();
1587 const bool IsF32 = FltTy->isFloatTy();
1588
1589 // f64 intrinsics aren't implemented for most operations.
1590 if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy()))
1591 return false;
1592
1593 // We're implicitly inlining by replacing the libcall with the intrinsic, so
1594 // don't do it for noinline call sites.
1595 if (CI->isNoInline())
1596 return false;
1597
1598 const Function *ParentF = CI->getFunction();
1599 // TODO: Handle strictfp
1600 if (!AllowStrictFP && ParentF->hasFnAttribute(Kind: Attribute::StrictFP))
1601 return false;
1602
1603 if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
1604 return false;
1605 return true;
1606}
1607
1608void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
1609 CallInst *CI,
1610 Intrinsic::ID IntrID) {
1611 if (CI->arg_size() == 2) {
1612 Value *Arg0 = CI->getArgOperand(i: 0);
1613 Value *Arg1 = CI->getArgOperand(i: 1);
1614 VectorType *Arg0VecTy = dyn_cast<VectorType>(Val: Arg0->getType());
1615 VectorType *Arg1VecTy = dyn_cast<VectorType>(Val: Arg1->getType());
1616 if (Arg0VecTy && !Arg1VecTy) {
1617 Value *SplatRHS = B.CreateVectorSplat(EC: Arg0VecTy->getElementCount(), V: Arg1);
1618 CI->setArgOperand(i: 1, v: SplatRHS);
1619 } else if (!Arg0VecTy && Arg1VecTy) {
1620 Value *SplatLHS = B.CreateVectorSplat(EC: Arg1VecTy->getElementCount(), V: Arg0);
1621 CI->setArgOperand(i: 0, v: SplatLHS);
1622 }
1623 }
1624
1625 CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
1626 M: CI->getModule(), id: IntrID, Tys: {CI->getType()}));
1627}
1628
1629bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
1630 IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32,
1631 bool AllowF64, bool AllowStrictFP) {
1632 if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64,
1633 AllowStrictFP))
1634 return false;
1635 replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
1636 return true;
1637}
1638
1639std::tuple<Value *, Value *, Value *>
1640AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
1641 FunctionCallee Fsincos) {
1642 DebugLoc DL = B.getCurrentDebugLocation();
1643 Function *F = B.GetInsertBlock()->getParent();
1644 B.SetInsertPointPastAllocas(F);
1645
1646 AllocaInst *Alloc = B.CreateAlloca(Ty: Arg->getType(), ArraySize: nullptr, Name: "__sincos_");
1647
1648 if (Instruction *ArgInst = dyn_cast<Instruction>(Val: Arg)) {
1649 // If the argument is an instruction, it must dominate all uses so put our
1650 // sincos call there. Otherwise, right after the allocas works well enough
1651 // if it's an argument or constant.
1652
1653 B.SetInsertPoint(TheBB: ArgInst->getParent(), IP: ++ArgInst->getIterator());
1654
1655 // SetInsertPoint unwelcomely always tries to set the debug loc.
1656 B.SetCurrentDebugLocation(DL);
1657 }
1658
1659 Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(i: 1);
1660
1661 // The allocaInst allocates the memory in private address space. This need
1662 // to be addrspacecasted to point to the address space of cos pointer type.
1663 // In OpenCL 2.0 this is generic, while in 1.2 that is private.
1664 Value *CastAlloc = B.CreateAddrSpaceCast(V: Alloc, DestTy: CosPtrTy);
1665
1666 CallInst *SinCos = CreateCallEx2(B, Callee: Fsincos, Arg1: Arg, Arg2: CastAlloc);
1667
1668 // TODO: Is it worth trying to preserve the location for the cos calls for the
1669 // load?
1670
1671 LoadInst *LoadCos = B.CreateLoad(Ty: Arg->getType(), Ptr: Alloc);
1672 return {SinCos, LoadCos, SinCos};
1673}
1674
1675// fold sin, cos -> sincos.
1676bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
1677 const FuncInfo &fInfo) {
1678 assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
1679 fInfo.getId() == AMDGPULibFunc::EI_COS);
1680
1681 if ((getArgType(FInfo: fInfo) != AMDGPULibFunc::F32 &&
1682 getArgType(FInfo: fInfo) != AMDGPULibFunc::F64) ||
1683 fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
1684 return false;
1685
1686 bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;
1687
1688 Value *CArgVal = FPOp->getOperand(i: 0);
1689
1690 // TODO: Constant fold the call
1691 if (isa<ConstantData>(Val: CArgVal))
1692 return false;
1693
1694 CallInst *CI = cast<CallInst>(Val: FPOp);
1695
1696 Function *F = B.GetInsertBlock()->getParent();
1697 Module *M = F->getParent();
1698
1699 // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
1700 // implementation. Prefer the private form if available.
1701 AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
1702 SinCosLibFuncPrivate.getLeads()[0].PtrKind =
1703 AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::PRIVATE_ADDRESS);
1704
1705 AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
1706 SinCosLibFuncGeneric.getLeads()[0].PtrKind =
1707 AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::FLAT_ADDRESS);
1708
1709 FunctionCallee FSinCosPrivate = getFunction(M, fInfo: SinCosLibFuncPrivate);
1710 FunctionCallee FSinCosGeneric = getFunction(M, fInfo: SinCosLibFuncGeneric);
1711 FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
1712 if (!FSinCos)
1713 return false;
1714
1715 SmallVector<CallInst *> SinCalls;
1716 SmallVector<CallInst *> CosCalls;
1717 SmallVector<CallInst *> SinCosCalls;
1718 FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
1719 fInfo);
1720 const std::string PairName = PartnerInfo.mangle();
1721
1722 StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
1723 StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
1724 const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
1725 const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();
1726
1727 // Intersect the two sets of flags.
1728 FastMathFlags FMF = FPOp->getFastMathFlags();
1729 MDNode *FPMath = CI->getMetadata(KindID: LLVMContext::MD_fpmath);
1730
1731 SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};
1732
1733 for (User* U : CArgVal->users()) {
1734 CallInst *XI = dyn_cast<CallInst>(Val: U);
1735 if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
1736 continue;
1737
1738 Function *UCallee = XI->getCalledFunction();
1739 if (!UCallee)
1740 continue;
1741
1742 bool Handled = true;
1743
1744 if (UCallee->getName() == SinName)
1745 SinCalls.push_back(Elt: XI);
1746 else if (UCallee->getName() == CosName)
1747 CosCalls.push_back(Elt: XI);
1748 else if (UCallee->getName() == SinCosPrivateName ||
1749 UCallee->getName() == SinCosGenericName)
1750 SinCosCalls.push_back(Elt: XI);
1751 else
1752 Handled = false;
1753
1754 if (Handled) {
1755 MergeDbgLocs.push_back(Elt: XI->getDebugLoc());
1756 auto *OtherOp = cast<FPMathOperator>(Val: XI);
1757 FMF &= OtherOp->getFastMathFlags();
1758 FPMath = MDNode::getMostGenericFPMath(
1759 A: FPMath, B: XI->getMetadata(KindID: LLVMContext::MD_fpmath));
1760 }
1761 }
1762
1763 if (SinCalls.empty() || CosCalls.empty())
1764 return false;
1765
1766 B.setFastMathFlags(FMF);
1767 B.setDefaultFPMathTag(FPMath);
1768 DILocation *DbgLoc = DILocation::getMergedLocations(Locs: MergeDbgLocs);
1769 B.SetCurrentDebugLocation(DbgLoc);
1770
1771 auto [Sin, Cos, SinCos] = insertSinCos(Arg: CArgVal, FMF, B, Fsincos: FSinCos);
1772
1773 auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
1774 for (CallInst *C : Calls)
1775 C->replaceAllUsesWith(V: Res);
1776
1777 // Leave the other dead instructions to avoid clobbering iterators.
1778 };
1779
1780 replaceTrigInsts(SinCalls, Sin);
1781 replaceTrigInsts(CosCalls, Cos);
1782 replaceTrigInsts(SinCosCalls, SinCos);
1783
1784 // It's safe to delete the original now.
1785 CI->eraseFromParent();
1786 return true;
1787}
1788
1789bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
1790 APFloat &Res0, APFloat &Res1,
1791 Constant *copr0, Constant *copr1) {
1792 // By default, opr0/opr1/opr3 holds values of float/double type.
1793 // If they are not float/double, each function has to its
1794 // operand separately.
1795 double opr0 = 0.0, opr1 = 0.0;
1796 ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(Val: copr0);
1797 ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(Val: copr1);
1798 if (fpopr0) {
1799 opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
1800 ? fpopr0->getValueAPF().convertToDouble()
1801 : (double)fpopr0->getValueAPF().convertToFloat();
1802 }
1803
1804 if (fpopr1) {
1805 opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
1806 ? fpopr1->getValueAPF().convertToDouble()
1807 : (double)fpopr1->getValueAPF().convertToFloat();
1808 }
1809
1810 switch (FInfo.getId()) {
1811 default:
1812 return false;
1813
1814 case AMDGPULibFunc::EI_ACOS:
1815 Res0 = APFloat{acos(x: opr0)};
1816 return true;
1817
1818 case AMDGPULibFunc::EI_ACOSH:
1819 // acosh(x) == log(x + sqrt(x*x - 1))
1820 Res0 = APFloat{log(x: opr0 + sqrt(x: opr0 * opr0 - 1.0))};
1821 return true;
1822
1823 case AMDGPULibFunc::EI_ACOSPI:
1824 Res0 = APFloat{acos(x: opr0) / MATH_PI};
1825 return true;
1826
1827 case AMDGPULibFunc::EI_ASIN:
1828 Res0 = APFloat{asin(x: opr0)};
1829 return true;
1830
1831 case AMDGPULibFunc::EI_ASINH:
1832 // asinh(x) == log(x + sqrt(x*x + 1))
1833 Res0 = APFloat{log(x: opr0 + sqrt(x: opr0 * opr0 + 1.0))};
1834 return true;
1835
1836 case AMDGPULibFunc::EI_ASINPI:
1837 Res0 = APFloat{asin(x: opr0) / MATH_PI};
1838 return true;
1839
1840 case AMDGPULibFunc::EI_ATAN:
1841 Res0 = APFloat{atan(x: opr0)};
1842 return true;
1843
1844 case AMDGPULibFunc::EI_ATANH:
1845 // atanh(x) == (log(x+1) - log(x-1))/2;
1846 Res0 = APFloat{(log(x: opr0 + 1.0) - log(x: opr0 - 1.0)) / 2.0};
1847 return true;
1848
1849 case AMDGPULibFunc::EI_ATANPI:
1850 Res0 = APFloat{atan(x: opr0) / MATH_PI};
1851 return true;
1852
1853 case AMDGPULibFunc::EI_CBRT:
1854 Res0 =
1855 APFloat{(opr0 < 0.0) ? -pow(x: -opr0, y: 1.0 / 3.0) : pow(x: opr0, y: 1.0 / 3.0)};
1856 return true;
1857
1858 case AMDGPULibFunc::EI_COS:
1859 Res0 = APFloat{cos(x: opr0)};
1860 return true;
1861
1862 case AMDGPULibFunc::EI_COSH:
1863 Res0 = APFloat{cosh(x: opr0)};
1864 return true;
1865
1866 case AMDGPULibFunc::EI_COSPI:
1867 Res0 = APFloat{cos(MATH_PI * opr0)};
1868 return true;
1869
1870 case AMDGPULibFunc::EI_EXP:
1871 Res0 = APFloat{exp(x: opr0)};
1872 return true;
1873
1874 case AMDGPULibFunc::EI_EXP2:
1875 Res0 = APFloat{pow(x: 2.0, y: opr0)};
1876 return true;
1877
1878 case AMDGPULibFunc::EI_EXP10:
1879 Res0 = APFloat{pow(x: 10.0, y: opr0)};
1880 return true;
1881
1882 case AMDGPULibFunc::EI_LOG:
1883 Res0 = APFloat{log(x: opr0)};
1884 return true;
1885
1886 case AMDGPULibFunc::EI_LOG2:
1887 Res0 = APFloat{log(x: opr0) / log(x: 2.0)};
1888 return true;
1889
1890 case AMDGPULibFunc::EI_LOG10:
1891 Res0 = APFloat{log(x: opr0) / log(x: 10.0)};
1892 return true;
1893
1894 case AMDGPULibFunc::EI_RSQRT:
1895 Res0 = APFloat{1.0 / sqrt(x: opr0)};
1896 return true;
1897
1898 case AMDGPULibFunc::EI_SIN:
1899 Res0 = APFloat{sin(x: opr0)};
1900 return true;
1901
1902 case AMDGPULibFunc::EI_SINH:
1903 Res0 = APFloat{sinh(x: opr0)};
1904 return true;
1905
1906 case AMDGPULibFunc::EI_SINPI:
1907 Res0 = APFloat{sin(MATH_PI * opr0)};
1908 return true;
1909
1910 case AMDGPULibFunc::EI_TAN:
1911 Res0 = APFloat{tan(x: opr0)};
1912 return true;
1913
1914 case AMDGPULibFunc::EI_TANH:
1915 Res0 = APFloat{tanh(x: opr0)};
1916 return true;
1917
1918 case AMDGPULibFunc::EI_TANPI:
1919 Res0 = APFloat{tan(MATH_PI * opr0)};
1920 return true;
1921
1922 // two-arg functions
1923 case AMDGPULibFunc::EI_POW:
1924 case AMDGPULibFunc::EI_POWR:
1925 Res0 = APFloat{pow(x: opr0, y: opr1)};
1926 return true;
1927
1928 case AMDGPULibFunc::EI_POWN: {
1929 if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) {
1930 double val = (double)iopr1->getSExtValue();
1931 Res0 = APFloat{pow(x: opr0, y: val)};
1932 return true;
1933 }
1934 return false;
1935 }
1936
1937 case AMDGPULibFunc::EI_ROOTN: {
1938 if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) {
1939 double val = (double)iopr1->getSExtValue();
1940 Res0 = APFloat{pow(x: opr0, y: 1.0 / val)};
1941 return true;
1942 }
1943 return false;
1944 }
1945
1946 // with ptr arg
1947 case AMDGPULibFunc::EI_SINCOS:
1948 Res0 = APFloat{sin(x: opr0)};
1949 Res1 = APFloat{cos(x: opr0)};
1950 return true;
1951 }
1952
1953 return false;
1954}
1955
1956bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
1957 int numArgs = (int)aCI->arg_size();
1958 if (numArgs > 3)
1959 return false;
1960
1961 Constant *copr0 = nullptr;
1962 Constant *copr1 = nullptr;
1963 if (numArgs > 0) {
1964 if ((copr0 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: 0))) == nullptr)
1965 return false;
1966 }
1967
1968 if (numArgs > 1) {
1969 if ((copr1 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: 1))) == nullptr) {
1970 if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
1971 return false;
1972 }
1973 }
1974
1975 // At this point, all arguments to aCI are constants.
1976
1977 // max vector size is 16, and sincos will generate two results.
1978 SmallVector<APFloat, 16> Val0, Val1;
1979 int FuncVecSize = getVecSize(FInfo);
1980 bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
1981 if (FuncVecSize == 1) {
1982 if (!evaluateScalarMathFunc(FInfo, Res0&: Val0.emplace_back(Args: 0.0),
1983 Res1&: Val1.emplace_back(Args: 0.0), copr0, copr1)) {
1984 return false;
1985 }
1986 } else {
1987 ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(Val: copr0);
1988 ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(Val: copr1);
1989 for (int i = 0; i < FuncVecSize; ++i) {
1990 Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
1991 Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
1992 if (!evaluateScalarMathFunc(FInfo, Res0&: Val0.emplace_back(Args: 0.0),
1993 Res1&: Val1.emplace_back(Args: 0.0), copr0: celt0, copr1: celt1)) {
1994 return false;
1995 }
1996 }
1997 }
1998
1999 Constant *nval0, *nval1;
2000 if (FuncVecSize == 1) {
2001 nval0 = ConstantFP::get(Ty: aCI->getType(), V: Val0[0]);
2002 if (hasTwoResults)
2003 nval1 = ConstantFP::get(Ty: aCI->getType(), V: Val1[0]);
2004 } else {
2005 nval0 = getConstantFloatVector(Values: Val0, Ty: aCI->getType());
2006 if (hasTwoResults)
2007 nval1 = getConstantFloatVector(Values: Val1, Ty: aCI->getType());
2008 }
2009
2010 if (hasTwoResults) {
2011 // sincos
2012 assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
2013 "math function with ptr arg not supported yet");
2014 new StoreInst(nval1, aCI->getArgOperand(i: 1), aCI->getIterator());
2015 }
2016
2017 replaceCall(I: aCI, With: nval0);
2018 return true;
2019}
2020
2021PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
2022 FunctionAnalysisManager &AM) {
2023 AMDGPULibCalls Simplifier(F, AM);
2024 Simplifier.initNativeFuncs();
2025
2026 bool Changed = false;
2027
2028 LLVM_DEBUG(dbgs() << "AMDIC: process function ";
2029 F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
2030
2031 for (auto &BB : F) {
2032 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
2033 // Ignore non-calls.
2034 CallInst *CI = dyn_cast<CallInst>(Val&: I);
2035 ++I;
2036
2037 if (CI) {
2038 if (Simplifier.fold(CI))
2039 Changed = true;
2040 }
2041 }
2042 }
2043 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
2044}
2045
2046PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
2047 FunctionAnalysisManager &AM) {
2048 if (UseNative.empty())
2049 return PreservedAnalyses::all();
2050
2051 AMDGPULibCalls Simplifier(F, AM);
2052 Simplifier.initNativeFuncs();
2053
2054 bool Changed = false;
2055 for (auto &BB : F) {
2056 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
2057 // Ignore non-calls.
2058 CallInst *CI = dyn_cast<CallInst>(Val&: I);
2059 ++I;
2060 if (CI && Simplifier.useNative(aCI: CI))
2061 Changed = true;
2062 }
2063 }
2064 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
2065}
2066