//===- AMDGPULibCalls.cpp -------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file does AMD library function optimizations.
//
//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "AMDGPULibFunc.h"
16#include "llvm/Analysis/AssumptionCache.h"
17#include "llvm/Analysis/TargetLibraryInfo.h"
18#include "llvm/Analysis/ValueTracking.h"
19#include "llvm/IR/AttributeMask.h"
20#include "llvm/IR/Dominators.h"
21#include "llvm/IR/IRBuilder.h"
22#include "llvm/IR/IntrinsicsAMDGPU.h"
23#include "llvm/IR/MDBuilder.h"
24#include "llvm/IR/PatternMatch.h"
25#include <cmath>
26
27#define DEBUG_TYPE "amdgpu-simplifylib"
28
29using namespace llvm;
30using namespace llvm::PatternMatch;
31
32static cl::opt<bool> EnablePreLink("amdgpu-prelink",
33 cl::desc("Enable pre-link mode optimizations"),
34 cl::init(Val: false),
35 cl::Hidden);
36
37static cl::list<std::string> UseNative("amdgpu-use-native",
38 cl::desc("Comma separated list of functions to replace with native, or all"),
39 cl::CommaSeparated, cl::ValueOptional,
40 cl::Hidden);
41
// Shorthand for the llvm::numbers constants used by the folding tables below.
#define MATH_PI numbers::pi
#define MATH_E numbers::e
#define MATH_SQRT2 numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2
46
/// Selects which member of the pow family expandFastPow() should expand.
enum class PowKind { Pow, PowR, PowN, RootN };
48
49namespace llvm {
50
51class AMDGPULibCalls {
52private:
53 SimplifyQuery SQ;
54
55 using FuncInfo = llvm::AMDGPULibFunc;
56
57 // -fuse-native.
58 bool AllNative = false;
59
60 bool useNativeFunc(const StringRef F) const;
61
62 // Return a pointer (pointer expr) to the function if function definition with
63 // "FuncName" exists. It may create a new function prototype in pre-link mode.
64 FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);
65
66 /// Wrapper around getFunction which tries to use a faster variant if
67 /// available, and falls back to a less fast option.
68 ///
69 /// Return a replacement function for \p fInfo that has float-typed fast
70 /// variants. \p NewFunc is a base replacement function to use. \p
71 /// NewFuncFastVariant is a faster version to use if the calling context knows
72 /// it's legal. If there is no fast variant to use, \p NewFuncFastVariant
73 /// should be EI_NONE.
74 FunctionCallee getFloatFastVariant(Module *M, const FuncInfo &fInfo,
75 FuncInfo &newInfo,
76 AMDGPULibFunc::EFuncId NewFunc,
77 AMDGPULibFunc::EFuncId NewFuncFastVariant);
78
79 bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);
80
81 bool TDOFold(CallInst *CI, const FuncInfo &FInfo);
82
83 /* Specialized optimizations */
84
85 // pow/powr/pown
86 bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
87
88 /// Peform a fast math expansion of pow, powr, pown or rootn.
89 bool expandFastPow(FPMathOperator *FPOp, IRBuilder<> &B, PowKind Kind);
90
91 bool tryOptimizePow(FPMathOperator *FPOp, IRBuilder<> &B,
92 const FuncInfo &FInfo);
93
94 // rootn
95 bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
96
97 // -fuse-native for sincos
98 bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);
99
100 // evaluate calls if calls' arguments are constants.
101 bool evaluateScalarMathFunc(const FuncInfo &FInfo, APFloat &Res0,
102 APFloat &Res1, Constant *copr0, Constant *copr1);
103 bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);
104
105 /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value
106 /// of cos, sincos call).
107 std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
108 FastMathFlags FMF,
109 IRBuilder<> &B,
110 FunctionCallee Fsincos);
111
112 // sin/cos
113 bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
114
115 // __read_pipe/__write_pipe
116 bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
117 const FuncInfo &FInfo);
118
119 // Get a scalar native builtin single argument FP function
120 FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
121
122 /// Substitute a call to a known libcall with an intrinsic call. If \p
123 /// AllowMinSize is true, allow the replacement in a minsize function.
124 bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
125 bool AllowMinSizeF32 = false,
126 bool AllowF64 = false,
127 bool AllowStrictFP = false);
128 void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
129 Intrinsic::ID IntrID);
130
131 bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
132 Intrinsic::ID IntrID,
133 bool AllowMinSizeF32 = false,
134 bool AllowF64 = false,
135 bool AllowStrictFP = false);
136
137protected:
138 bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;
139
140 bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;
141
142 static void replaceCall(Instruction *I, Value *With) {
143 I->replaceAllUsesWith(V: With);
144 I->eraseFromParent();
145 }
146
147 static void replaceCall(FPMathOperator *I, Value *With) {
148 replaceCall(I: cast<Instruction>(Val: I), With);
149 }
150
151public:
152 AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM);
153
154 bool fold(CallInst *CI);
155
156 void initNativeFuncs();
157
158 // Replace a normal math function call with that native version
159 bool useNative(CallInst *CI);
160};
161
162} // end namespace llvm
163
164template <typename IRB>
165static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
166 const Twine &Name = "") {
167 CallInst *R = B.CreateCall(Callee, Arg, Name);
168 if (Function *F = dyn_cast<Function>(Val: Callee.getCallee()))
169 R->setCallingConv(F->getCallingConv());
170 return R;
171}
172
173template <typename IRB>
174static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
175 Value *Arg2, const Twine &Name = "") {
176 CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
177 if (Function *F = dyn_cast<Function>(Val: Callee.getCallee()))
178 R->setCallingConv(F->getCallingConv());
179 return R;
180}
181
182static FunctionType *getPownType(FunctionType *FT) {
183 Type *PowNExpTy = Type::getInt32Ty(C&: FT->getContext());
184 if (VectorType *VecTy = dyn_cast<VectorType>(Val: FT->getReturnType()))
185 PowNExpTy = VectorType::get(ElementType: PowNExpTy, EC: VecTy->getElementCount());
186
187 return FunctionType::get(Result: FT->getReturnType(),
188 Params: {FT->getParamType(i: 0), PowNExpTy}, isVarArg: false);
189}
190
// Data structures for table-driven optimizations.
// FuncTbl works for both f32 and f64 functions with 1 input argument

/// One row of a folding table: calling the function on exactly `input` folds
/// to `result`. Note the member order: result first, then input.
struct TableEntry {
  double result;
  double input;
};
198
199/* a list of {result, input} */
200static const TableEntry tbl_acos[] = {
201 {MATH_PI / 2.0, .input: 0.0},
202 {MATH_PI / 2.0, .input: -0.0},
203 {.result: 0.0, .input: 1.0},
204 {MATH_PI, .input: -1.0}
205};
206static const TableEntry tbl_acosh[] = {
207 {.result: 0.0, .input: 1.0}
208};
209static const TableEntry tbl_acospi[] = {
210 {.result: 0.5, .input: 0.0},
211 {.result: 0.5, .input: -0.0},
212 {.result: 0.0, .input: 1.0},
213 {.result: 1.0, .input: -1.0}
214};
215static const TableEntry tbl_asin[] = {
216 {.result: 0.0, .input: 0.0},
217 {.result: -0.0, .input: -0.0},
218 {MATH_PI / 2.0, .input: 1.0},
219 {.result: -MATH_PI / 2.0, .input: -1.0}
220};
221static const TableEntry tbl_asinh[] = {
222 {.result: 0.0, .input: 0.0},
223 {.result: -0.0, .input: -0.0}
224};
225static const TableEntry tbl_asinpi[] = {
226 {.result: 0.0, .input: 0.0},
227 {.result: -0.0, .input: -0.0},
228 {.result: 0.5, .input: 1.0},
229 {.result: -0.5, .input: -1.0}
230};
231static const TableEntry tbl_atan[] = {
232 {.result: 0.0, .input: 0.0},
233 {.result: -0.0, .input: -0.0},
234 {MATH_PI / 4.0, .input: 1.0},
235 {.result: -MATH_PI / 4.0, .input: -1.0}
236};
237static const TableEntry tbl_atanh[] = {
238 {.result: 0.0, .input: 0.0},
239 {.result: -0.0, .input: -0.0}
240};
241static const TableEntry tbl_atanpi[] = {
242 {.result: 0.0, .input: 0.0},
243 {.result: -0.0, .input: -0.0},
244 {.result: 0.25, .input: 1.0},
245 {.result: -0.25, .input: -1.0}
246};
247static const TableEntry tbl_cbrt[] = {
248 {.result: 0.0, .input: 0.0},
249 {.result: -0.0, .input: -0.0},
250 {.result: 1.0, .input: 1.0},
251 {.result: -1.0, .input: -1.0},
252};
253static const TableEntry tbl_cos[] = {
254 {.result: 1.0, .input: 0.0},
255 {.result: 1.0, .input: -0.0}
256};
257static const TableEntry tbl_cosh[] = {
258 {.result: 1.0, .input: 0.0},
259 {.result: 1.0, .input: -0.0}
260};
261static const TableEntry tbl_cospi[] = {
262 {.result: 1.0, .input: 0.0},
263 {.result: 1.0, .input: -0.0}
264};
265static const TableEntry tbl_erfc[] = {
266 {.result: 1.0, .input: 0.0},
267 {.result: 1.0, .input: -0.0}
268};
269static const TableEntry tbl_erf[] = {
270 {.result: 0.0, .input: 0.0},
271 {.result: -0.0, .input: -0.0}
272};
273static const TableEntry tbl_exp[] = {
274 {.result: 1.0, .input: 0.0},
275 {.result: 1.0, .input: -0.0},
276 {MATH_E, .input: 1.0}
277};
278static const TableEntry tbl_exp2[] = {
279 {.result: 1.0, .input: 0.0},
280 {.result: 1.0, .input: -0.0},
281 {.result: 2.0, .input: 1.0}
282};
283static const TableEntry tbl_exp10[] = {
284 {.result: 1.0, .input: 0.0},
285 {.result: 1.0, .input: -0.0},
286 {.result: 10.0, .input: 1.0}
287};
288static const TableEntry tbl_expm1[] = {
289 {.result: 0.0, .input: 0.0},
290 {.result: -0.0, .input: -0.0}
291};
292static const TableEntry tbl_log[] = {
293 {.result: 0.0, .input: 1.0},
294 {.result: 1.0, MATH_E}
295};
296static const TableEntry tbl_log2[] = {
297 {.result: 0.0, .input: 1.0},
298 {.result: 1.0, .input: 2.0}
299};
300static const TableEntry tbl_log10[] = {
301 {.result: 0.0, .input: 1.0},
302 {.result: 1.0, .input: 10.0}
303};
304static const TableEntry tbl_rsqrt[] = {
305 {.result: 1.0, .input: 1.0},
306 {MATH_SQRT1_2, .input: 2.0}
307};
308static const TableEntry tbl_sin[] = {
309 {.result: 0.0, .input: 0.0},
310 {.result: -0.0, .input: -0.0}
311};
312static const TableEntry tbl_sinh[] = {
313 {.result: 0.0, .input: 0.0},
314 {.result: -0.0, .input: -0.0}
315};
316static const TableEntry tbl_sinpi[] = {
317 {.result: 0.0, .input: 0.0},
318 {.result: -0.0, .input: -0.0}
319};
320static const TableEntry tbl_sqrt[] = {
321 {.result: 0.0, .input: 0.0},
322 {.result: 1.0, .input: 1.0},
323 {MATH_SQRT2, .input: 2.0}
324};
325static const TableEntry tbl_tan[] = {
326 {.result: 0.0, .input: 0.0},
327 {.result: -0.0, .input: -0.0}
328};
329static const TableEntry tbl_tanh[] = {
330 {.result: 0.0, .input: 0.0},
331 {.result: -0.0, .input: -0.0}
332};
333static const TableEntry tbl_tanpi[] = {
334 {.result: 0.0, .input: 0.0},
335 {.result: -0.0, .input: -0.0}
336};
337static const TableEntry tbl_tgamma[] = {
338 {.result: 1.0, .input: 1.0},
339 {.result: 1.0, .input: 2.0},
340 {.result: 2.0, .input: 3.0},
341 {.result: 6.0, .input: 4.0}
342};
343
344static bool HasNative(AMDGPULibFunc::EFuncId id) {
345 switch(id) {
346 case AMDGPULibFunc::EI_DIVIDE:
347 case AMDGPULibFunc::EI_COS:
348 case AMDGPULibFunc::EI_EXP:
349 case AMDGPULibFunc::EI_EXP2:
350 case AMDGPULibFunc::EI_EXP10:
351 case AMDGPULibFunc::EI_LOG:
352 case AMDGPULibFunc::EI_LOG2:
353 case AMDGPULibFunc::EI_LOG10:
354 case AMDGPULibFunc::EI_POWR:
355 case AMDGPULibFunc::EI_RECIP:
356 case AMDGPULibFunc::EI_RSQRT:
357 case AMDGPULibFunc::EI_SIN:
358 case AMDGPULibFunc::EI_SINCOS:
359 case AMDGPULibFunc::EI_SQRT:
360 case AMDGPULibFunc::EI_TAN:
361 return true;
362 default:;
363 }
364 return false;
365}
366
367using TableRef = ArrayRef<TableEntry>;
368
369static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
370 switch(id) {
371 case AMDGPULibFunc::EI_ACOS: return TableRef(tbl_acos);
372 case AMDGPULibFunc::EI_ACOSH: return TableRef(tbl_acosh);
373 case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi);
374 case AMDGPULibFunc::EI_ASIN: return TableRef(tbl_asin);
375 case AMDGPULibFunc::EI_ASINH: return TableRef(tbl_asinh);
376 case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi);
377 case AMDGPULibFunc::EI_ATAN: return TableRef(tbl_atan);
378 case AMDGPULibFunc::EI_ATANH: return TableRef(tbl_atanh);
379 case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi);
380 case AMDGPULibFunc::EI_CBRT: return TableRef(tbl_cbrt);
381 case AMDGPULibFunc::EI_NCOS:
382 case AMDGPULibFunc::EI_COS: return TableRef(tbl_cos);
383 case AMDGPULibFunc::EI_COSH: return TableRef(tbl_cosh);
384 case AMDGPULibFunc::EI_COSPI: return TableRef(tbl_cospi);
385 case AMDGPULibFunc::EI_ERFC: return TableRef(tbl_erfc);
386 case AMDGPULibFunc::EI_ERF: return TableRef(tbl_erf);
387 case AMDGPULibFunc::EI_EXP: return TableRef(tbl_exp);
388 case AMDGPULibFunc::EI_NEXP2:
389 case AMDGPULibFunc::EI_EXP2: return TableRef(tbl_exp2);
390 case AMDGPULibFunc::EI_EXP10: return TableRef(tbl_exp10);
391 case AMDGPULibFunc::EI_EXPM1: return TableRef(tbl_expm1);
392 case AMDGPULibFunc::EI_LOG: return TableRef(tbl_log);
393 case AMDGPULibFunc::EI_NLOG2:
394 case AMDGPULibFunc::EI_LOG2: return TableRef(tbl_log2);
395 case AMDGPULibFunc::EI_LOG10: return TableRef(tbl_log10);
396 case AMDGPULibFunc::EI_NRSQRT:
397 case AMDGPULibFunc::EI_RSQRT: return TableRef(tbl_rsqrt);
398 case AMDGPULibFunc::EI_NSIN:
399 case AMDGPULibFunc::EI_SIN: return TableRef(tbl_sin);
400 case AMDGPULibFunc::EI_SINH: return TableRef(tbl_sinh);
401 case AMDGPULibFunc::EI_SINPI: return TableRef(tbl_sinpi);
402 case AMDGPULibFunc::EI_NSQRT:
403 case AMDGPULibFunc::EI_SQRT: return TableRef(tbl_sqrt);
404 case AMDGPULibFunc::EI_TAN: return TableRef(tbl_tan);
405 case AMDGPULibFunc::EI_TANH: return TableRef(tbl_tanh);
406 case AMDGPULibFunc::EI_TANPI: return TableRef(tbl_tanpi);
407 case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma);
408 default:;
409 }
410 return TableRef();
411}
412
413static inline int getVecSize(const AMDGPULibFunc& FInfo) {
414 return FInfo.getLeads()[0].VectorSize;
415}
416
417static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
418 return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType;
419}
420
421FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) {
422 // If we are doing PreLinkOpt, the function is external. So it is safe to
423 // use getOrInsertFunction() at this stage.
424
425 return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
426 : AMDGPULibFunc::getFunction(M, fInfo);
427}
428
429FunctionCallee AMDGPULibCalls::getFloatFastVariant(
430 Module *M, const FuncInfo &fInfo, FuncInfo &newInfo,
431 AMDGPULibFunc::EFuncId NewFunc, AMDGPULibFunc::EFuncId FastVariant) {
432 assert(NewFunc != FastVariant);
433
434 if (FastVariant != AMDGPULibFunc::EI_NONE &&
435 getArgType(FInfo: fInfo) == AMDGPULibFunc::F32) {
436 newInfo = AMDGPULibFunc(FastVariant, fInfo);
437 if (FunctionCallee NewCallee = getFunction(M, fInfo: newInfo))
438 return NewCallee;
439 }
440
441 newInfo = AMDGPULibFunc(NewFunc, fInfo);
442 return getFunction(M, fInfo: newInfo);
443}
444
445bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
446 FuncInfo &FInfo) {
447 return AMDGPULibFunc::parse(MangledName: FMangledName, Ptr&: FInfo);
448}
449
450bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
451 return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs();
452}
453
454bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
455 const FPMathOperator *FPOp) const {
456 // TODO: Refine to approxFunc or contract
457 return FPOp->isFast();
458}
459
460AMDGPULibCalls::AMDGPULibCalls(Function &F, FunctionAnalysisManager &FAM)
461 : SQ(F.getParent()->getDataLayout(),
462 &FAM.getResult<TargetLibraryAnalysis>(IR&: F),
463 FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F),
464 &FAM.getResult<AssumptionAnalysis>(IR&: F)) {}
465
466bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
467 return AllNative || llvm::is_contained(Range&: UseNative, Element: F);
468}
469
470void AMDGPULibCalls::initNativeFuncs() {
471 AllNative = useNativeFunc(F: "all") ||
472 (UseNative.getNumOccurrences() && UseNative.size() == 1 &&
473 UseNative.begin()->empty());
474}
475
476bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
477 bool native_sin = useNativeFunc(F: "sin");
478 bool native_cos = useNativeFunc(F: "cos");
479
480 if (native_sin && native_cos) {
481 Module *M = aCI->getModule();
482 Value *opr0 = aCI->getArgOperand(i: 0);
483
484 AMDGPULibFunc nf;
485 nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
486 nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;
487
488 nf.setPrefix(AMDGPULibFunc::NATIVE);
489 nf.setId(AMDGPULibFunc::EI_SIN);
490 FunctionCallee sinExpr = getFunction(M, fInfo: nf);
491
492 nf.setPrefix(AMDGPULibFunc::NATIVE);
493 nf.setId(AMDGPULibFunc::EI_COS);
494 FunctionCallee cosExpr = getFunction(M, fInfo: nf);
495 if (sinExpr && cosExpr) {
496 Value *sinval =
497 CallInst::Create(Func: sinExpr, Args: opr0, NameStr: "splitsin", InsertBefore: aCI->getIterator());
498 Value *cosval =
499 CallInst::Create(Func: cosExpr, Args: opr0, NameStr: "splitcos", InsertBefore: aCI->getIterator());
500 new StoreInst(cosval, aCI->getArgOperand(i: 1), aCI->getIterator());
501
502 DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
503 << " with native version of sin/cos");
504
505 replaceCall(I: aCI, With: sinval);
506 return true;
507 }
508 }
509 return false;
510}
511
512bool AMDGPULibCalls::useNative(CallInst *aCI) {
513 Function *Callee = aCI->getCalledFunction();
514 if (!Callee || aCI->isNoBuiltin())
515 return false;
516
517 FuncInfo FInfo;
518 if (!parseFunctionName(FMangledName: Callee->getName(), FInfo) || !FInfo.isMangled() ||
519 FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
520 getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(id: FInfo.getId()) ||
521 !(AllNative || useNativeFunc(F: FInfo.getName()))) {
522 return false;
523 }
524
525 if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
526 return sincosUseNative(aCI, FInfo);
527
528 FInfo.setPrefix(AMDGPULibFunc::NATIVE);
529 FunctionCallee F = getFunction(M: aCI->getModule(), fInfo: FInfo);
530 if (!F)
531 return false;
532
533 aCI->setCalledFunction(F);
534 DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
535 << " with native version");
536 return true;
537}
538
539// Clang emits call of __read_pipe_2 or __read_pipe_4 for OpenCL read_pipe
540// builtin, with appended type size and alignment arguments, where 2 or 4
541// indicates the original number of arguments. The library has optimized version
542// of __read_pipe_2/__read_pipe_4 when the type size and alignment has the same
543// power of 2 value. This function transforms __read_pipe_2 to __read_pipe_2_N
544// for such cases where N is the size in bytes of the type (N = 1, 2, 4, 8, ...,
545// 128). The same for __read_pipe_4, write_pipe_2, and write_pipe_4.
546bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
547 const FuncInfo &FInfo) {
548 auto *Callee = CI->getCalledFunction();
549 if (!Callee->isDeclaration())
550 return false;
551
552 assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
553 auto *M = Callee->getParent();
554 std::string Name = std::string(Callee->getName());
555 auto NumArg = CI->arg_size();
556 if (NumArg != 4 && NumArg != 6)
557 return false;
558 ConstantInt *PacketSize =
559 dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - 2));
560 ConstantInt *PacketAlign =
561 dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - 1));
562 if (!PacketSize || !PacketAlign)
563 return false;
564
565 unsigned Size = PacketSize->getZExtValue();
566 Align Alignment = PacketAlign->getAlignValue();
567 if (Alignment != Size)
568 return false;
569
570 unsigned PtrArgLoc = CI->arg_size() - 3;
571 Value *PtrArg = CI->getArgOperand(i: PtrArgLoc);
572 Type *PtrTy = PtrArg->getType();
573
574 SmallVector<llvm::Type *, 6> ArgTys;
575 for (unsigned I = 0; I != PtrArgLoc; ++I)
576 ArgTys.push_back(Elt: CI->getArgOperand(i: I)->getType());
577 ArgTys.push_back(Elt: PtrTy);
578
579 Name = Name + "_" + std::to_string(val: Size);
580 auto *FTy = FunctionType::get(Result: Callee->getReturnType(),
581 Params: ArrayRef<Type *>(ArgTys), isVarArg: false);
582 AMDGPULibFunc NewLibFunc(Name, FTy);
583 FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, fInfo: NewLibFunc);
584 if (!F)
585 return false;
586
587 SmallVector<Value *, 6> Args;
588 for (unsigned I = 0; I != PtrArgLoc; ++I)
589 Args.push_back(Elt: CI->getArgOperand(i: I));
590 Args.push_back(Elt: PtrArg);
591
592 auto *NCI = B.CreateCall(Callee: F, Args);
593 NCI->setAttributes(CI->getAttributes());
594 CI->replaceAllUsesWith(V: NCI);
595 CI->dropAllReferences();
596 CI->eraseFromParent();
597
598 return true;
599}
600
601// This function returns false if no change; return true otherwise.
602bool AMDGPULibCalls::fold(CallInst *CI) {
603 Function *Callee = CI->getCalledFunction();
604 // Ignore indirect calls.
605 if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
606 return false;
607
608 FuncInfo FInfo;
609 if (!parseFunctionName(FMangledName: Callee->getName(), FInfo))
610 return false;
611
612 // Further check the number of arguments to see if they match.
613 // TODO: Check calling convention matches too
614 if (!FInfo.isCompatibleSignature(M: *Callee->getParent(), FuncTy: CI->getFunctionType()))
615 return false;
616
617 LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');
618
619 if (TDOFold(CI, FInfo))
620 return true;
621
622 IRBuilder<> B(CI);
623 if (CI->isStrictFP())
624 B.setIsFPConstrained(true);
625
626 if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: CI)) {
627 // Under unsafe-math, evaluate calls if possible.
628 // According to Brian Sumner, we can do this for all f32 function calls
629 // using host's double function calls.
630 if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(aCI: CI, FInfo))
631 return true;
632
633 // Copy fast flags from the original call.
634 FastMathFlags FMF = FPOp->getFastMathFlags();
635 B.setFastMathFlags(FMF);
636
637 // Specialized optimizations for each function call.
638 //
639 // TODO: Handle native functions
640 switch (FInfo.getId()) {
641 case AMDGPULibFunc::EI_EXP:
642 if (FMF.none())
643 return false;
644 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::exp,
645 AllowMinSizeF32: FMF.approxFunc());
646 case AMDGPULibFunc::EI_EXP2:
647 if (FMF.none())
648 return false;
649 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::exp2,
650 AllowMinSizeF32: FMF.approxFunc());
651 case AMDGPULibFunc::EI_LOG:
652 if (FMF.none())
653 return false;
654 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log,
655 AllowMinSizeF32: FMF.approxFunc());
656 case AMDGPULibFunc::EI_LOG2:
657 if (FMF.none())
658 return false;
659 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log2,
660 AllowMinSizeF32: FMF.approxFunc());
661 case AMDGPULibFunc::EI_LOG10:
662 if (FMF.none())
663 return false;
664 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log10,
665 AllowMinSizeF32: FMF.approxFunc());
666 case AMDGPULibFunc::EI_FMIN:
667 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::minnum,
668 AllowMinSizeF32: true, AllowF64: true);
669 case AMDGPULibFunc::EI_FMAX:
670 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::maxnum,
671 AllowMinSizeF32: true, AllowF64: true);
672 case AMDGPULibFunc::EI_FMA:
673 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fma, AllowMinSizeF32: true,
674 AllowF64: true);
675 case AMDGPULibFunc::EI_MAD:
676 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fmuladd,
677 AllowMinSizeF32: true, AllowF64: true);
678 case AMDGPULibFunc::EI_FABS:
679 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fabs, AllowMinSizeF32: true,
680 AllowF64: true, AllowStrictFP: true);
681 case AMDGPULibFunc::EI_COPYSIGN:
682 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::copysign,
683 AllowMinSizeF32: true, AllowF64: true, AllowStrictFP: true);
684 case AMDGPULibFunc::EI_FLOOR:
685 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::floor, AllowMinSizeF32: true,
686 AllowF64: true);
687 case AMDGPULibFunc::EI_CEIL:
688 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::ceil, AllowMinSizeF32: true,
689 AllowF64: true);
690 case AMDGPULibFunc::EI_TRUNC:
691 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::trunc, AllowMinSizeF32: true,
692 AllowF64: true);
693 case AMDGPULibFunc::EI_RINT:
694 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::rint, AllowMinSizeF32: true,
695 AllowF64: true);
696 case AMDGPULibFunc::EI_ROUND:
697 return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::round, AllowMinSizeF32: true,
698 AllowF64: true);
699 case AMDGPULibFunc::EI_LDEXP: {
700 if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32: true, AllowF64: true))
701 return false;
702
703 Value *Arg1 = CI->getArgOperand(i: 1);
704 if (VectorType *VecTy = dyn_cast<VectorType>(Val: CI->getType());
705 VecTy && !isa<VectorType>(Val: Arg1->getType())) {
706 Value *SplatArg1 = B.CreateVectorSplat(EC: VecTy->getElementCount(), V: Arg1);
707 CI->setArgOperand(i: 1, v: SplatArg1);
708 }
709
710 CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
711 M: CI->getModule(), id: Intrinsic::ldexp,
712 OverloadTys: {CI->getType(), CI->getArgOperand(i: 1)->getType()}));
713 CI->setCallingConv(CallingConv::C);
714 return true;
715 }
716 case AMDGPULibFunc::EI_POW:
717 case AMDGPULibFunc::EI_POW_FAST:
718 return tryOptimizePow(FPOp, B, FInfo);
719 case AMDGPULibFunc::EI_POWR:
720 case AMDGPULibFunc::EI_POWR_FAST: {
721 if (fold_pow(FPOp, B, FInfo))
722 return true;
723 if (!FMF.approxFunc())
724 return false;
725
726 if (FInfo.getId() == AMDGPULibFunc::EI_POWR && FMF.approxFunc() &&
727 getArgType(FInfo) == AMDGPULibFunc::F32) {
728 Module *M = Callee->getParent();
729 AMDGPULibFunc PowrFastInfo(AMDGPULibFunc::EI_POWR_FAST, FInfo);
730 if (FunctionCallee PowrFastFunc = getFunction(M, fInfo: PowrFastInfo)) {
731 CI->setCalledFunction(PowrFastFunc);
732 return true;
733 }
734 }
735
736 if (!shouldReplaceLibcallWithIntrinsic(CI))
737 return false;
738 return expandFastPow(FPOp, B, Kind: PowKind::PowR);
739 }
740 case AMDGPULibFunc::EI_POWN:
741 case AMDGPULibFunc::EI_POWN_FAST: {
742 if (fold_pow(FPOp, B, FInfo))
743 return true;
744 if (!FMF.approxFunc())
745 return false;
746
747 if (FInfo.getId() == AMDGPULibFunc::EI_POWN &&
748 getArgType(FInfo) == AMDGPULibFunc::F32) {
749 Module *M = Callee->getParent();
750 AMDGPULibFunc PownFastInfo(AMDGPULibFunc::EI_POWN_FAST, FInfo);
751 if (FunctionCallee PownFastFunc = getFunction(M, fInfo: PownFastInfo)) {
752 CI->setCalledFunction(PownFastFunc);
753 return true;
754 }
755 }
756
757 if (!shouldReplaceLibcallWithIntrinsic(CI))
758 return false;
759 return expandFastPow(FPOp, B, Kind: PowKind::PowN);
760 }
761 case AMDGPULibFunc::EI_ROOTN:
762 case AMDGPULibFunc::EI_ROOTN_FAST: {
763 if (fold_rootn(FPOp, B, FInfo))
764 return true;
765 if (!FMF.approxFunc())
766 return false;
767
768 if (getArgType(FInfo) == AMDGPULibFunc::F32) {
769 Module *M = Callee->getParent();
770 AMDGPULibFunc RootnFastInfo(AMDGPULibFunc::EI_ROOTN_FAST, FInfo);
771 if (FunctionCallee RootnFastFunc = getFunction(M, fInfo: RootnFastInfo)) {
772 CI->setCalledFunction(RootnFastFunc);
773 return true;
774 }
775 }
776
777 return expandFastPow(FPOp, B, Kind: PowKind::RootN);
778 }
779 case AMDGPULibFunc::EI_SQRT:
780 // TODO: Allow with strictfp + constrained intrinsic
781 return tryReplaceLibcallWithSimpleIntrinsic(
782 B, CI, IntrID: Intrinsic::sqrt, AllowMinSizeF32: true, AllowF64: true, /*AllowStrictFP=*/false);
783 case AMDGPULibFunc::EI_COS:
784 case AMDGPULibFunc::EI_SIN:
785 return fold_sincos(FPOp, B, FInfo);
786 default:
787 break;
788 }
789 } else {
790 // Specialized optimizations for each function call
791 switch (FInfo.getId()) {
792 case AMDGPULibFunc::EI_READ_PIPE_2:
793 case AMDGPULibFunc::EI_READ_PIPE_4:
794 case AMDGPULibFunc::EI_WRITE_PIPE_2:
795 case AMDGPULibFunc::EI_WRITE_PIPE_4:
796 return fold_read_write_pipe(CI, B, FInfo);
797 default:
798 break;
799 }
800 }
801
802 return false;
803}
804
805static Constant *getConstantFloatVector(const ArrayRef<APFloat> Values,
806 const Type *Ty) {
807 Type *ElemTy = Ty->getScalarType();
808 const fltSemantics &FltSem = ElemTy->getFltSemantics();
809
810 SmallVector<Constant *, 4> ConstValues;
811 ConstValues.reserve(N: Values.size());
812 for (APFloat APF : Values) {
813 bool Unused;
814 APF.convert(ToSemantics: FltSem, RM: APFloat::rmNearestTiesToEven, losesInfo: &Unused);
815 ConstValues.push_back(Elt: ConstantFP::get(Ty: ElemTy, V: APF));
816 }
817 return ConstantVector::get(V: ConstValues);
818}
819
820bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
821 // Table-Driven optimization
822 const TableRef tr = getOptTable(id: FInfo.getId());
823 if (tr.empty())
824 return false;
825
826 int const sz = (int)tr.size();
827 Value *opr0 = CI->getArgOperand(i: 0);
828
829 int vecSize = getVecSize(FInfo);
830 if (vecSize > 1) {
831 // Vector version
832 Constant *CV = dyn_cast<Constant>(Val: opr0);
833 if (CV && CV->getType()->isVectorTy()) {
834 SmallVector<APFloat, 4> Values;
835 Values.reserve(N: vecSize);
836 for (int eltNo = 0; eltNo < vecSize; ++eltNo) {
837 ConstantFP *eltval =
838 cast<ConstantFP>(Val: CV->getAggregateElement(Elt: (unsigned)eltNo));
839 auto MatchingRow = llvm::find_if(Range: tr, P: [eltval](const TableEntry &entry) {
840 return eltval->isExactlyValue(V: entry.input);
841 });
842 if (MatchingRow == tr.end())
843 return false;
844 Values.push_back(Elt: APFloat(MatchingRow->result));
845 }
846 Constant *NewValues = getConstantFloatVector(Values, Ty: CI->getType());
847 LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *NewValues << "\n");
848 replaceCall(I: CI, With: NewValues);
849 return true;
850 }
851 } else {
852 // Scalar version
853 if (ConstantFP *CF = dyn_cast<ConstantFP>(Val: opr0)) {
854 for (int i = 0; i < sz; ++i) {
855 if (CF->isExactlyValue(V: tr[i].input)) {
856 Value *nval = ConstantFP::get(Ty: CF->getType(), V: tr[i].result);
857 LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
858 replaceCall(I: CI, With: nval);
859 return true;
860 }
861 }
862 }
863 }
864
865 return false;
866}
867
namespace llvm {
/// Base-2 logarithm of \p V. std::log2 is part of <cmath> since C++11, so no
/// feature-test fallback is needed.
static double log2(double V) { return std::log2(V); }
} // namespace llvm
877
878bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
879 const FuncInfo &FInfo) {
880 assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
881 FInfo.getId() == AMDGPULibFunc::EI_POW_FAST ||
882 FInfo.getId() == AMDGPULibFunc::EI_POWR ||
883 FInfo.getId() == AMDGPULibFunc::EI_POWR_FAST ||
884 FInfo.getId() == AMDGPULibFunc::EI_POWN ||
885 FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) &&
886 "fold_pow: encounter a wrong function call");
887
888 Module *M = B.GetInsertBlock()->getModule();
889 Type *eltType = FPOp->getType()->getScalarType();
890 Value *opr0 = FPOp->getOperand(i: 0);
891 Value *opr1 = FPOp->getOperand(i: 1);
892
893 const APFloat *CF = nullptr;
894 const APInt *CINT = nullptr;
895 if (!match(V: opr1, P: m_APFloatAllowPoison(Res&: CF)))
896 match(V: opr1, P: m_APIntAllowPoison(Res&: CINT));
897
898 // 0x1111111 means that we don't do anything for this call.
899 int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);
900
901 if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
902 // pow/powr/pown(x, 0) == 1
903 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
904 Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0);
905 if (getVecSize(FInfo) > 1) {
906 cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
907 }
908 replaceCall(I: FPOp, With: cnval);
909 return true;
910 }
911 if ((CF && CF->isOne()) || (CINT && ci_opr1 == 1)) {
912 // pow/powr/pown(x, 1.0) = x
913 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
914 replaceCall(I: FPOp, With: opr0);
915 return true;
916 }
917 if ((CF && CF->isExactlyValue(V: 2.0)) || (CINT && ci_opr1 == 2)) {
918 // pow/powr/pown(x, 2.0) = x*x
919 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
920 << *opr0 << "\n");
921 Value *nval = B.CreateFMul(L: opr0, R: opr0, Name: "__pow2");
922 replaceCall(I: FPOp, With: nval);
923 return true;
924 }
925 if ((CF && CF->isMinusOne()) || (CINT && ci_opr1 == -1)) {
926 // pow/powr/pown(x, -1.0) = 1.0/x
927 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n");
928 Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0);
929 if (getVecSize(FInfo) > 1) {
930 cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
931 }
932 Value *nval = B.CreateFDiv(L: cnval, R: opr0, Name: "__powrecip");
933 replaceCall(I: FPOp, With: nval);
934 return true;
935 }
936
937 if (CF && (CF->isExactlyValue(V: 0.5) || CF->isExactlyValue(V: -0.5))) {
938 // pow[r](x, [-]0.5) = sqrt(x)
939 bool issqrt = CF->isExactlyValue(V: 0.5);
940 if (FunctionCallee FPExpr =
941 getFunction(M, fInfo: AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
942 : AMDGPULibFunc::EI_RSQRT,
943 FInfo))) {
944 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
945 << '(' << *opr0 << ")\n");
946 Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: issqrt ? "__pow2sqrt"
947 : "__pow2rsqrt");
948 replaceCall(I: FPOp, With: nval);
949 return true;
950 }
951 }
952
953 if (!isUnsafeFiniteOnlyMath(FPOp))
954 return false;
955
956 // Unsafe Math optimization
957
958 // Remember that ci_opr1 is set if opr1 is integral
959 if (CF) {
960 double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
961 ? (double)CF->convertToFloat()
962 : CF->convertToDouble();
963 int ival = (int)dval;
964 if ((double)ival == dval) {
965 ci_opr1 = ival;
966 } else
967 ci_opr1 = 0x11111111;
968 }
969
970 // pow/powr/pown(x, c) = [1/](x*x*..x); where
971 // trunc(c) == c && the number of x == c && |c| <= 12
972 unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
973 if (abs_opr1 <= 12) {
974 Constant *cnval;
975 Value *nval;
976 if (abs_opr1 == 0) {
977 cnval = ConstantFP::get(Ty: eltType, V: 1.0);
978 if (getVecSize(FInfo) > 1) {
979 cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
980 }
981 nval = cnval;
982 } else {
983 Value *valx2 = nullptr;
984 nval = nullptr;
985 while (abs_opr1 > 0) {
986 valx2 = valx2 ? B.CreateFMul(L: valx2, R: valx2, Name: "__powx2") : opr0;
987 if (abs_opr1 & 1) {
988 nval = nval ? B.CreateFMul(L: nval, R: valx2, Name: "__powprod") : valx2;
989 }
990 abs_opr1 >>= 1;
991 }
992 }
993
994 if (ci_opr1 < 0) {
995 cnval = ConstantFP::get(Ty: eltType, V: 1.0);
996 if (getVecSize(FInfo) > 1) {
997 cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
998 }
999 nval = B.CreateFDiv(L: cnval, R: nval, Name: "__1powprod");
1000 }
1001 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
1002 << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
1003 << ")\n");
1004 replaceCall(I: FPOp, With: nval);
1005 return true;
1006 }
1007
1008 // If we should use the generic intrinsic instead of emitting a libcall
1009 const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();
1010
1011 // powr ---> exp2(y * log2(x))
1012 // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
1013 FunctionCallee ExpExpr;
1014 if (ShouldUseIntrinsic)
1015 ExpExpr = Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::exp2,
1016 OverloadTys: {FPOp->getType()});
1017 else {
1018 ExpExpr = getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
1019 if (!ExpExpr)
1020 return false;
1021 }
1022
1023 bool needlog = false;
1024 bool needabs = false;
1025 bool needcopysign = false;
1026 Constant *cnval = nullptr;
1027 if (getVecSize(FInfo) == 1) {
1028 CF = nullptr;
1029 match(V: opr0, P: m_APFloatAllowPoison(Res&: CF));
1030
1031 if (CF) {
1032 double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
1033 ? (double)CF->convertToFloat()
1034 : CF->convertToDouble();
1035
1036 V = log2(V: std::abs(x: V));
1037 cnval = ConstantFP::get(Ty: eltType, V);
1038 needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR &&
1039 FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST) &&
1040 CF->isNegative();
1041 } else {
1042 needlog = true;
1043 needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
1044 FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
1045 }
1046 } else {
1047 ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(Val: opr0);
1048
1049 if (!CDV) {
1050 needlog = true;
1051 needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
1052 FInfo.getId() != AMDGPULibFunc::EI_POWR_FAST;
1053 } else {
1054 assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
1055 "Wrong vector size detected");
1056
1057 SmallVector<double, 0> DVal;
1058 for (int i=0; i < getVecSize(FInfo); ++i) {
1059 double V = CDV->getElementAsAPFloat(i).convertToDouble();
1060 if (V < 0.0) needcopysign = true;
1061 V = log2(V: std::abs(x: V));
1062 DVal.push_back(Elt: V);
1063 }
1064 if (getArgType(FInfo) == AMDGPULibFunc::F32) {
1065 SmallVector<float, 0> FVal;
1066 for (double D : DVal)
1067 FVal.push_back(Elt: (float)D);
1068 ArrayRef<float> tmp(FVal);
1069 cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
1070 } else {
1071 ArrayRef<double> tmp(DVal);
1072 cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
1073 }
1074 }
1075 }
1076
1077 if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW ||
1078 FInfo.getId() == AMDGPULibFunc::EI_POW_FAST)) {
1079 // We cannot handle corner cases for a general pow() function, give up
1080 // unless y is a constant integral value. Then proceed as if it were pown.
1081 if (!isKnownIntegral(V: opr1, SQ: SQ.getWithInstruction(I: cast<Instruction>(Val: FPOp)),
1082 FMF: FPOp->getFastMathFlags()))
1083 return false;
1084 }
1085
1086 Value *nval;
1087 if (needabs) {
1088 nval = B.CreateFAbs(V: opr0, FMFSource: nullptr, Name: "__fabs");
1089 } else {
1090 nval = cnval ? cnval : opr0;
1091 }
1092 if (needlog) {
1093 FunctionCallee LogExpr;
1094 if (ShouldUseIntrinsic) {
1095 LogExpr = Intrinsic::getOrInsertDeclaration(M, id: Intrinsic::log2,
1096 OverloadTys: {FPOp->getType()});
1097 } else {
1098 LogExpr = getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
1099 if (!LogExpr)
1100 return false;
1101 }
1102
1103 nval = CreateCallEx(B,Callee: LogExpr, Arg: nval, Name: "__log2");
1104 }
1105
1106 if (FInfo.getId() == AMDGPULibFunc::EI_POWN ||
1107 FInfo.getId() == AMDGPULibFunc::EI_POWN_FAST) {
1108 // convert int(32) to fp(f32 or f64)
1109 opr1 = B.CreateSIToFP(V: opr1, DestTy: nval->getType(), Name: "pownI2F");
1110 }
1111 nval = B.CreateFMul(L: opr1, R: nval, Name: "__ylogx");
1112
1113 CallInst *Exp2Call = CreateCallEx(B, Callee: ExpExpr, Arg: nval, Name: "__exp2");
1114
1115 // TODO: Generalized fpclass logic for pow
1116 FPClassTest KnownNot = FPClassTest::fcNegative;
1117 if (FPOp->hasNoNaNs())
1118 KnownNot |= FPClassTest::fcNan;
1119
1120 Exp2Call->addRetAttr(
1121 Attr: Attribute::getWithNoFPClass(Context&: Exp2Call->getContext(), Mask: KnownNot));
1122 nval = Exp2Call;
1123
1124 if (needcopysign) {
1125 Type* nTyS = B.getIntNTy(N: eltType->getPrimitiveSizeInBits());
1126 Type *nTy = FPOp->getType()->getWithNewType(EltTy: nTyS);
1127 Value *opr_n = FPOp->getOperand(i: 1);
1128 if (opr_n->getType()->getScalarType()->isIntegerTy())
1129 opr_n = B.CreateZExtOrTrunc(V: opr_n, DestTy: nTy, Name: "__ytou");
1130 else
1131 opr_n = B.CreateFPToSI(V: opr1, DestTy: nTy, Name: "__ytou");
1132
1133 unsigned size = nTy->getScalarSizeInBits();
1134 Value *sign = B.CreateShl(LHS: opr_n, RHS: size-1, Name: "__yeven");
1135 sign = B.CreateAnd(LHS: B.CreateBitCast(V: opr0, DestTy: nTy), RHS: sign, Name: "__pow_sign");
1136
1137 nval = B.CreateCopySign(LHS: nval, RHS: B.CreateBitCast(V: sign, DestTy: nval->getType()),
1138 FMFSource: nullptr, Name: "__pow_sign");
1139 }
1140
1141 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
1142 << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
1143 replaceCall(I: FPOp, With: nval);
1144
1145 return true;
1146}
1147
1148bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
1149 const FuncInfo &FInfo) {
1150 Value *opr0 = FPOp->getOperand(i: 0);
1151 Value *opr1 = FPOp->getOperand(i: 1);
1152
1153 const APInt *CINT = nullptr;
1154 if (!match(V: opr1, P: m_APIntAllowPoison(Res&: CINT)))
1155 return false;
1156
1157 Function *Parent = B.GetInsertBlock()->getParent();
1158
1159 int ci_opr1 = (int)CINT->getSExtValue();
1160 if (ci_opr1 == 1 && !Parent->hasFnAttribute(Kind: Attribute::StrictFP)) {
1161 // rootn(x, 1) = x
1162 //
1163 // TODO: Insert constrained canonicalize for strictfp case.
1164 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << '\n');
1165 replaceCall(I: FPOp, With: opr0);
1166 return true;
1167 }
1168
1169 Module *M = B.GetInsertBlock()->getModule();
1170
1171 CallInst *CI = cast<CallInst>(Val: FPOp);
1172
1173 // rootn and sqrt disagree on signed-zero / -Inf inputs (e.g. rootn(-0.0, 2)
1174 // is +0.0, sqrt(-0.0) is -0.0), so require nsz/ninf.
1175 bool FMFOkForSqrt = FPOp->hasNoSignedZeros() && FPOp->hasNoInfs();
1176
1177 if (ci_opr1 == 2 && FMFOkForSqrt &&
1178 shouldReplaceLibcallWithIntrinsic(CI,
1179 /*AllowMinSizeF32=*/true,
1180 /*AllowF64=*/true)) {
1181 // rootn(x, 2) = sqrt(x)
1182 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0 << ")\n");
1183
1184 Value *NewCall = B.CreateUnaryIntrinsic(ID: Intrinsic::sqrt, Op: opr0, FMFSource: CI);
1185 NewCall->takeName(V: CI);
1186
1187 // OpenCL rootn has a looser ulp of 2 requirement than sqrt, so add some
1188 // metadata.
1189 MDBuilder MDHelper(M->getContext());
1190 MDNode *FPMD = MDHelper.createFPMath(Accuracy: std::max(a: FPOp->getFPAccuracy(), b: 2.0f));
1191 if (auto *NewCallI = dyn_cast<Instruction>(Val: NewCall))
1192 NewCallI->setMetadata(KindID: LLVMContext::MD_fpmath, Node: FPMD);
1193
1194 replaceCall(I: CI, With: NewCall);
1195 return true;
1196 }
1197
1198 if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
1199 if (FunctionCallee FPExpr =
1200 getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
1201 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0
1202 << ")\n");
1203 Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: "__rootn2cbrt");
1204 replaceCall(I: FPOp, With: nval);
1205 return true;
1206 }
1207 } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
1208 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n");
1209 Value *nval = B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: 1.0),
1210 R: opr0,
1211 Name: "__rootn2div");
1212 replaceCall(I: FPOp, With: nval);
1213 return true;
1214 }
1215
1216 if (ci_opr1 == -2 && FMFOkForSqrt &&
1217 shouldReplaceLibcallWithIntrinsic(CI,
1218 /*AllowMinSizeF32=*/true,
1219 /*AllowF64=*/true)) {
1220 // rootn(x, -2) = rsqrt(x)
1221
1222 // The original rootn had looser ulp requirements than the resultant sqrt
1223 // and fdiv.
1224 MDBuilder MDHelper(M->getContext());
1225 MDNode *FPMD = MDHelper.createFPMath(Accuracy: std::max(a: FPOp->getFPAccuracy(), b: 2.0f));
1226
1227 // TODO: Could handle strictfp but need to fix strict sqrt emission
1228 FastMathFlags FMF = FPOp->getFastMathFlags();
1229 FMF.setAllowContract(true);
1230
1231 Value *Sqrt = B.CreateUnaryIntrinsic(ID: Intrinsic::sqrt, Op: opr0, FMFSource: CI);
1232 Instruction *RSqrt = cast<Instruction>(
1233 Val: B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: 1.0), R: Sqrt));
1234 if (auto *SqrtI = dyn_cast<Instruction>(Val: Sqrt))
1235 SqrtI->setFastMathFlags(FMF);
1236 RSqrt->setFastMathFlags(FMF);
1237 RSqrt->setMetadata(KindID: LLVMContext::MD_fpmath, Node: FPMD);
1238
1239 LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
1240 << ")\n");
1241 replaceCall(I: CI, With: RSqrt);
1242 return true;
1243 }
1244
1245 return false;
1246}
1247
1248// is_integer(y) => trunc(y) == y
1249static Value *emitIsInteger(IRBuilder<> &B, Value *Y) {
1250 Value *TruncY = B.CreateUnaryIntrinsic(ID: Intrinsic::trunc, Op: Y);
1251 return B.CreateFCmpOEQ(LHS: TruncY, RHS: Y);
1252}
1253
1254static Value *emitIsEvenInteger(IRBuilder<> &B, Value *Y) {
1255 // Even integers are still integers after division by 2.
1256 auto *HalfY = B.CreateFMul(L: Y, R: ConstantFP::get(Ty: Y->getType(), V: 0.5));
1257 return emitIsInteger(B, Y: HalfY);
1258}
1259
1260// is_odd_integer(y) => is_integer(y) && !is_even_integer(y)
1261static Value *emitIsOddInteger(IRBuilder<> &B, Value *Y) {
1262 Value *IsIntY = emitIsInteger(B, Y);
1263 Value *IsEvenY = emitIsEvenInteger(B, Y);
1264 Value *NotEvenY = B.CreateNot(V: IsEvenY);
1265 return B.CreateAnd(LHS: IsIntY, RHS: NotEvenY);
1266}
1267
1268// isinf(val) => fabs(val) == +inf
1269static Value *emitIsInf(IRBuilder<> &B, Value *val) {
1270 auto *fabsVal = B.CreateFAbs(V: val);
1271 return B.CreateFCmpOEQ(LHS: fabsVal, RHS: ConstantFP::getInfinity(Ty: val->getType()));
1272}
1273
1274// y * log2(fabs(x))
1275static Value *emitFastExpYLnx(IRBuilder<> &B, Value *X, Value *Y) {
1276 Value *AbsX = B.CreateFAbs(V: X);
1277 Value *LogAbsX = B.CreateUnaryIntrinsic(ID: Intrinsic::log2, Op: AbsX);
1278 Value *YTimesLogX = B.CreateFMul(L: Y, R: LogAbsX);
1279 return B.CreateUnaryIntrinsic(ID: Intrinsic::exp2, Op: YTimesLogX);
1280}
1281
1282/// Emit special case management epilog code for fast pow, powr, pown, and rootn
1283/// expansions. \p x and \p y should be the arguments to the library call
1284/// (possibly with some values clamped). \p expylnx should be the result to use
1285/// in normal circumstances.
1286static Value *emitPowFixup(IRBuilder<> &B, Value *X, Value *Y, Value *ExpYLnX,
1287 PowKind Kind) {
1288 Constant *Zero = ConstantFP::getZero(Ty: X->getType());
1289 Constant *One = ConstantFP::get(Ty: X->getType(), V: 1.0);
1290 Constant *QNaN = ConstantFP::getQNaN(Ty: X->getType());
1291 Constant *PInf = ConstantFP::getInfinity(Ty: X->getType());
1292
1293 switch (Kind) {
1294 case PowKind::Pow: {
1295 // is_odd_integer(y)
1296 Value *IsOddY = emitIsOddInteger(B, Y);
1297
1298 // ret = copysign(expylnx, is_odd_y ? x : 1.0f)
1299 Value *SelSign = B.CreateSelect(C: IsOddY, True: X, False: One);
1300 Value *Ret = B.CreateCopySign(LHS: ExpYLnX, RHS: SelSign);
1301
1302 // if (x < 0 && !is_integer(y)) ret = QNAN
1303 Value *IsIntY = emitIsInteger(B, Y);
1304 Value *condNegX = B.CreateFCmpOLT(LHS: X, RHS: Zero);
1305 Value *condNotIntY = B.CreateNot(V: IsIntY);
1306 Value *condNaN = B.CreateAnd(LHS: condNegX, RHS: condNotIntY);
1307 Ret = B.CreateSelect(C: condNaN, True: QNaN, False: Ret);
1308
1309 // if (isinf(ay)) { ... }
1310
1311 // FIXME: Missing backend optimization to save on materialization cost of
1312 // mixed sign constant infinities.
1313 Value *YIsInf = emitIsInf(B, val: Y);
1314
1315 Value *AY = B.CreateFAbs(V: Y);
1316 Value *YIsNegInf = B.CreateFCmpUNE(LHS: Y, RHS: AY);
1317
1318 Value *AX = B.CreateFAbs(V: X);
1319 Value *AxEqOne = B.CreateFCmpOEQ(LHS: AX, RHS: One);
1320 Value *AxLtOne = B.CreateFCmpOLT(LHS: AX, RHS: One);
1321 Value *XorCond = B.CreateXor(LHS: AxLtOne, RHS: YIsNegInf);
1322 Value *SelInf =
1323 B.CreateSelect(C: AxEqOne, True: AX, False: B.CreateSelect(C: XorCond, True: Zero, False: AY));
1324 Ret = B.CreateSelect(C: YIsInf, True: SelInf, False: Ret);
1325
1326 // if (isinf(ax) || x == 0.0f) { ... }
1327 Value *XIsInf = emitIsInf(B, val: X);
1328 Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
1329 Value *AxInfOrZero = B.CreateOr(LHS: XIsInf, RHS: XEqZero);
1330 Value *YLtZero = B.CreateFCmpOLT(LHS: Y, RHS: Zero);
1331 Value *XorZeroInf = B.CreateXor(LHS: XEqZero, RHS: YLtZero);
1332 Value *SelVal = B.CreateSelect(C: XorZeroInf, True: Zero, False: PInf);
1333 Value *SelSign2 = B.CreateSelect(C: IsOddY, True: X, False: Zero);
1334 Value *Copysign = B.CreateCopySign(LHS: SelVal, RHS: SelSign2);
1335 Ret = B.CreateSelect(C: AxInfOrZero, True: Copysign, False: Ret);
1336
1337 // if (isunordered(x, y)) ret = QNAN
1338 Value *isUnordered = B.CreateFCmpUNO(LHS: X, RHS: Y);
1339 return B.CreateSelect(C: isUnordered, True: QNaN, False: Ret);
1340 }
1341 case PowKind::PowR: {
1342 Value *YIsNeg = B.CreateFCmpOLT(LHS: Y, RHS: Zero);
1343 Value *IZ = B.CreateSelect(C: YIsNeg, True: PInf, False: Zero);
1344 Value *ZI = B.CreateSelect(C: YIsNeg, True: Zero, False: PInf);
1345
1346 Value *YEqZero = B.CreateFCmpOEQ(LHS: Y, RHS: Zero);
1347 Value *SelZeroCase = B.CreateSelect(C: YEqZero, True: QNaN, False: IZ);
1348 Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
1349 Value *Ret = B.CreateSelect(C: XEqZero, True: SelZeroCase, False: ExpYLnX);
1350
1351 Value *XEqInf = B.CreateFCmpOEQ(LHS: X, RHS: PInf);
1352 Value *YNeZero = B.CreateFCmpUNE(LHS: Y, RHS: Zero);
1353 Value *CondInfCase = B.CreateAnd(LHS: XEqInf, RHS: YNeZero);
1354 Ret = B.CreateSelect(C: CondInfCase, True: ZI, False: Ret);
1355
1356 Value *IsInfY = emitIsInf(B, val: Y);
1357 Value *XNeOne = B.CreateFCmpUNE(LHS: X, RHS: One);
1358 Value *CondInfY = B.CreateAnd(LHS: IsInfY, RHS: XNeOne);
1359 Value *XLtOne = B.CreateFCmpOLT(LHS: X, RHS: One);
1360 Value *SelInfYCase = B.CreateSelect(C: XLtOne, True: IZ, False: ZI);
1361 Ret = B.CreateSelect(C: CondInfY, True: SelInfYCase, False: Ret);
1362
1363 Value *IsUnordered = B.CreateFCmpUNO(LHS: X, RHS: Y);
1364 return B.CreateSelect(C: IsUnordered, True: QNaN, False: Ret);
1365 }
1366 case PowKind::PowN: {
1367 Constant *ZeroI = ConstantInt::get(Ty: Y->getType(), V: 0);
1368
1369 // is_odd_y = (ny & 1) != 0
1370 Value *OneI = ConstantInt::get(Ty: Y->getType(), V: 1);
1371 Value *YAnd1 = B.CreateAnd(LHS: Y, RHS: OneI);
1372 Value *IsOddY = B.CreateICmpNE(LHS: YAnd1, RHS: ZeroI);
1373
1374 // ret = copysign(expylnx, is_odd_y ? x : 1.0f)
1375 Value *SelSign = B.CreateSelect(C: IsOddY, True: X, False: One);
1376 Value *Ret = B.CreateCopySign(LHS: ExpYLnX, RHS: SelSign);
1377
1378 // if (isinf(x) || x == 0.0f)
1379 Value *FabsX = B.CreateFAbs(V: X);
1380 Value *XIsInf = B.CreateFCmpOEQ(LHS: FabsX, RHS: PInf);
1381 Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
1382 Value *InfOrZero = B.CreateOr(LHS: XIsInf, RHS: XEqZero);
1383
1384 // (x == 0.0f) ^ (ny < 0) ? 0.0f : +inf
1385 Value *YLtZero = B.CreateICmpSLT(LHS: Y, RHS: ZeroI);
1386 Value *XorZeroInf = B.CreateXor(LHS: XEqZero, RHS: YLtZero);
1387 Value *SelVal = B.CreateSelect(C: XorZeroInf, True: Zero, False: PInf);
1388
1389 // copysign(selVal, is_odd_y ? x : 0.0f)
1390 Value *SelSign2 = B.CreateSelect(C: IsOddY, True: X, False: Zero);
1391 Value *Copysign = B.CreateCopySign(LHS: SelVal, RHS: SelSign2);
1392
1393 return B.CreateSelect(C: InfOrZero, True: Copysign, False: Ret);
1394 }
1395 case PowKind::RootN: {
1396 Constant *ZeroI = ConstantInt::get(Ty: Y->getType(), V: 0);
1397
1398 // is_odd_y = (ny & 1) != 0
1399 Value *YAnd1 = B.CreateAnd(LHS: Y, RHS: ConstantInt::get(Ty: Y->getType(), V: 1));
1400 Value *IsOddY = B.CreateICmpNE(LHS: YAnd1, RHS: ZeroI);
1401
1402 // ret = copysign(expylnx, is_odd_y ? x : 1.0f)
1403 Value *SelSign = B.CreateSelect(C: IsOddY, True: X, False: One);
1404 Value *Ret = B.CreateCopySign(LHS: ExpYLnX, RHS: SelSign);
1405
1406 // if (isinf(x) || x == 0.0f)
1407 Value *FabsX = B.CreateFAbs(V: X);
1408 Value *IsInfX = B.CreateFCmpOEQ(LHS: FabsX, RHS: PInf);
1409 Value *XEqZero = B.CreateFCmpOEQ(LHS: X, RHS: Zero);
1410 Value *CondInfOrZero = B.CreateOr(LHS: IsInfX, RHS: XEqZero);
1411
1412 // (x == 0.0f) ^ (ny < 0) ? 0.0f : +inf
1413 Value *YLtZero = B.CreateICmpSLT(LHS: Y, RHS: ZeroI);
1414 Value *XorZeroInf = B.CreateXor(LHS: XEqZero, RHS: YLtZero);
1415 Value *SelVal = B.CreateSelect(C: XorZeroInf, True: Zero, False: PInf);
1416
1417 // copysign(selVal, is_odd_y ? x : 0.0f)
1418 Value *SelSign2 = B.CreateSelect(C: IsOddY, True: X, False: Zero);
1419 Value *Copysign = B.CreateCopySign(LHS: SelVal, RHS: SelSign2);
1420
1421 Ret = B.CreateSelect(C: CondInfOrZero, True: Copysign, False: Ret);
1422
1423 // if ((x < 0.0f && !is_odd_y) || ny == 0) ret = QNAN
1424 Value *XIsNeg = B.CreateFCmpOLT(LHS: X, RHS: Zero);
1425 Value *NotOddY = B.CreateNot(V: IsOddY);
1426 Value *CondNegAndNotOdd = B.CreateAnd(LHS: XIsNeg, RHS: NotOddY);
1427 Value *YEqZero = B.CreateICmpEQ(LHS: Y, RHS: ZeroI);
1428 Value *CondBad = B.CreateOr(LHS: CondNegAndNotOdd, RHS: YEqZero);
1429 return B.CreateSelect(C: CondBad, True: QNaN, False: Ret);
1430 }
1431 }
1432
1433 llvm_unreachable("covered switch");
1434}
1435
1436// TODO: Move the fold_pow folding to sqrt/fdiv here
1437bool AMDGPULibCalls::expandFastPow(FPMathOperator *FPOp, IRBuilder<> &B,
1438 PowKind Kind) {
1439 Type *Ty = FPOp->getType();
1440
1441 // There's currently no reason to do this for half. The correct path is
1442 // promote to float and use the fast float expansion.
1443 //
1444 // TODO: We could move this expansion to lowering to get half pow to work.
1445 if (!Ty->getScalarType()->isFloatTy())
1446 return false;
1447
1448 // TODO: Verify optimization for double and bfloat.
1449 Value *X = FPOp->getOperand(i: 0);
1450 Value *Y = FPOp->getOperand(i: 1);
1451
1452 switch (Kind) {
1453 case PowKind::Pow: {
1454 Constant *One = ConstantFP::get(Ty: X->getType(), V: 1.0);
1455
1456 // if (x == 1.0f) y = 1.0f;
1457 Value *XEqOne = B.CreateFCmpOEQ(LHS: X, RHS: One);
1458 Y = B.CreateSelect(C: XEqOne, True: One, False: Y);
1459
1460 // if (y == 0.0f) x = 1.0f;
1461 Value *YEqZero = B.CreateFCmpOEQ(LHS: Y, RHS: ConstantFP::getZero(Ty: X->getType()));
1462 X = B.CreateSelect(C: YEqZero, True: One, False: X);
1463
1464 Value *ExpYLnX = emitFastExpYLnx(B, X, Y);
1465 Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
1466 replaceCall(I: FPOp, With: Fixed);
1467 return true;
1468 }
1469 case PowKind::PowR: {
1470 Value *NegX = B.CreateFCmpOLT(LHS: X, RHS: ConstantFP::getZero(Ty: X->getType()));
1471 X = B.CreateSelect(C: NegX, True: ConstantFP::getQNaN(Ty: X->getType()), False: X);
1472
1473 Value *ExpYLnX = emitFastExpYLnx(B, X, Y);
1474 Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
1475 replaceCall(I: FPOp, With: Fixed);
1476 return true;
1477 }
1478 case PowKind::PowN: {
1479 // ny == 0
1480 Value *YEqZero = B.CreateICmpEQ(LHS: Y, RHS: ConstantInt::get(Ty: Y->getType(), V: 0));
1481
1482 // x = (ny == 0 ? 1.0f : x)
1483 X = B.CreateSelect(C: YEqZero, True: ConstantFP::get(Ty: X->getType(), V: 1.0), False: X);
1484
1485 Value *CastY = B.CreateSIToFP(V: Y, DestTy: X->getType());
1486 Value *ExpYLnX = emitFastExpYLnx(B, X, Y: CastY);
1487 Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
1488 replaceCall(I: FPOp, With: Fixed);
1489 return true;
1490 }
1491 case PowKind::RootN: {
1492 Value *CastY = B.CreateSIToFP(V: Y, DestTy: X->getType());
1493
1494 // This is afn anyway, so we will turn into rcp.
1495 Value *RcpY = B.CreateFDiv(L: ConstantFP::get(Ty: X->getType(), V: 1.0), R: CastY);
1496
1497 Value *ExpYLnX = emitFastExpYLnx(B, X, Y: RcpY);
1498 Value *Fixed = emitPowFixup(B, X, Y, ExpYLnX, Kind);
1499 replaceCall(I: FPOp, With: Fixed);
1500 return true;
1501 }
1502 }
1503 llvm_unreachable("Unhandled PowKind enum");
1504}
1505
1506bool AMDGPULibCalls::tryOptimizePow(FPMathOperator *FPOp, IRBuilder<> &B,
1507 const FuncInfo &FInfo) {
1508 FastMathFlags FMF = FPOp->getFastMathFlags();
1509 CallInst *Call = cast<CallInst>(Val: FPOp);
1510 Module *M = Call->getModule();
1511
1512 FuncInfo PowrInfo;
1513 AMDGPULibFunc::EFuncId FastPowrFuncId =
1514 FMF.approxFunc() || FInfo.getId() == AMDGPULibFunc::EI_POW_FAST
1515 ? AMDGPULibFunc::EI_POWR_FAST
1516 : AMDGPULibFunc::EI_NONE;
1517 FunctionCallee PowrFunc = getFloatFastVariant(
1518 M, fInfo: FInfo, newInfo&: PowrInfo, NewFunc: AMDGPULibFunc::EI_POWR, FastVariant: FastPowrFuncId);
1519
1520 // TODO: Prefer fast pown to fast powr, but slow powr to slow pown.
1521
1522 // pow(x, y) -> powr(x, y) for x >= -0.0
1523 // TODO: Account for flags on current call
1524 if (PowrFunc && cannotBeOrderedLessThanZero(V: FPOp->getOperand(i: 0),
1525 SQ: SQ.getWithInstruction(I: Call))) {
1526 Call->setCalledFunction(PowrFunc);
1527 return fold_pow(FPOp, B, FInfo: PowrInfo) || true;
1528 }
1529
1530 // pow(x, y) -> pown(x, y) for known integral y
1531 if (isKnownIntegral(V: FPOp->getOperand(i: 1), SQ: SQ.getWithInstruction(I: Call),
1532 FMF: FPOp->getFastMathFlags())) {
1533 FunctionType *PownType = getPownType(FT: Call->getFunctionType());
1534
1535 FuncInfo PownInfo;
1536 AMDGPULibFunc::EFuncId FastPownFuncId =
1537 FMF.approxFunc() || FInfo.getId() == AMDGPULibFunc::EI_POW_FAST
1538 ? AMDGPULibFunc::EI_POWN_FAST
1539 : AMDGPULibFunc::EI_NONE;
1540 FunctionCallee PownFunc = getFloatFastVariant(
1541 M, fInfo: FInfo, newInfo&: PownInfo, NewFunc: AMDGPULibFunc::EI_POWN, FastVariant: FastPownFuncId);
1542
1543 if (PownFunc) {
1544 // TODO: If the incoming integral value is an sitofp/uitofp, it won't
1545 // fold out without a known range. We can probably take the source
1546 // value directly.
1547 Value *CastedArg =
1548 B.CreateFPToSI(V: FPOp->getOperand(i: 1), DestTy: PownType->getParamType(i: 1));
1549 // Have to drop any nofpclass attributes on the original call site.
1550 Call->removeParamAttrs(
1551 ArgNo: 1, AttrsToRemove: AttributeFuncs::typeIncompatible(Ty: CastedArg->getType(),
1552 AS: Call->getParamAttributes(ArgNo: 1)));
1553 Call->setCalledFunction(PownFunc);
1554 Call->setArgOperand(i: 1, v: CastedArg);
1555 return fold_pow(FPOp, B, FInfo: PownInfo) || true;
1556 }
1557 }
1558
1559 if (fold_pow(FPOp, B, FInfo))
1560 return true;
1561
1562 if (!FMF.approxFunc())
1563 return false;
1564
1565 if (FInfo.getId() == AMDGPULibFunc::EI_POW && FMF.approxFunc() &&
1566 getArgType(FInfo) == AMDGPULibFunc::F32) {
1567 AMDGPULibFunc PowFastInfo(AMDGPULibFunc::EI_POW_FAST, FInfo);
1568 if (FunctionCallee PowFastFunc = getFunction(M, fInfo: PowFastInfo)) {
1569 Call->setCalledFunction(PowFastFunc);
1570 return fold_pow(FPOp, B, FInfo: PowFastInfo) || true;
1571 }
1572 }
1573
1574 return expandFastPow(FPOp, B, Kind: PowKind::Pow);
1575}
1576
1577// Get a scalar native builtin single argument FP function
1578FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
1579 const FuncInfo &FInfo) {
1580 if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(id: FInfo.getId()))
1581 return nullptr;
1582 FuncInfo nf = FInfo;
1583 nf.setPrefix(AMDGPULibFunc::NATIVE);
1584 return getFunction(M, fInfo: nf);
1585}
1586
1587// Some library calls are just wrappers around llvm intrinsics, but compiled
1588// conservatively. Preserve the flags from the original call site by
1589// substituting them with direct calls with all the flags.
1590bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
1591 bool AllowMinSizeF32,
1592 bool AllowF64,
1593 bool AllowStrictFP) {
1594 Type *FltTy = CI->getType()->getScalarType();
1595 const bool IsF32 = FltTy->isFloatTy();
1596
1597 // f64 intrinsics aren't implemented for most operations.
1598 if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy()))
1599 return false;
1600
1601 // We're implicitly inlining by replacing the libcall with the intrinsic, so
1602 // don't do it for noinline call sites.
1603 if (CI->isNoInline())
1604 return false;
1605
1606 const Function *ParentF = CI->getFunction();
1607 // TODO: Handle strictfp
1608 if (!AllowStrictFP && ParentF->hasFnAttribute(Kind: Attribute::StrictFP))
1609 return false;
1610
1611 if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
1612 return false;
1613 return true;
1614}
1615
1616void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
1617 CallInst *CI,
1618 Intrinsic::ID IntrID) {
1619 if (CI->arg_size() == 2) {
1620 Value *Arg0 = CI->getArgOperand(i: 0);
1621 Value *Arg1 = CI->getArgOperand(i: 1);
1622 VectorType *Arg0VecTy = dyn_cast<VectorType>(Val: Arg0->getType());
1623 VectorType *Arg1VecTy = dyn_cast<VectorType>(Val: Arg1->getType());
1624 if (Arg0VecTy && !Arg1VecTy) {
1625 Value *SplatRHS = B.CreateVectorSplat(EC: Arg0VecTy->getElementCount(), V: Arg1);
1626 CI->setArgOperand(i: 1, v: SplatRHS);
1627 } else if (!Arg0VecTy && Arg1VecTy) {
1628 Value *SplatLHS = B.CreateVectorSplat(EC: Arg1VecTy->getElementCount(), V: Arg0);
1629 CI->setArgOperand(i: 0, v: SplatLHS);
1630 }
1631 }
1632
1633 CI->setCalledFunction(Intrinsic::getOrInsertDeclaration(
1634 M: CI->getModule(), id: IntrID, OverloadTys: {CI->getType()}));
1635 CI->setCallingConv(CallingConv::C);
1636}
1637
1638bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
1639 IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32,
1640 bool AllowF64, bool AllowStrictFP) {
1641 if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64,
1642 AllowStrictFP))
1643 return false;
1644 replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
1645 return true;
1646}
1647
1648std::tuple<Value *, Value *, Value *>
1649AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
1650 FunctionCallee Fsincos) {
1651 DebugLoc DL = B.getCurrentDebugLocation();
1652 Function *F = B.GetInsertBlock()->getParent();
1653 B.SetInsertPointPastAllocas(F);
1654
1655 AllocaInst *Alloc = B.CreateAlloca(Ty: Arg->getType(), ArraySize: nullptr, Name: "__sincos_");
1656
1657 if (Instruction *ArgInst = dyn_cast<Instruction>(Val: Arg)) {
1658 // If the argument is an instruction, it must dominate all uses so put our
1659 // sincos call there. Otherwise, right after the allocas works well enough
1660 // if it's an argument or constant.
1661
1662 B.SetInsertPoint(TheBB: ArgInst->getParent(), IP: ++ArgInst->getIterator());
1663
1664 // SetInsertPoint unwelcomely always tries to set the debug loc.
1665 B.SetCurrentDebugLocation(DL);
1666 }
1667
1668 Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(i: 1);
1669
1670 // The allocaInst allocates the memory in private address space. This need
1671 // to be addrspacecasted to point to the address space of cos pointer type.
1672 // In OpenCL 2.0 this is generic, while in 1.2 that is private.
1673 Value *CastAlloc = B.CreateAddrSpaceCast(V: Alloc, DestTy: CosPtrTy);
1674
1675 CallInst *SinCos = CreateCallEx2(B, Callee: Fsincos, Arg1: Arg, Arg2: CastAlloc);
1676
1677 // TODO: Is it worth trying to preserve the location for the cos calls for the
1678 // load?
1679
1680 LoadInst *LoadCos = B.CreateLoad(Ty: Arg->getType(), Ptr: Alloc);
1681 return {SinCos, LoadCos, SinCos};
1682}
1683
1684// fold sin, cos -> sincos.
1685bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
1686 const FuncInfo &fInfo) {
1687 assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
1688 fInfo.getId() == AMDGPULibFunc::EI_COS);
1689
1690 if ((getArgType(FInfo: fInfo) != AMDGPULibFunc::F32 &&
1691 getArgType(FInfo: fInfo) != AMDGPULibFunc::F64) ||
1692 fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
1693 return false;
1694
1695 bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;
1696
1697 Value *CArgVal = FPOp->getOperand(i: 0);
1698
1699 // TODO: Constant fold the call
1700 if (isa<ConstantData>(Val: CArgVal))
1701 return false;
1702
1703 CallInst *CI = cast<CallInst>(Val: FPOp);
1704
1705 Function *F = B.GetInsertBlock()->getParent();
1706 Module *M = F->getParent();
1707
1708 // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
1709 // implementation. Prefer the private form if available.
1710 AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
1711 SinCosLibFuncPrivate.getLeads()[0].PtrKind =
1712 AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::PRIVATE_ADDRESS);
1713
1714 AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
1715 SinCosLibFuncGeneric.getLeads()[0].PtrKind =
1716 AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::FLAT_ADDRESS);
1717
1718 FunctionCallee FSinCosPrivate = getFunction(M, fInfo: SinCosLibFuncPrivate);
1719 FunctionCallee FSinCosGeneric = getFunction(M, fInfo: SinCosLibFuncGeneric);
1720 FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
1721 if (!FSinCos)
1722 return false;
1723
1724 SmallVector<CallInst *> SinCalls;
1725 SmallVector<CallInst *> CosCalls;
1726 SmallVector<CallInst *> SinCosCalls;
1727 FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
1728 fInfo);
1729 const std::string PairName = PartnerInfo.mangle();
1730
1731 StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
1732 StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
1733 const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
1734 const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();
1735
1736 // Intersect the two sets of flags.
1737 FastMathFlags FMF = FPOp->getFastMathFlags();
1738 MDNode *FPMath = CI->getMetadata(KindID: LLVMContext::MD_fpmath);
1739
1740 SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};
1741
1742 for (User* U : CArgVal->users()) {
1743 CallInst *XI = dyn_cast<CallInst>(Val: U);
1744 if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
1745 continue;
1746
1747 Function *UCallee = XI->getCalledFunction();
1748 if (!UCallee)
1749 continue;
1750
1751 bool Handled = true;
1752
1753 if (UCallee->getName() == SinName)
1754 SinCalls.push_back(Elt: XI);
1755 else if (UCallee->getName() == CosName)
1756 CosCalls.push_back(Elt: XI);
1757 else if (UCallee->getName() == SinCosPrivateName ||
1758 UCallee->getName() == SinCosGenericName)
1759 SinCosCalls.push_back(Elt: XI);
1760 else
1761 Handled = false;
1762
1763 if (Handled) {
1764 MergeDbgLocs.push_back(Elt: XI->getDebugLoc());
1765 auto *OtherOp = cast<FPMathOperator>(Val: XI);
1766 FMF &= OtherOp->getFastMathFlags();
1767 FPMath = MDNode::getMostGenericFPMath(
1768 A: FPMath, B: XI->getMetadata(KindID: LLVMContext::MD_fpmath));
1769 }
1770 }
1771
1772 if (SinCalls.empty() || CosCalls.empty())
1773 return false;
1774
1775 B.setFastMathFlags(FMF);
1776 B.setDefaultFPMathTag(FPMath);
1777 DILocation *DbgLoc = DILocation::getMergedLocations(Locs: MergeDbgLocs);
1778 B.SetCurrentDebugLocation(DbgLoc);
1779
1780 auto [Sin, Cos, SinCos] = insertSinCos(Arg: CArgVal, FMF, B, Fsincos: FSinCos);
1781
1782 auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
1783 for (CallInst *C : Calls)
1784 C->replaceAllUsesWith(V: Res);
1785
1786 // Leave the other dead instructions to avoid clobbering iterators.
1787 };
1788
1789 replaceTrigInsts(SinCalls, Sin);
1790 replaceTrigInsts(CosCalls, Cos);
1791 replaceTrigInsts(SinCosCalls, SinCos);
1792
1793 // It's safe to delete the original now.
1794 CI->eraseFromParent();
1795 return true;
1796}
1797
1798bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
1799 APFloat &Res0, APFloat &Res1,
1800 Constant *copr0, Constant *copr1) {
1801 // By default, opr0/opr1/opr3 holds values of float/double type.
1802 // If they are not float/double, each function has to its
1803 // operand separately.
1804 double opr0 = 0.0, opr1 = 0.0;
1805 ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(Val: copr0);
1806 ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(Val: copr1);
1807 if (fpopr0) {
1808 opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
1809 ? fpopr0->getValueAPF().convertToDouble()
1810 : (double)fpopr0->getValueAPF().convertToFloat();
1811 }
1812
1813 if (fpopr1) {
1814 opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
1815 ? fpopr1->getValueAPF().convertToDouble()
1816 : (double)fpopr1->getValueAPF().convertToFloat();
1817 }
1818
1819 switch (FInfo.getId()) {
1820 default:
1821 return false;
1822
1823 case AMDGPULibFunc::EI_ACOS:
1824 Res0 = APFloat{acos(x: opr0)};
1825 return true;
1826
1827 case AMDGPULibFunc::EI_ACOSH:
1828 // acosh(x) == log(x + sqrt(x*x - 1))
1829 Res0 = APFloat{log(x: opr0 + sqrt(x: opr0 * opr0 - 1.0))};
1830 return true;
1831
1832 case AMDGPULibFunc::EI_ACOSPI:
1833 Res0 = APFloat{acos(x: opr0) / MATH_PI};
1834 return true;
1835
1836 case AMDGPULibFunc::EI_ASIN:
1837 Res0 = APFloat{asin(x: opr0)};
1838 return true;
1839
1840 case AMDGPULibFunc::EI_ASINH:
1841 // asinh(x) == log(x + sqrt(x*x + 1))
1842 Res0 = APFloat{log(x: opr0 + sqrt(x: opr0 * opr0 + 1.0))};
1843 return true;
1844
1845 case AMDGPULibFunc::EI_ASINPI:
1846 Res0 = APFloat{asin(x: opr0) / MATH_PI};
1847 return true;
1848
1849 case AMDGPULibFunc::EI_ATAN:
1850 Res0 = APFloat{atan(x: opr0)};
1851 return true;
1852
1853 case AMDGPULibFunc::EI_ATANH:
1854 // atanh(x) == (log(x+1) - log(x-1))/2;
1855 Res0 = APFloat{(log(x: opr0 + 1.0) - log(x: opr0 - 1.0)) / 2.0};
1856 return true;
1857
1858 case AMDGPULibFunc::EI_ATANPI:
1859 Res0 = APFloat{atan(x: opr0) / MATH_PI};
1860 return true;
1861
1862 case AMDGPULibFunc::EI_CBRT:
1863 Res0 =
1864 APFloat{(opr0 < 0.0) ? -pow(x: -opr0, y: 1.0 / 3.0) : pow(x: opr0, y: 1.0 / 3.0)};
1865 return true;
1866
1867 case AMDGPULibFunc::EI_COS:
1868 Res0 = APFloat{cos(x: opr0)};
1869 return true;
1870
1871 case AMDGPULibFunc::EI_COSH:
1872 Res0 = APFloat{cosh(x: opr0)};
1873 return true;
1874
1875 case AMDGPULibFunc::EI_COSPI:
1876 Res0 = APFloat{cos(MATH_PI * opr0)};
1877 return true;
1878
1879 case AMDGPULibFunc::EI_EXP:
1880 Res0 = APFloat{std::exp(x: opr0)};
1881 return true;
1882
1883 case AMDGPULibFunc::EI_EXP2:
1884 Res0 = APFloat{pow(x: 2.0, y: opr0)};
1885 return true;
1886
1887 case AMDGPULibFunc::EI_EXP10:
1888 Res0 = APFloat{pow(x: 10.0, y: opr0)};
1889 return true;
1890
1891 case AMDGPULibFunc::EI_LOG:
1892 Res0 = APFloat{log(x: opr0)};
1893 return true;
1894
1895 case AMDGPULibFunc::EI_LOG2:
1896 Res0 = APFloat{log(x: opr0) / log(x: 2.0)};
1897 return true;
1898
1899 case AMDGPULibFunc::EI_LOG10:
1900 Res0 = APFloat{log(x: opr0) / log(x: 10.0)};
1901 return true;
1902
1903 case AMDGPULibFunc::EI_RSQRT:
1904 Res0 = APFloat{1.0 / sqrt(x: opr0)};
1905 return true;
1906
1907 case AMDGPULibFunc::EI_SIN:
1908 Res0 = APFloat{sin(x: opr0)};
1909 return true;
1910
1911 case AMDGPULibFunc::EI_SINH:
1912 Res0 = APFloat{sinh(x: opr0)};
1913 return true;
1914
1915 case AMDGPULibFunc::EI_SINPI:
1916 Res0 = APFloat{sin(MATH_PI * opr0)};
1917 return true;
1918
1919 case AMDGPULibFunc::EI_TAN:
1920 Res0 = APFloat{tan(x: opr0)};
1921 return true;
1922
1923 case AMDGPULibFunc::EI_TANH:
1924 Res0 = APFloat{tanh(x: opr0)};
1925 return true;
1926
1927 case AMDGPULibFunc::EI_TANPI:
1928 Res0 = APFloat{tan(MATH_PI * opr0)};
1929 return true;
1930
1931 // two-arg functions
1932 case AMDGPULibFunc::EI_POW:
1933 case AMDGPULibFunc::EI_POWR:
1934 Res0 = APFloat{pow(x: opr0, y: opr1)};
1935 return true;
1936
1937 case AMDGPULibFunc::EI_POWN: {
1938 if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) {
1939 double val = (double)iopr1->getSExtValue();
1940 Res0 = APFloat{pow(x: opr0, y: val)};
1941 return true;
1942 }
1943 return false;
1944 }
1945
1946 case AMDGPULibFunc::EI_ROOTN: {
1947 if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) {
1948 double val = (double)iopr1->getSExtValue();
1949 Res0 = APFloat{pow(x: opr0, y: 1.0 / val)};
1950 return true;
1951 }
1952 return false;
1953 }
1954
1955 // with ptr arg
1956 case AMDGPULibFunc::EI_SINCOS:
1957 Res0 = APFloat{sin(x: opr0)};
1958 Res1 = APFloat{cos(x: opr0)};
1959 return true;
1960 }
1961
1962 return false;
1963}
1964
1965bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
1966 int numArgs = (int)aCI->arg_size();
1967 if (numArgs > 3)
1968 return false;
1969
1970 Constant *copr0 = nullptr;
1971 Constant *copr1 = nullptr;
1972 if (numArgs > 0) {
1973 if ((copr0 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: 0))) == nullptr)
1974 return false;
1975 }
1976
1977 if (numArgs > 1) {
1978 if ((copr1 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: 1))) == nullptr) {
1979 if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
1980 return false;
1981 }
1982 }
1983
1984 // At this point, all arguments to aCI are constants.
1985
1986 // max vector size is 16, and sincos will generate two results.
1987 SmallVector<APFloat, 16> Val0, Val1;
1988 int FuncVecSize = getVecSize(FInfo);
1989 bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
1990 if (FuncVecSize == 1) {
1991 if (!evaluateScalarMathFunc(FInfo, Res0&: Val0.emplace_back(Args: 0.0),
1992 Res1&: Val1.emplace_back(Args: 0.0), copr0, copr1)) {
1993 return false;
1994 }
1995 } else {
1996 ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(Val: copr0);
1997 ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(Val: copr1);
1998 for (int i = 0; i < FuncVecSize; ++i) {
1999 Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
2000 Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
2001 if (!evaluateScalarMathFunc(FInfo, Res0&: Val0.emplace_back(Args: 0.0),
2002 Res1&: Val1.emplace_back(Args: 0.0), copr0: celt0, copr1: celt1)) {
2003 return false;
2004 }
2005 }
2006 }
2007
2008 Constant *nval0, *nval1;
2009 if (FuncVecSize == 1) {
2010 nval0 = ConstantFP::get(Ty: aCI->getType(), V: Val0[0]);
2011 if (hasTwoResults)
2012 nval1 = ConstantFP::get(Ty: aCI->getType(), V: Val1[0]);
2013 } else {
2014 nval0 = getConstantFloatVector(Values: Val0, Ty: aCI->getType());
2015 if (hasTwoResults)
2016 nval1 = getConstantFloatVector(Values: Val1, Ty: aCI->getType());
2017 }
2018
2019 if (hasTwoResults) {
2020 // sincos
2021 assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
2022 "math function with ptr arg not supported yet");
2023 new StoreInst(nval1, aCI->getArgOperand(i: 1), aCI->getIterator());
2024 }
2025
2026 replaceCall(I: aCI, With: nval0);
2027 return true;
2028}
2029
2030PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
2031 FunctionAnalysisManager &AM) {
2032 AMDGPULibCalls Simplifier(F, AM);
2033 Simplifier.initNativeFuncs();
2034
2035 bool Changed = false;
2036
2037 LLVM_DEBUG(dbgs() << "AMDIC: process function ";
2038 F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
2039
2040 for (auto &BB : F) {
2041 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
2042 // Ignore non-calls.
2043 CallInst *CI = dyn_cast<CallInst>(Val&: I);
2044 ++I;
2045
2046 if (CI) {
2047 if (Simplifier.fold(CI))
2048 Changed = true;
2049 }
2050 }
2051 }
2052 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
2053}
2054
2055PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
2056 FunctionAnalysisManager &AM) {
2057 if (UseNative.empty())
2058 return PreservedAnalyses::all();
2059
2060 AMDGPULibCalls Simplifier(F, AM);
2061 Simplifier.initNativeFuncs();
2062
2063 bool Changed = false;
2064 for (auto &BB : F) {
2065 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) {
2066 // Ignore non-calls.
2067 CallInst *CI = dyn_cast<CallInst>(Val&: I);
2068 ++I;
2069 if (CI && Simplifier.useNative(aCI: CI))
2070 Changed = true;
2071 }
2072 }
2073 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
2074}
2075