1 | //===- AMDGPULibCalls.cpp -------------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This file does AMD library function optimizations. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "AMDGPU.h" |
15 | #include "AMDGPULibFunc.h" |
16 | #include "GCNSubtarget.h" |
17 | #include "llvm/Analysis/AssumptionCache.h" |
18 | #include "llvm/Analysis/TargetLibraryInfo.h" |
19 | #include "llvm/Analysis/ValueTracking.h" |
20 | #include "llvm/IR/AttributeMask.h" |
21 | #include "llvm/IR/Dominators.h" |
22 | #include "llvm/IR/IRBuilder.h" |
23 | #include "llvm/IR/IntrinsicInst.h" |
24 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
25 | #include "llvm/IR/MDBuilder.h" |
26 | #include "llvm/IR/PatternMatch.h" |
27 | #include "llvm/InitializePasses.h" |
28 | #include <cmath> |
29 | |
30 | #define DEBUG_TYPE "amdgpu-simplifylib" |
31 | |
32 | using namespace llvm; |
33 | using namespace llvm::PatternMatch; |
34 | |
// Enables pre-link mode: library functions may be declared on demand because
// the real definitions are linked in later.
static cl::opt<bool> EnablePreLink("amdgpu-prelink",
  cl::desc("Enable pre-link mode optimizations"),
  cl::init(Val: false),
  cl::Hidden);

// Functions to replace with their native_* variants; an empty value or "all"
// enables the replacement for every supported function.
static cl::list<std::string> UseNative("amdgpu-use-native",
  cl::desc("Comma separated list of functions to replace with native, or all"),
  cl::CommaSeparated, cl::ValueOptional,
  cl::Hidden);

// Math constants used by the table-driven constant folds below.
#define MATH_PI numbers::pi
#define MATH_E numbers::e
#define MATH_SQRT2 numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2
49 | |
namespace llvm {

/// Folds and simplifies calls to the AMD device library (pow, sincos, pipes,
/// ...), optionally redirecting calls to native_* variants.
class AMDGPULibCalls {
private:
  // Per-function analyses cached by initFunction(); DT may be null when the
  // dominator tree was not already computed.
  const TargetLibraryInfo *TLInfo = nullptr;
  AssumptionCache *AC = nullptr;
  DominatorTree *DT = nullptr;

  using FuncInfo = llvm::AMDGPULibFunc;

  // Value of the function's "unsafe-fp-math" attribute, read in initFunction().
  bool UnsafeFPMath = false;

  // -fuse-native: true when every supported function should use the native
  // variant (see initNativeFuncs()).
  bool AllNative = false;

  bool useNativeFunc(const StringRef F) const;

  // Return a pointer (pointer expr) to the function if function definition with
  // "FuncName" exists. It may create a new function prototype in pre-link mode.
  FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);

  bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);

  // Table-driven constant folding of calls with known constant arguments.
  bool TDOFold(CallInst *CI, const FuncInfo &FInfo);

  /* Specialized optimizations */

  // pow/powr/pown
  bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // rootn
  bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // -fuse-native for sincos
  bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);

  // evaluate calls if calls' arguments are constants.
  bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1,
                              Constant *copr0, Constant *copr1);
  bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);

  /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value
  /// of cos, sincos call).
  std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
                                                     FastMathFlags FMF,
                                                     IRBuilder<> &B,
                                                     FunctionCallee Fsincos);

  // sin/cos
  bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);

  // __read_pipe/__write_pipe
  bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                            const FuncInfo &FInfo);

  // Get a scalar native builtin single argument FP function
  FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);

  /// Substitute a call to a known libcall with an intrinsic call. If \p
  /// AllowMinSize is true, allow the replacement in a minsize function.
  bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
                                         bool AllowMinSizeF32 = false,
                                         bool AllowF64 = false,
                                         bool AllowStrictFP = false);
  void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                         Intrinsic::ID IntrID);

  bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
                                            Intrinsic::ID IntrID,
                                            bool AllowMinSizeF32 = false,
                                            bool AllowF64 = false,
                                            bool AllowStrictFP = false);

protected:
  bool isUnsafeMath(const FPMathOperator *FPOp) const;
  bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;

  bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;

  // RAUW + erase; \p I must have no further uses after this call.
  static void replaceCall(Instruction *I, Value *With) {
    I->replaceAllUsesWith(V: With);
    I->eraseFromParent();
  }

  static void replaceCall(FPMathOperator *I, Value *With) {
    replaceCall(I: cast<Instruction>(Val: I), With);
  }

public:
  AMDGPULibCalls() = default;

  bool fold(CallInst *CI);

  void initFunction(Function &F, FunctionAnalysisManager &FAM);
  void initNativeFuncs();

  // Replace a normal math function call with that native version
  bool useNative(CallInst *CI);
};

} // end namespace llvm
151 | |
152 | template <typename IRB> |
153 | static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg, |
154 | const Twine &Name = "" ) { |
155 | CallInst *R = B.CreateCall(Callee, Arg, Name); |
156 | if (Function *F = dyn_cast<Function>(Val: Callee.getCallee())) |
157 | R->setCallingConv(F->getCallingConv()); |
158 | return R; |
159 | } |
160 | |
161 | template <typename IRB> |
162 | static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1, |
163 | Value *Arg2, const Twine &Name = "" ) { |
164 | CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name); |
165 | if (Function *F = dyn_cast<Function>(Val: Callee.getCallee())) |
166 | R->setCallingConv(F->getCallingConv()); |
167 | return R; |
168 | } |
169 | |
170 | static FunctionType *getPownType(FunctionType *FT) { |
171 | Type *PowNExpTy = Type::getInt32Ty(C&: FT->getContext()); |
172 | if (VectorType *VecTy = dyn_cast<VectorType>(Val: FT->getReturnType())) |
173 | PowNExpTy = VectorType::get(ElementType: PowNExpTy, EC: VecTy->getElementCount()); |
174 | |
175 | return FunctionType::get(Result: FT->getReturnType(), |
176 | Params: {FT->getParamType(i: 0), PowNExpTy}, isVarArg: false); |
177 | } |
178 | |
179 | // Data structures for table-driven optimizations. |
180 | // FuncTbl works for both f32 and f64 functions with 1 input argument |
181 | |
// One row of a constant-folding table: calling the function with `input`
// yields exactly `result`.
struct TableEntry {
  double result;
  double input;
};

/* a list of {result, input} */
static const TableEntry tbl_acos[] = {
  {MATH_PI / 2.0, .input: 0.0},
  {MATH_PI / 2.0, .input: -0.0},
  {.result: 0.0, .input: 1.0},
  {MATH_PI, .input: -1.0}
};
static const TableEntry tbl_acosh[] = {
  {.result: 0.0, .input: 1.0}
};
static const TableEntry tbl_acospi[] = {
  {.result: 0.5, .input: 0.0},
  {.result: 0.5, .input: -0.0},
  {.result: 0.0, .input: 1.0},
  {.result: 1.0, .input: -1.0}
};
static const TableEntry tbl_asin[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0},
  {MATH_PI / 2.0, .input: 1.0},
  {.result: -MATH_PI / 2.0, .input: -1.0}
};
static const TableEntry tbl_asinh[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_asinpi[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0},
  {.result: 0.5, .input: 1.0},
  {.result: -0.5, .input: -1.0}
};
static const TableEntry tbl_atan[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0},
  {MATH_PI / 4.0, .input: 1.0},
  {.result: -MATH_PI / 4.0, .input: -1.0}
};
static const TableEntry tbl_atanh[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_atanpi[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0},
  {.result: 0.25, .input: 1.0},
  {.result: -0.25, .input: -1.0}
};
static const TableEntry tbl_cbrt[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0},
  {.result: 1.0, .input: 1.0},
  {.result: -1.0, .input: -1.0},
};
static const TableEntry tbl_cos[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0}
};
static const TableEntry tbl_cosh[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0}
};
static const TableEntry tbl_cospi[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0}
};
static const TableEntry tbl_erfc[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0}
};
static const TableEntry tbl_erf[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_exp[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0},
  {MATH_E, .input: 1.0}
};
static const TableEntry tbl_exp2[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0},
  {.result: 2.0, .input: 1.0}
};
static const TableEntry tbl_exp10[] = {
  {.result: 1.0, .input: 0.0},
  {.result: 1.0, .input: -0.0},
  {.result: 10.0, .input: 1.0}
};
static const TableEntry tbl_expm1[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_log[] = {
  {.result: 0.0, .input: 1.0},
  {.result: 1.0, MATH_E}
};
static const TableEntry tbl_log2[] = {
  {.result: 0.0, .input: 1.0},
  {.result: 1.0, .input: 2.0}
};
static const TableEntry tbl_log10[] = {
  {.result: 0.0, .input: 1.0},
  {.result: 1.0, .input: 10.0}
};
static const TableEntry tbl_rsqrt[] = {
  {.result: 1.0, .input: 1.0},
  {MATH_SQRT1_2, .input: 2.0}
};
static const TableEntry tbl_sin[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_sinh[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_sinpi[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_sqrt[] = {
  {.result: 0.0, .input: 0.0},
  {.result: 1.0, .input: 1.0},
  {MATH_SQRT2, .input: 2.0}
};
static const TableEntry tbl_tan[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_tanh[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
static const TableEntry tbl_tanpi[] = {
  {.result: 0.0, .input: 0.0},
  {.result: -0.0, .input: -0.0}
};
// tgamma(n) == (n-1)! for small positive integers.
static const TableEntry tbl_tgamma[] = {
  {.result: 1.0, .input: 1.0},
  {.result: 1.0, .input: 2.0},
  {.result: 2.0, .input: 3.0},
  {.result: 6.0, .input: 4.0}
};
331 | |
332 | static bool HasNative(AMDGPULibFunc::EFuncId id) { |
333 | switch(id) { |
334 | case AMDGPULibFunc::EI_DIVIDE: |
335 | case AMDGPULibFunc::EI_COS: |
336 | case AMDGPULibFunc::EI_EXP: |
337 | case AMDGPULibFunc::EI_EXP2: |
338 | case AMDGPULibFunc::EI_EXP10: |
339 | case AMDGPULibFunc::EI_LOG: |
340 | case AMDGPULibFunc::EI_LOG2: |
341 | case AMDGPULibFunc::EI_LOG10: |
342 | case AMDGPULibFunc::EI_POWR: |
343 | case AMDGPULibFunc::EI_RECIP: |
344 | case AMDGPULibFunc::EI_RSQRT: |
345 | case AMDGPULibFunc::EI_SIN: |
346 | case AMDGPULibFunc::EI_SINCOS: |
347 | case AMDGPULibFunc::EI_SQRT: |
348 | case AMDGPULibFunc::EI_TAN: |
349 | return true; |
350 | default:; |
351 | } |
352 | return false; |
353 | } |
354 | |
355 | using TableRef = ArrayRef<TableEntry>; |
356 | |
357 | static TableRef getOptTable(AMDGPULibFunc::EFuncId id) { |
358 | switch(id) { |
359 | case AMDGPULibFunc::EI_ACOS: return TableRef(tbl_acos); |
360 | case AMDGPULibFunc::EI_ACOSH: return TableRef(tbl_acosh); |
361 | case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi); |
362 | case AMDGPULibFunc::EI_ASIN: return TableRef(tbl_asin); |
363 | case AMDGPULibFunc::EI_ASINH: return TableRef(tbl_asinh); |
364 | case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi); |
365 | case AMDGPULibFunc::EI_ATAN: return TableRef(tbl_atan); |
366 | case AMDGPULibFunc::EI_ATANH: return TableRef(tbl_atanh); |
367 | case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi); |
368 | case AMDGPULibFunc::EI_CBRT: return TableRef(tbl_cbrt); |
369 | case AMDGPULibFunc::EI_NCOS: |
370 | case AMDGPULibFunc::EI_COS: return TableRef(tbl_cos); |
371 | case AMDGPULibFunc::EI_COSH: return TableRef(tbl_cosh); |
372 | case AMDGPULibFunc::EI_COSPI: return TableRef(tbl_cospi); |
373 | case AMDGPULibFunc::EI_ERFC: return TableRef(tbl_erfc); |
374 | case AMDGPULibFunc::EI_ERF: return TableRef(tbl_erf); |
375 | case AMDGPULibFunc::EI_EXP: return TableRef(tbl_exp); |
376 | case AMDGPULibFunc::EI_NEXP2: |
377 | case AMDGPULibFunc::EI_EXP2: return TableRef(tbl_exp2); |
378 | case AMDGPULibFunc::EI_EXP10: return TableRef(tbl_exp10); |
379 | case AMDGPULibFunc::EI_EXPM1: return TableRef(tbl_expm1); |
380 | case AMDGPULibFunc::EI_LOG: return TableRef(tbl_log); |
381 | case AMDGPULibFunc::EI_NLOG2: |
382 | case AMDGPULibFunc::EI_LOG2: return TableRef(tbl_log2); |
383 | case AMDGPULibFunc::EI_LOG10: return TableRef(tbl_log10); |
384 | case AMDGPULibFunc::EI_NRSQRT: |
385 | case AMDGPULibFunc::EI_RSQRT: return TableRef(tbl_rsqrt); |
386 | case AMDGPULibFunc::EI_NSIN: |
387 | case AMDGPULibFunc::EI_SIN: return TableRef(tbl_sin); |
388 | case AMDGPULibFunc::EI_SINH: return TableRef(tbl_sinh); |
389 | case AMDGPULibFunc::EI_SINPI: return TableRef(tbl_sinpi); |
390 | case AMDGPULibFunc::EI_NSQRT: |
391 | case AMDGPULibFunc::EI_SQRT: return TableRef(tbl_sqrt); |
392 | case AMDGPULibFunc::EI_TAN: return TableRef(tbl_tan); |
393 | case AMDGPULibFunc::EI_TANH: return TableRef(tbl_tanh); |
394 | case AMDGPULibFunc::EI_TANPI: return TableRef(tbl_tanpi); |
395 | case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma); |
396 | default:; |
397 | } |
398 | return TableRef(); |
399 | } |
400 | |
401 | static inline int getVecSize(const AMDGPULibFunc& FInfo) { |
402 | return FInfo.getLeads()[0].VectorSize; |
403 | } |
404 | |
405 | static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) { |
406 | return (AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType; |
407 | } |
408 | |
409 | FunctionCallee AMDGPULibCalls::getFunction(Module *M, const FuncInfo &fInfo) { |
410 | // If we are doing PreLinkOpt, the function is external. So it is safe to |
411 | // use getOrInsertFunction() at this stage. |
412 | |
413 | return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo) |
414 | : AMDGPULibFunc::getFunction(M, fInfo); |
415 | } |
416 | |
417 | bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName, |
418 | FuncInfo &FInfo) { |
419 | return AMDGPULibFunc::parse(MangledName: FMangledName, Ptr&: FInfo); |
420 | } |
421 | |
422 | bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const { |
423 | return UnsafeFPMath || FPOp->isFast(); |
424 | } |
425 | |
426 | bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const { |
427 | return UnsafeFPMath || |
428 | (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs()); |
429 | } |
430 | |
// Whether it is acceptable to constant fold this call at a higher precision
// than the runtime implementation would produce (e.g. evaluating an f32
// libcall with host doubles).
bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
    const FPMathOperator *FPOp) const {
  // TODO: Refine to approxFunc or contract
  return isUnsafeMath(FPOp);
}
436 | |
437 | void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) { |
438 | UnsafeFPMath = F.getFnAttribute(Kind: "unsafe-fp-math" ).getValueAsBool(); |
439 | AC = &FAM.getResult<AssumptionAnalysis>(IR&: F); |
440 | TLInfo = &FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
441 | DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F); |
442 | } |
443 | |
444 | bool AMDGPULibCalls::useNativeFunc(const StringRef F) const { |
445 | return AllNative || llvm::is_contained(Range&: UseNative, Element: F); |
446 | } |
447 | |
448 | void AMDGPULibCalls::initNativeFuncs() { |
449 | AllNative = useNativeFunc(F: "all" ) || |
450 | (UseNative.getNumOccurrences() && UseNative.size() == 1 && |
451 | UseNative.begin()->empty()); |
452 | } |
453 | |
// Split a sincos(x, &c) call into native_sin(x) and native_cos(x) when both
// native variants are requested and available. The cos result is stored
// through the original pointer argument and the call is replaced by the sin
// value. Returns true if the call was replaced.
bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
  bool native_sin = useNativeFunc(F: "sin");
  bool native_cos = useNativeFunc(F: "cos");

  // Only split when BOTH halves may go native; otherwise keep sincos.
  if (native_sin && native_cos) {
    Module *M = aCI->getModule();
    Value *opr0 = aCI->getArgOperand(i: 0);

    // Build a native function descriptor matching the original's leading
    // argument type and vector width.
    AMDGPULibFunc nf;
    nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType;
    nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize;

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_SIN);
    FunctionCallee sinExpr = getFunction(M, fInfo: nf);

    nf.setPrefix(AMDGPULibFunc::NATIVE);
    nf.setId(AMDGPULibFunc::EI_COS);
    FunctionCallee cosExpr = getFunction(M, fInfo: nf);
    if (sinExpr && cosExpr) {
      // Emit sin, cos, then the store of cos through the original out-pointer,
      // all before the old call.
      Value *sinval =
          CallInst::Create(Func: sinExpr, Args: opr0, NameStr: "splitsin", InsertBefore: aCI->getIterator());
      Value *cosval =
          CallInst::Create(Func: cosExpr, Args: opr0, NameStr: "splitcos", InsertBefore: aCI->getIterator());
      new StoreInst(cosval, aCI->getArgOperand(i: 1), aCI->getIterator());

      DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                          << " with native version of sin/cos");

      // sincos returns the sin value; uses of the call become sinval.
      replaceCall(I: aCI, With: sinval);
      return true;
    }
  }
  return false;
}
489 | |
// Redirect a library call to its native_* variant if the user requested it
// and a native version exists. f64 calls are never redirected. Returns true
// if the call was changed.
bool AMDGPULibCalls::useNative(CallInst *aCI) {
  Function *Callee = aCI->getCalledFunction();
  // Skip indirect calls and calls explicitly marked nobuiltin.
  if (!Callee || aCI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  // The call must be a mangled, unprefixed (not already native/half) library
  // function with a non-f64 leading argument, a native counterpart, and be
  // selected by -amdgpu-use-native.
  if (!parseFunctionName(FMangledName: Callee->getName(), FInfo) || !FInfo.isMangled() ||
      FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
      getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(id: FInfo.getId()) ||
      !(AllNative || useNativeFunc(F: FInfo.getName()))) {
    return false;
  }

  // sincos has no direct native equivalent; split it into sin + cos.
  if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
    return sincosUseNative(aCI, FInfo);

  FInfo.setPrefix(AMDGPULibFunc::NATIVE);
  FunctionCallee F = getFunction(M: aCI->getModule(), fInfo: FInfo);
  if (!F)
    return false;

  // Same signature, different callee: retarget the call in place.
  aCI->setCalledFunction(F);
  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                      << " with native version");
  return true;
}
516 | |
517 | // Clang emits call of __read_pipe_2 or __read_pipe_4 for OpenCL read_pipe |
518 | // builtin, with appended type size and alignment arguments, where 2 or 4 |
519 | // indicates the original number of arguments. The library has optimized version |
520 | // of __read_pipe_2/__read_pipe_4 when the type size and alignment has the same |
521 | // power of 2 value. This function transforms __read_pipe_2 to __read_pipe_2_N |
522 | // for such cases where N is the size in bytes of the type (N = 1, 2, 4, 8, ..., |
523 | // 128). The same for __read_pipe_4, write_pipe_2, and write_pipe_4. |
bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                                          const FuncInfo &FInfo) {
  auto *Callee = CI->getCalledFunction();
  // Only rewrite calls to the external library function, not to a local
  // definition.
  if (!Callee->isDeclaration())
    return false;

  assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
  auto *M = Callee->getParent();
  std::string Name = std::string(Callee->getName());
  auto NumArg = CI->arg_size();
  // __read_pipe_2/__write_pipe_2 take 4 args, the _4 forms take 6 (the
  // trailing two are the appended size and alignment).
  if (NumArg != 4 && NumArg != 6)
    return false;
  // The size/alignment arguments must be compile-time constants.
  ConstantInt *PacketSize =
      dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - 2));
  ConstantInt *PacketAlign =
      dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: NumArg - 1));
  if (!PacketSize || !PacketAlign)
    return false;

  unsigned Size = PacketSize->getZExtValue();
  Align Alignment = PacketAlign->getAlignValue();
  // The specialized _N library variants only exist when the packet is
  // naturally aligned (size == alignment, both the same power of 2).
  if (Alignment != Size)
    return false;

  // The packet pointer is the last "real" argument, just before size/align.
  unsigned PtrArgLoc = CI->arg_size() - 3;
  Value *PtrArg = CI->getArgOperand(i: PtrArgLoc);
  Type *PtrTy = PtrArg->getType();

  // New signature: original args up to and including the packet pointer;
  // the size/alignment arguments are dropped (encoded in the name instead).
  SmallVector<llvm::Type *, 6> ArgTys;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    ArgTys.push_back(Elt: CI->getArgOperand(i: I)->getType());
  ArgTys.push_back(Elt: PtrTy);

  // e.g. __read_pipe_2 -> __read_pipe_2_4 for a 4-byte packet.
  Name = Name + "_" + std::to_string(val: Size);
  auto *FTy = FunctionType::get(Result: Callee->getReturnType(),
                                Params: ArrayRef<Type *>(ArgTys), isVarArg: false);
  AMDGPULibFunc NewLibFunc(Name, FTy);
  FunctionCallee F = AMDGPULibFunc::getOrInsertFunction(M, fInfo: NewLibFunc);
  if (!F)
    return false;

  SmallVector<Value *, 6> Args;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    Args.push_back(Elt: CI->getArgOperand(i: I));
  Args.push_back(Elt: PtrArg);

  // Emit the specialized call, keep the original attributes, and delete the
  // old call.
  auto *NCI = B.CreateCall(Callee: F, Args);
  NCI->setAttributes(CI->getAttributes());
  CI->replaceAllUsesWith(V: NCI);
  CI->dropAllReferences();
  CI->eraseFromParent();

  return true;
}
578 | |
// Return true if \p V is known to be an exact integer value (as a float).
// Used to justify pow(x, y) -> pown(x, (int)y).
static bool isKnownIntegral(const Value *V, const DataLayout &DL,
                            FastMathFlags FMF) {
  // Poison may be assumed to be anything; undef may not (it could be any
  // non-integral float).
  if (isa<PoisonValue>(Val: V))
    return true;
  if (isa<UndefValue>(Val: V))
    return false;

  if (const ConstantFP *CF = dyn_cast<ConstantFP>(Val: V))
    return CF->getValueAPF().isInteger();

  // Constant vector: every element must be a poison or an integral FP
  // constant.
  auto *VFVTy = dyn_cast<FixedVectorType>(Val: V->getType());
  const Constant *CV = dyn_cast<Constant>(Val: V);
  if (VFVTy && CV) {
    unsigned NumElts = VFVTy->getNumElements();
    for (unsigned i = 0; i != NumElts; ++i) {
      Constant *Elt = CV->getAggregateElement(Elt: i);
      if (!Elt)
        return false;
      if (isa<PoisonValue>(Val: Elt))
        continue;

      const ConstantFP *CFP = dyn_cast<ConstantFP>(Val: Elt);
      if (!CFP || !CFP->getValue().isInteger())
        return false;
    }

    return true;
  }

  const Instruction *I = dyn_cast<Instruction>(Val: V);
  if (!I)
    return false;

  switch (I->getOpcode()) {
  case Instruction::SIToFP:
  case Instruction::UIToFP:
    // An int->fp conversion is integral unless it rounds to infinity (huge
    // integer into a narrow float type).
    // TODO: Could check nofpclass(inf) on incoming argument
    if (FMF.noInfs())
      return true;

    // Need to check int size cannot produce infinity, which computeKnownFPClass
    // knows how to do already.
    return isKnownNeverInfinity(V: I, /*Depth=*/0, SQ: SimplifyQuery(DL));
  case Instruction::Call: {
    const CallInst *CI = cast<CallInst>(Val: I);
    switch (CI->getIntrinsicID()) {
    case Intrinsic::trunc:
    case Intrinsic::floor:
    case Intrinsic::ceil:
    case Intrinsic::rint:
    case Intrinsic::nearbyint:
    case Intrinsic::round:
    case Intrinsic::roundeven:
      // Rounding intrinsics produce integers except on inf/nan inputs, which
      // either the flags or value analysis must rule out.
      return (FMF.noInfs() && FMF.noNaNs()) ||
             isKnownNeverInfOrNaN(V: I, /*Depth=*/0, SQ: SimplifyQuery(DL));
    default:
      break;
    }

    break;
  }
  default:
    break;
  }

  return false;
}
646 | |
// This function returns false if no change; return true otherwise.
bool AMDGPULibCalls::fold(CallInst *CI) {
  Function *Callee = CI->getCalledFunction();
  // Ignore indirect calls.
  if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
    return false;

  FuncInfo FInfo;
  if (!parseFunctionName(FMangledName: Callee->getName(), FInfo))
    return false;

  // Further check the number of arguments to see if they match.
  // TODO: Check calling convention matches too
  if (!FInfo.isCompatibleSignature(FuncTy: CI->getFunctionType()))
    return false;

  LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');

  // Table-driven constant folding first; it needs no FP-math flags.
  if (TDOFold(CI, FInfo))
    return true;

  IRBuilder<> B(CI);
  // Propagate strictfp so any IR we create stays constrained.
  if (CI->isStrictFP())
    B.setIsFPConstrained(true);

  if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: CI)) {
    // Under unsafe-math, evaluate calls if possible.
    // According to Brian Sumner, we can do this for all f32 function calls
    // using host's double function calls.
    if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(aCI: CI, FInfo))
      return true;

    // Copy fast flags from the original call.
    FastMathFlags FMF = FPOp->getFastMathFlags();
    B.setFastMathFlags(FMF);

    // Specialized optimizations for each function call.
    //
    // TODO: Handle native functions
    switch (FInfo.getId()) {
    // exp/exp2/log/log2/log10 only become intrinsics when some fast-math
    // flag is present; minsize replacement additionally requires afn.
    case AMDGPULibFunc::EI_EXP:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::exp,
                                                  AllowMinSizeF32: FMF.approxFunc());
    case AMDGPULibFunc::EI_EXP2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::exp2,
                                                  AllowMinSizeF32: FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log,
                                                  AllowMinSizeF32: FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG2:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log2,
                                                  AllowMinSizeF32: FMF.approxFunc());
    case AMDGPULibFunc::EI_LOG10:
      if (FMF.none())
        return false;
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::log10,
                                                  AllowMinSizeF32: FMF.approxFunc());
    // The following are exact operations, safe for minsize and f64 too.
    case AMDGPULibFunc::EI_FMIN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::minnum,
                                                  AllowMinSizeF32: true, AllowF64: true);
    case AMDGPULibFunc::EI_FMAX:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::maxnum,
                                                  AllowMinSizeF32: true, AllowF64: true);
    case AMDGPULibFunc::EI_FMA:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fma, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_MAD:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fmuladd,
                                                  AllowMinSizeF32: true, AllowF64: true);
    case AMDGPULibFunc::EI_FABS:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::fabs, AllowMinSizeF32: true,
                                                  AllowF64: true, AllowStrictFP: true);
    case AMDGPULibFunc::EI_COPYSIGN:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::copysign,
                                                  AllowMinSizeF32: true, AllowF64: true, AllowStrictFP: true);
    case AMDGPULibFunc::EI_FLOOR:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::floor, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_CEIL:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::ceil, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_TRUNC:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::trunc, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_RINT:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::rint, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_ROUND:
      return tryReplaceLibcallWithSimpleIntrinsic(B, CI, IntrID: Intrinsic::round, AllowMinSizeF32: true,
                                                  AllowF64: true);
    case AMDGPULibFunc::EI_LDEXP: {
      if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32: true, AllowF64: true))
        return false;

      // The library ldexp of a vector may take a scalar exponent; the
      // intrinsic requires a matching vector, so splat it first.
      Value *Arg1 = CI->getArgOperand(i: 1);
      if (VectorType *VecTy = dyn_cast<VectorType>(Val: CI->getType());
          VecTy && !isa<VectorType>(Val: Arg1->getType())) {
        Value *SplatArg1 = B.CreateVectorSplat(EC: VecTy->getElementCount(), V: Arg1);
        CI->setArgOperand(i: 1, v: SplatArg1);
      }

      CI->setCalledFunction(Intrinsic::getDeclaration(
          M: CI->getModule(), id: Intrinsic::ldexp,
          Tys: {CI->getType(), CI->getArgOperand(i: 1)->getType()}));
      return true;
    }
    case AMDGPULibFunc::EI_POW: {
      Module *M = Callee->getParent();
      AMDGPULibFunc PowrInfo(AMDGPULibFunc::EI_POWR, FInfo);
      FunctionCallee PowrFunc = getFunction(M, fInfo: PowrInfo);
      CallInst *Call = cast<CallInst>(Val: FPOp);

      // pow(x, y) -> powr(x, y) for x >= -0.0
      // TODO: Account for flags on current call
      if (PowrFunc &&
          cannotBeOrderedLessThanZero(
              V: FPOp->getOperand(i: 0), /*Depth=*/0,
              SQ: SimplifyQuery(M->getDataLayout(), TLInfo, DT, AC, Call))) {
        Call->setCalledFunction(PowrFunc);
        // Retargeting alone already changed the IR, hence "|| true".
        return fold_pow(FPOp, B, FInfo: PowrInfo) || true;
      }

      // pow(x, y) -> pown(x, y) for known integral y
      if (isKnownIntegral(V: FPOp->getOperand(i: 1), DL: M->getDataLayout(),
                          FMF: FPOp->getFastMathFlags())) {
        FunctionType *PownType = getPownType(FT: CI->getFunctionType());
        AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true);
        FunctionCallee PownFunc = getFunction(M, fInfo: PownInfo);
        if (PownFunc) {
          // TODO: If the incoming integral value is an sitofp/uitofp, it won't
          // fold out without a known range. We can probably take the source
          // value directly.
          Value *CastedArg =
              B.CreateFPToSI(V: FPOp->getOperand(i: 1), DestTy: PownType->getParamType(i: 1));
          // Have to drop any nofpclass attributes on the original call site.
          Call->removeParamAttrs(
              ArgNo: 1, AttrsToRemove: AttributeFuncs::typeIncompatible(Ty: CastedArg->getType()));
          Call->setCalledFunction(PownFunc);
          Call->setArgOperand(i: 1, v: CastedArg);
          return fold_pow(FPOp, B, FInfo: PownInfo) || true;
        }
      }

      return fold_pow(FPOp, B, FInfo);
    }
    case AMDGPULibFunc::EI_POWR:
    case AMDGPULibFunc::EI_POWN:
      return fold_pow(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_ROOTN:
      return fold_rootn(FPOp, B, FInfo);
    case AMDGPULibFunc::EI_SQRT:
      // TODO: Allow with strictfp + constrained intrinsic
      return tryReplaceLibcallWithSimpleIntrinsic(
          B, CI, IntrID: Intrinsic::sqrt, AllowMinSizeF32: true, AllowF64: true, /*AllowStrictFP=*/false);
    case AMDGPULibFunc::EI_COS:
    case AMDGPULibFunc::EI_SIN:
      return fold_sincos(FPOp, B, FInfo);
    default:
      break;
    }
  } else {
    // Specialized optimizations for each function call
    switch (FInfo.getId()) {
    case AMDGPULibFunc::EI_READ_PIPE_2:
    case AMDGPULibFunc::EI_READ_PIPE_4:
    case AMDGPULibFunc::EI_WRITE_PIPE_2:
    case AMDGPULibFunc::EI_WRITE_PIPE_4:
      return fold_read_write_pipe(CI, B, FInfo);
    default:
      break;
    }
  }

  return false;
}
830 | |
// Table-driven constant folding: if the call's constant argument matches an
// entry in the function's optimization table, replace the call with the
// tabulated result. Handles both scalar and vector arguments.
//
// \param CI    The library call being examined.
// \param FInfo Descriptor of the library function (id, arg type, vector size).
// \return true if the call was replaced by a constant.
bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
  // Table-Driven optimization
  const TableRef tr = getOptTable(id: FInfo.getId());
  if (tr.empty())
    return false;

  int const sz = (int)tr.size();
  Value *opr0 = CI->getArgOperand(i: 0);

  if (getVecSize(FInfo) > 1) {
    // Vector case: every lane must be a constant with a table entry,
    // otherwise the fold is abandoned entirely.
    if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(Val: opr0)) {
      SmallVector<double, 0> DVal;
      for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
        ConstantFP *eltval = dyn_cast<ConstantFP>(
            Val: CV->getElementAsConstant(i: (unsigned)eltNo));
        assert(eltval && "Non-FP arguments in math function!" );
        bool found = false;
        for (int i=0; i < sz; ++i) {
          if (eltval->isExactlyValue(V: tr[i].input)) {
            DVal.push_back(Elt: tr[i].result);
            found = true;
            break;
          }
        }
        if (!found) {
          // This vector constants not handled yet.
          return false;
        }
      }
      LLVMContext &context = CI->getParent()->getParent()->getContext();
      Constant *nval;
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        // Results are collected in double; narrow back to float for f32.
        SmallVector<float, 0> FVal;
        for (double D : DVal)
          FVal.push_back(Elt: (float)D);
        ArrayRef<float> tmp(FVal);
        nval = ConstantDataVector::get(Context&: context, Elts: tmp);
      } else { // F64
        ArrayRef<double> tmp(DVal);
        nval = ConstantDataVector::get(Context&: context, Elts: tmp);
      }
      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n" );
      replaceCall(I: CI, With: nval);
      return true;
    }
  } else {
    // Scalar version
    if (ConstantFP *CF = dyn_cast<ConstantFP>(Val: opr0)) {
      for (int i = 0; i < sz; ++i) {
        if (CF->isExactlyValue(V: tr[i].input)) {
          Value *nval = ConstantFP::get(Ty: CF->getType(), V: tr[i].result);
          LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n" );
          replaceCall(I: CI, With: nval);
          return true;
        }
      }
    }
  }

  return false;
}
892 | |
namespace llvm {
// Compute the base-2 logarithm of \p V on the host.
//
// Uses the C99/POSIX ::log2 where the platform advertises it; otherwise
// falls back to the identity log2(x) = ln(x) / ln(2).
static double log2(double V) {
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
  return ::log2(V);
#else
  return log(V) / numbers::ln2;
#endif
}
} // namespace llvm
902 | |
// Fold calls to pow/powr/pown.
//
// Exact special-case exponents (0, 1, 2, -1, +/-0.5) are folded
// unconditionally. Under unsafe finite-only math, small integral exponents
// (|n| <= 12) become repeated multiplication, and the general case is
// expanded to exp2(y * log2(x)) with a sign fixup for pow/pown when the base
// may be negative.
//
// \return true if the call was rewritten or replaced.
bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
                              const FuncInfo &FInfo) {
  assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
          FInfo.getId() == AMDGPULibFunc::EI_POWR ||
          FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
         "fold_pow: encounter a wrong function call" );

  Module *M = B.GetInsertBlock()->getModule();
  Type *eltType = FPOp->getType()->getScalarType();
  Value *opr0 = FPOp->getOperand(i: 0);
  Value *opr1 = FPOp->getOperand(i: 1);

  // Capture the exponent when it is a constant FP (pow/powr) or a constant
  // integer (pown). At most one of CF/CINT is set.
  const APFloat *CF = nullptr;
  const APInt *CINT = nullptr;
  if (!match(V: opr1, P: m_APFloatAllowPoison(Res&: CF)))
    match(V: opr1, P: m_APIntAllowPoison(Res&: CINT));

  // 0x1111111 means that we don't do anything for this call.
  int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);

  if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
    // pow/powr/pown(x, 0) == 1
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n" );
    Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
    }
    replaceCall(I: FPOp, With: cnval);
    return true;
  }
  if ((CF && CF->isExactlyValue(V: 1.0)) || (CINT && ci_opr1 == 1)) {
    // pow/powr/pown(x, 1.0) = x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n" );
    replaceCall(I: FPOp, With: opr0);
    return true;
  }
  if ((CF && CF->isExactlyValue(V: 2.0)) || (CINT && ci_opr1 == 2)) {
    // pow/powr/pown(x, 2.0) = x*x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
                      << *opr0 << "\n" );
    Value *nval = B.CreateFMul(L: opr0, R: opr0, Name: "__pow2" );
    replaceCall(I: FPOp, With: nval);
    return true;
  }
  if ((CF && CF->isExactlyValue(V: -1.0)) || (CINT && ci_opr1 == -1)) {
    // pow/powr/pown(x, -1.0) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n" );
    Constant *cnval = ConstantFP::get(Ty: eltType, V: 1.0);
    if (getVecSize(FInfo) > 1) {
      cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
    }
    Value *nval = B.CreateFDiv(L: cnval, R: opr0, Name: "__powrecip" );
    replaceCall(I: FPOp, With: nval);
    return true;
  }

  if (CF && (CF->isExactlyValue(V: 0.5) || CF->isExactlyValue(V: -0.5))) {
    // pow[r](x, [-]0.5) = sqrt(x)
    bool issqrt = CF->isExactlyValue(V: 0.5);
    if (FunctionCallee FPExpr =
            getFunction(M, fInfo: AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
                                                      : AMDGPULibFunc::EI_RSQRT,
                                            FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
                        << '(' << *opr0 << ")\n" );
      Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: issqrt ? "__pow2sqrt"
                                                                    : "__pow2rsqrt" );
      replaceCall(I: FPOp, With: nval);
      return true;
    }
  }

  // Everything below changes rounding / special-value behavior, so it is
  // only legal under unsafe finite-only math.
  if (!isUnsafeFiniteOnlyMath(FPOp))
    return false;

  // Unsafe Math optimization

  // Remember that ci_opr1 is set if opr1 is integral
  if (CF) {
    // A constant FP exponent that happens to be integral is treated like a
    // pown exponent for the repeated-multiplication expansion below.
    double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
                      ? (double)CF->convertToFloat()
                      : CF->convertToDouble();
    int ival = (int)dval;
    if ((double)ival == dval) {
      ci_opr1 = ival;
    } else
      ci_opr1 = 0x11111111;
  }

  // pow/powr/pown(x, c) = [1/](x*x*..x); where
  //   trunc(c) == c && the number of x == c && |c| <= 12
  unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
  if (abs_opr1 <= 12) {
    Constant *cnval;
    Value *nval;
    if (abs_opr1 == 0) {
      cnval = ConstantFP::get(Ty: eltType, V: 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
      }
      nval = cnval;
    } else {
      // Square-and-multiply: valx2 holds x^(2^k), nval accumulates the
      // product for the set bits of the exponent.
      Value *valx2 = nullptr;
      nval = nullptr;
      while (abs_opr1 > 0) {
        valx2 = valx2 ? B.CreateFMul(L: valx2, R: valx2, Name: "__powx2" ) : opr0;
        if (abs_opr1 & 1) {
          nval = nval ? B.CreateFMul(L: nval, R: valx2, Name: "__powprod" ) : valx2;
        }
        abs_opr1 >>= 1;
      }
    }

    if (ci_opr1 < 0) {
      // Negative exponent: take the reciprocal of the product.
      cnval = ConstantFP::get(Ty: eltType, V: 1.0);
      if (getVecSize(FInfo) > 1) {
        cnval = ConstantDataVector::getSplat(NumElts: getVecSize(FInfo), Elt: cnval);
      }
      nval = B.CreateFDiv(L: cnval, R: nval, Name: "__1powprod" );
    }
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                      << ((ci_opr1 < 0) ? "1/prod(" : "prod(" ) << *opr0
                      << ")\n" );
    replaceCall(I: FPOp, With: nval);
    return true;
  }

  // If we should use the generic intrinsic instead of emitting a libcall
  const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();

  // powr ---> exp2(y * log2(x))
  // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
  FunctionCallee ExpExpr;
  if (ShouldUseIntrinsic)
    ExpExpr = Intrinsic::getDeclaration(M, id: Intrinsic::exp2, Tys: {FPOp->getType()});
  else {
    ExpExpr = getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
    if (!ExpExpr)
      return false;
  }

  // needlog:      log2(x) must be computed at runtime (base not constant).
  // needabs:      base may be negative, so log2 gets fabs(x).
  // needcopysign: result's sign must be patched for odd exponents.
  bool needlog = false;
  bool needabs = false;
  bool needcopysign = false;
  Constant *cnval = nullptr;
  if (getVecSize(FInfo) == 1) {
    CF = nullptr;
    match(V: opr0, P: m_APFloatAllowPoison(Res&: CF));

    if (CF) {
      // Constant base: fold log2(|x|) at compile time.
      double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
                     ? (double)CF->convertToFloat()
                     : CF->convertToDouble();

      V = log2(V: std::abs(x: V));
      cnval = ConstantFP::get(Ty: eltType, V);
      needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) &&
                     CF->isNegative();
    } else {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    }
  } else {
    ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(Val: opr0);

    if (!CDV) {
      needlog = true;
      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
    } else {
      assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
              "Wrong vector size detected" );

      // Constant vector base: fold log2(|x|) lane-by-lane.
      SmallVector<double, 0> DVal;
      for (int i=0; i < getVecSize(FInfo); ++i) {
        double V = CDV->getElementAsAPFloat(i).convertToDouble();
        if (V < 0.0) needcopysign = true;
        V = log2(V: std::abs(x: V));
        DVal.push_back(Elt: V);
      }
      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
        SmallVector<float, 0> FVal;
        for (double D : DVal)
          FVal.push_back(Elt: (float)D);
        ArrayRef<float> tmp(FVal);
        cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
      } else {
        ArrayRef<double> tmp(DVal);
        cnval = ConstantDataVector::get(Context&: M->getContext(), Elts: tmp);
      }
    }
  }

  if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
    // We cannot handle corner cases for a general pow() function, give up
    // unless y is a constant integral value. Then proceed as if it were pown.
    if (!isKnownIntegral(V: opr1, DL: M->getDataLayout(), FMF: FPOp->getFastMathFlags()))
      return false;
  }

  Value *nval;
  if (needabs) {
    nval = B.CreateUnaryIntrinsic(ID: Intrinsic::fabs, V: opr0, FMFSource: nullptr, Name: "__fabs" );
  } else {
    nval = cnval ? cnval : opr0;
  }
  if (needlog) {
    FunctionCallee LogExpr;
    if (ShouldUseIntrinsic) {
      LogExpr =
          Intrinsic::getDeclaration(M, id: Intrinsic::log2, Tys: {FPOp->getType()});
    } else {
      LogExpr = getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
      if (!LogExpr)
        return false;
    }

    nval = CreateCallEx(B,Callee: LogExpr, Arg: nval, Name: "__log2" );
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_POWN) {
    // convert int(32) to fp(f32 or f64)
    opr1 = B.CreateSIToFP(V: opr1, DestTy: nval->getType(), Name: "pownI2F" );
  }
  nval = B.CreateFMul(L: opr1, R: nval, Name: "__ylogx" );
  nval = CreateCallEx(B,Callee: ExpExpr, Arg: nval, Name: "__exp2" );

  if (needcopysign) {
    // For odd integral y and negative x the result must be negative: shift
    // y's low bit into the sign position, AND it with x's sign bit, and OR
    // that into the exp2 result's bit pattern.
    Type* nTyS = B.getIntNTy(N: eltType->getPrimitiveSizeInBits());
    Type *nTy = FPOp->getType()->getWithNewType(EltTy: nTyS);
    unsigned size = nTy->getScalarSizeInBits();
    Value *opr_n = FPOp->getOperand(i: 1);
    if (opr_n->getType()->getScalarType()->isIntegerTy())
      opr_n = B.CreateZExtOrTrunc(V: opr_n, DestTy: nTy, Name: "__ytou" );
    else
      opr_n = B.CreateFPToSI(V: opr1, DestTy: nTy, Name: "__ytou" );

    Value *sign = B.CreateShl(LHS: opr_n, RHS: size-1, Name: "__yeven" );
    sign = B.CreateAnd(LHS: B.CreateBitCast(V: opr0, DestTy: nTy), RHS: sign, Name: "__pow_sign" );
    nval = B.CreateOr(LHS: B.CreateBitCast(V: nval, DestTy: nTy), RHS: sign);
    nval = B.CreateBitCast(V: nval, DestTy: opr0->getType());
  }

  LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
                    << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n" );
  replaceCall(I: FPOp, With: nval);

  return true;
}
1151 | |
// Fold calls to rootn(x, n) with a constant integer n:
//   rootn(x,  1) -> x
//   rootn(x,  2) -> llvm.sqrt(x)
//   rootn(x,  3) -> cbrt(x)
//   rootn(x, -1) -> 1.0 / x
//   rootn(x, -2) -> 1.0 / llvm.sqrt(x)
//
// \return true if the call was rewritten or replaced.
bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
  Value *opr0 = FPOp->getOperand(i: 0);
  Value *opr1 = FPOp->getOperand(i: 1);

  // Only constant integer exponents are handled.
  const APInt *CINT = nullptr;
  if (!match(V: opr1, P: m_APIntAllowPoison(Res&: CINT)))
    return false;

  Function *Parent = B.GetInsertBlock()->getParent();

  int ci_opr1 = (int)CINT->getSExtValue();
  if (ci_opr1 == 1 && !Parent->hasFnAttribute(Kind: Attribute::StrictFP)) {
    // rootn(x, 1) = x
    //
    // TODO: Insert constrained canonicalize for strictfp case.
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << '\n');
    replaceCall(I: FPOp, With: opr0);
    return true;
  }

  Module *M = B.GetInsertBlock()->getModule();

  CallInst *CI = cast<CallInst>(Val: FPOp);
  if (ci_opr1 == 2 &&
      shouldReplaceLibcallWithIntrinsic(CI,
                                        /*AllowMinSizeF32=*/true,
                                        /*AllowF64=*/true)) {
    // rootn(x, 2) = sqrt(x)
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0 << ")\n" );

    CallInst *NewCall = B.CreateUnaryIntrinsic(ID: Intrinsic::sqrt, V: opr0, FMFSource: CI);
    NewCall->takeName(V: CI);

    // OpenCL rootn has a looser ulp of 2 requirement than sqrt, so add some
    // metadata.
    MDBuilder MDHelper(M->getContext());
    MDNode *FPMD = MDHelper.createFPMath(Accuracy: std::max(a: FPOp->getFPAccuracy(), b: 2.0f));
    NewCall->setMetadata(KindID: LLVMContext::MD_fpmath, Node: FPMD);

    replaceCall(I: CI, With: NewCall);
    return true;
  }

  if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
    if (FunctionCallee FPExpr =
            getFunction(M, fInfo: AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0
                        << ")\n" );
      Value *nval = CreateCallEx(B,Callee: FPExpr, Arg: opr0, Name: "__rootn2cbrt" );
      replaceCall(I: FPOp, With: nval);
      return true;
    }
  } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n" );
    Value *nval = B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: 1.0),
                               R: opr0,
                               Name: "__rootn2div" );
    replaceCall(I: FPOp, With: nval);
    return true;
  }

  if (ci_opr1 == -2 &&
      shouldReplaceLibcallWithIntrinsic(CI,
                                        /*AllowMinSizeF32=*/true,
                                        /*AllowF64=*/true)) {
    // rootn(x, -2) = rsqrt(x)

    // The original rootn had looser ulp requirements than the resultant sqrt
    // and fdiv.
    MDBuilder MDHelper(M->getContext());
    MDNode *FPMD = MDHelper.createFPMath(Accuracy: std::max(a: FPOp->getFPAccuracy(), b: 2.0f));

    // TODO: Could handle strictfp but need to fix strict sqrt emission
    FastMathFlags FMF = FPOp->getFastMathFlags();
    FMF.setAllowContract(true);

    CallInst *Sqrt = B.CreateUnaryIntrinsic(ID: Intrinsic::sqrt, V: opr0, FMFSource: CI);
    Instruction *RSqrt = cast<Instruction>(
        Val: B.CreateFDiv(L: ConstantFP::get(Ty: opr0->getType(), V: 1.0), R: Sqrt));
    Sqrt->setFastMathFlags(FMF);
    RSqrt->setFastMathFlags(FMF);
    RSqrt->setMetadata(KindID: LLVMContext::MD_fpmath, Node: FPMD);

    LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
                      << ")\n" );
    replaceCall(I: CI, With: RSqrt);
    return true;
  }

  return false;
}
1244 | |
1245 | // Get a scalar native builtin single argument FP function |
1246 | FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M, |
1247 | const FuncInfo &FInfo) { |
1248 | if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(id: FInfo.getId())) |
1249 | return nullptr; |
1250 | FuncInfo nf = FInfo; |
1251 | nf.setPrefix(AMDGPULibFunc::NATIVE); |
1252 | return getFunction(M, fInfo: nf); |
1253 | } |
1254 | |
1255 | // Some library calls are just wrappers around llvm intrinsics, but compiled |
1256 | // conservatively. Preserve the flags from the original call site by |
1257 | // substituting them with direct calls with all the flags. |
1258 | bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI, |
1259 | bool AllowMinSizeF32, |
1260 | bool AllowF64, |
1261 | bool AllowStrictFP) { |
1262 | Type *FltTy = CI->getType()->getScalarType(); |
1263 | const bool IsF32 = FltTy->isFloatTy(); |
1264 | |
1265 | // f64 intrinsics aren't implemented for most operations. |
1266 | if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy())) |
1267 | return false; |
1268 | |
1269 | // We're implicitly inlining by replacing the libcall with the intrinsic, so |
1270 | // don't do it for noinline call sites. |
1271 | if (CI->isNoInline()) |
1272 | return false; |
1273 | |
1274 | const Function *ParentF = CI->getFunction(); |
1275 | // TODO: Handle strictfp |
1276 | if (!AllowStrictFP && ParentF->hasFnAttribute(Kind: Attribute::StrictFP)) |
1277 | return false; |
1278 | |
1279 | if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize()) |
1280 | return false; |
1281 | return true; |
1282 | } |
1283 | |
1284 | void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, |
1285 | CallInst *CI, |
1286 | Intrinsic::ID IntrID) { |
1287 | if (CI->arg_size() == 2) { |
1288 | Value *Arg0 = CI->getArgOperand(i: 0); |
1289 | Value *Arg1 = CI->getArgOperand(i: 1); |
1290 | VectorType *Arg0VecTy = dyn_cast<VectorType>(Val: Arg0->getType()); |
1291 | VectorType *Arg1VecTy = dyn_cast<VectorType>(Val: Arg1->getType()); |
1292 | if (Arg0VecTy && !Arg1VecTy) { |
1293 | Value *SplatRHS = B.CreateVectorSplat(EC: Arg0VecTy->getElementCount(), V: Arg1); |
1294 | CI->setArgOperand(i: 1, v: SplatRHS); |
1295 | } else if (!Arg0VecTy && Arg1VecTy) { |
1296 | Value *SplatLHS = B.CreateVectorSplat(EC: Arg1VecTy->getElementCount(), V: Arg0); |
1297 | CI->setArgOperand(i: 0, v: SplatLHS); |
1298 | } |
1299 | } |
1300 | |
1301 | CI->setCalledFunction( |
1302 | Intrinsic::getDeclaration(M: CI->getModule(), id: IntrID, Tys: {CI->getType()})); |
1303 | } |
1304 | |
1305 | bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic( |
1306 | IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32, |
1307 | bool AllowF64, bool AllowStrictFP) { |
1308 | if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64, |
1309 | AllowStrictFP)) |
1310 | return false; |
1311 | replaceLibCallWithSimpleIntrinsic(B, CI, IntrID); |
1312 | return true; |
1313 | } |
1314 | |
// Emit a call to the sincos library function for \p Arg and return
// {Sin, Cos, SinCos}: the sincos call's return value (the sine), the cosine
// loaded from the output pointer, and the sincos call itself.
//
// The output alloca is placed in the entry block; the call is inserted right
// after \p Arg when it is an instruction so it dominates all existing uses.
std::tuple<Value *, Value *, Value *>
AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
                             FunctionCallee Fsincos) {
  DebugLoc DL = B.getCurrentDebugLocation();
  Function *F = B.GetInsertBlock()->getParent();
  B.SetInsertPointPastAllocas(F);

  // Private-memory slot that receives the cosine result.
  AllocaInst *Alloc = B.CreateAlloca(Ty: Arg->getType(), ArraySize: nullptr, Name: "__sincos_" );

  if (Instruction *ArgInst = dyn_cast<Instruction>(Val: Arg)) {
    // If the argument is an instruction, it must dominate all uses so put our
    // sincos call there. Otherwise, right after the allocas works well enough
    // if it's an argument or constant.

    B.SetInsertPoint(TheBB: ArgInst->getParent(), IP: ++ArgInst->getIterator());

    // SetInsertPoint unwelcomely always tries to set the debug loc.
    B.SetCurrentDebugLocation(DL);
  }

  Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(i: 1);

  // The allocaInst allocates the memory in private address space. This need
  // to be addrspacecasted to point to the address space of cos pointer type.
  // In OpenCL 2.0 this is generic, while in 1.2 that is private.
  Value *CastAlloc = B.CreateAddrSpaceCast(V: Alloc, DestTy: CosPtrTy);

  CallInst *SinCos = CreateCallEx2(B, Callee: Fsincos, Arg1: Arg, Arg2: CastAlloc);

  // TODO: Is it worth trying to preserve the location for the cos calls for the
  // load?

  LoadInst *LoadCos = B.CreateLoad(Ty: Alloc->getAllocatedType(), Ptr: Alloc);
  return {SinCos, LoadCos, SinCos};
}
1350 | |
// fold sin, cos -> sincos.
//
// Scans all users of the current call's argument for sibling sin/cos/sincos
// calls on the same value; when both a sin and a cos are present, emits a
// single sincos call and redirects every matched call to its result. The
// merged call carries the intersection of the fast-math flags, the most
// generic !fpmath, and merged debug locations of all folded calls.
//
// \return true if the calls were merged into a sincos.
bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
                                 const FuncInfo &fInfo) {
  assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
         fInfo.getId() == AMDGPULibFunc::EI_COS);

  // Only plain (unprefixed) f32/f64 sin/cos are merged.
  if ((getArgType(FInfo: fInfo) != AMDGPULibFunc::F32 &&
       getArgType(FInfo: fInfo) != AMDGPULibFunc::F64) ||
      fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
    return false;

  bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;

  Value *CArgVal = FPOp->getOperand(i: 0);
  CallInst *CI = cast<CallInst>(Val: FPOp);

  Function *F = B.GetInsertBlock()->getParent();
  Module *M = F->getParent();

  // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
  // implementation. Prefer the private form if available.
  AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncPrivate.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::PRIVATE_ADDRESS);

  AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
  SinCosLibFuncGeneric.getLeads()[0].PtrKind =
      AMDGPULibFunc::getEPtrKindFromAddrSpace(AS: AMDGPUAS::FLAT_ADDRESS);

  FunctionCallee FSinCosPrivate = getFunction(M, fInfo: SinCosLibFuncPrivate);
  FunctionCallee FSinCosGeneric = getFunction(M, fInfo: SinCosLibFuncGeneric);
  FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
  if (!FSinCos)
    return false;

  SmallVector<CallInst *> SinCalls;
  SmallVector<CallInst *> CosCalls;
  SmallVector<CallInst *> SinCosCalls;
  // The "partner" is the opposite function (cos for sin, sin for cos).
  FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
                       fInfo);
  const std::string PairName = PartnerInfo.mangle();

  StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
  StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
  const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
  const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();

  // Intersect the two sets of flags.
  FastMathFlags FMF = FPOp->getFastMathFlags();
  MDNode *FPMath = CI->getMetadata(KindID: LLVMContext::MD_fpmath);

  SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};

  // Collect every sin/cos/sincos call (in this function, not noinline) that
  // uses the same argument; each contributes its flags/metadata/debug loc.
  for (User* U : CArgVal->users()) {
    CallInst *XI = dyn_cast<CallInst>(Val: U);
    if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
      continue;

    Function *UCallee = XI->getCalledFunction();
    if (!UCallee)
      continue;

    bool Handled = true;

    if (UCallee->getName() == SinName)
      SinCalls.push_back(Elt: XI);
    else if (UCallee->getName() == CosName)
      CosCalls.push_back(Elt: XI);
    else if (UCallee->getName() == SinCosPrivateName ||
             UCallee->getName() == SinCosGenericName)
      SinCosCalls.push_back(Elt: XI);
    else
      Handled = false;

    if (Handled) {
      MergeDbgLocs.push_back(Elt: XI->getDebugLoc());
      auto *OtherOp = cast<FPMathOperator>(Val: XI);
      FMF &= OtherOp->getFastMathFlags();
      FPMath = MDNode::getMostGenericFPMath(
          A: FPMath, B: XI->getMetadata(KindID: LLVMContext::MD_fpmath));
    }
  }

  // Only profitable when both halves of the pair actually exist.
  if (SinCalls.empty() || CosCalls.empty())
    return false;

  B.setFastMathFlags(FMF);
  B.setDefaultFPMathTag(FPMath);
  DILocation *DbgLoc = DILocation::getMergedLocations(Locs: MergeDbgLocs);
  B.SetCurrentDebugLocation(DbgLoc);

  auto [Sin, Cos, SinCos] = insertSinCos(Arg: CArgVal, FMF, B, Fsincos: FSinCos);

  auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
    for (CallInst *C : Calls)
      C->replaceAllUsesWith(V: Res);

    // Leave the other dead instructions to avoid clobbering iterators.
  };

  replaceTrigInsts(SinCalls, Sin);
  replaceTrigInsts(CosCalls, Cos);
  replaceTrigInsts(SinCosCalls, SinCos);

  // It's safe to delete the original now.
  CI->eraseFromParent();
  return true;
}
1459 | |
1460 | bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, |
1461 | double &Res1, Constant *copr0, |
1462 | Constant *copr1) { |
1463 | // By default, opr0/opr1/opr3 holds values of float/double type. |
1464 | // If they are not float/double, each function has to its |
1465 | // operand separately. |
1466 | double opr0 = 0.0, opr1 = 0.0; |
1467 | ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(Val: copr0); |
1468 | ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(Val: copr1); |
1469 | if (fpopr0) { |
1470 | opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64) |
1471 | ? fpopr0->getValueAPF().convertToDouble() |
1472 | : (double)fpopr0->getValueAPF().convertToFloat(); |
1473 | } |
1474 | |
1475 | if (fpopr1) { |
1476 | opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64) |
1477 | ? fpopr1->getValueAPF().convertToDouble() |
1478 | : (double)fpopr1->getValueAPF().convertToFloat(); |
1479 | } |
1480 | |
1481 | switch (FInfo.getId()) { |
1482 | default : return false; |
1483 | |
1484 | case AMDGPULibFunc::EI_ACOS: |
1485 | Res0 = acos(x: opr0); |
1486 | return true; |
1487 | |
1488 | case AMDGPULibFunc::EI_ACOSH: |
1489 | // acosh(x) == log(x + sqrt(x*x - 1)) |
1490 | Res0 = log(x: opr0 + sqrt(x: opr0*opr0 - 1.0)); |
1491 | return true; |
1492 | |
1493 | case AMDGPULibFunc::EI_ACOSPI: |
1494 | Res0 = acos(x: opr0) / MATH_PI; |
1495 | return true; |
1496 | |
1497 | case AMDGPULibFunc::EI_ASIN: |
1498 | Res0 = asin(x: opr0); |
1499 | return true; |
1500 | |
1501 | case AMDGPULibFunc::EI_ASINH: |
1502 | // asinh(x) == log(x + sqrt(x*x + 1)) |
1503 | Res0 = log(x: opr0 + sqrt(x: opr0*opr0 + 1.0)); |
1504 | return true; |
1505 | |
1506 | case AMDGPULibFunc::EI_ASINPI: |
1507 | Res0 = asin(x: opr0) / MATH_PI; |
1508 | return true; |
1509 | |
1510 | case AMDGPULibFunc::EI_ATAN: |
1511 | Res0 = atan(x: opr0); |
1512 | return true; |
1513 | |
1514 | case AMDGPULibFunc::EI_ATANH: |
1515 | // atanh(x) == (log(x+1) - log(x-1))/2; |
1516 | Res0 = (log(x: opr0 + 1.0) - log(x: opr0 - 1.0))/2.0; |
1517 | return true; |
1518 | |
1519 | case AMDGPULibFunc::EI_ATANPI: |
1520 | Res0 = atan(x: opr0) / MATH_PI; |
1521 | return true; |
1522 | |
1523 | case AMDGPULibFunc::EI_CBRT: |
1524 | Res0 = (opr0 < 0.0) ? -pow(x: -opr0, y: 1.0/3.0) : pow(x: opr0, y: 1.0/3.0); |
1525 | return true; |
1526 | |
1527 | case AMDGPULibFunc::EI_COS: |
1528 | Res0 = cos(x: opr0); |
1529 | return true; |
1530 | |
1531 | case AMDGPULibFunc::EI_COSH: |
1532 | Res0 = cosh(x: opr0); |
1533 | return true; |
1534 | |
1535 | case AMDGPULibFunc::EI_COSPI: |
1536 | Res0 = cos(MATH_PI * opr0); |
1537 | return true; |
1538 | |
1539 | case AMDGPULibFunc::EI_EXP: |
1540 | Res0 = exp(x: opr0); |
1541 | return true; |
1542 | |
1543 | case AMDGPULibFunc::EI_EXP2: |
1544 | Res0 = pow(x: 2.0, y: opr0); |
1545 | return true; |
1546 | |
1547 | case AMDGPULibFunc::EI_EXP10: |
1548 | Res0 = pow(x: 10.0, y: opr0); |
1549 | return true; |
1550 | |
1551 | case AMDGPULibFunc::EI_LOG: |
1552 | Res0 = log(x: opr0); |
1553 | return true; |
1554 | |
1555 | case AMDGPULibFunc::EI_LOG2: |
1556 | Res0 = log(x: opr0) / log(x: 2.0); |
1557 | return true; |
1558 | |
1559 | case AMDGPULibFunc::EI_LOG10: |
1560 | Res0 = log(x: opr0) / log(x: 10.0); |
1561 | return true; |
1562 | |
1563 | case AMDGPULibFunc::EI_RSQRT: |
1564 | Res0 = 1.0 / sqrt(x: opr0); |
1565 | return true; |
1566 | |
1567 | case AMDGPULibFunc::EI_SIN: |
1568 | Res0 = sin(x: opr0); |
1569 | return true; |
1570 | |
1571 | case AMDGPULibFunc::EI_SINH: |
1572 | Res0 = sinh(x: opr0); |
1573 | return true; |
1574 | |
1575 | case AMDGPULibFunc::EI_SINPI: |
1576 | Res0 = sin(MATH_PI * opr0); |
1577 | return true; |
1578 | |
1579 | case AMDGPULibFunc::EI_TAN: |
1580 | Res0 = tan(x: opr0); |
1581 | return true; |
1582 | |
1583 | case AMDGPULibFunc::EI_TANH: |
1584 | Res0 = tanh(x: opr0); |
1585 | return true; |
1586 | |
1587 | case AMDGPULibFunc::EI_TANPI: |
1588 | Res0 = tan(MATH_PI * opr0); |
1589 | return true; |
1590 | |
1591 | // two-arg functions |
1592 | case AMDGPULibFunc::EI_POW: |
1593 | case AMDGPULibFunc::EI_POWR: |
1594 | Res0 = pow(x: opr0, y: opr1); |
1595 | return true; |
1596 | |
1597 | case AMDGPULibFunc::EI_POWN: { |
1598 | if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) { |
1599 | double val = (double)iopr1->getSExtValue(); |
1600 | Res0 = pow(x: opr0, y: val); |
1601 | return true; |
1602 | } |
1603 | return false; |
1604 | } |
1605 | |
1606 | case AMDGPULibFunc::EI_ROOTN: { |
1607 | if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(Val: copr1)) { |
1608 | double val = (double)iopr1->getSExtValue(); |
1609 | Res0 = pow(x: opr0, y: 1.0 / val); |
1610 | return true; |
1611 | } |
1612 | return false; |
1613 | } |
1614 | |
1615 | // with ptr arg |
1616 | case AMDGPULibFunc::EI_SINCOS: |
1617 | Res0 = sin(x: opr0); |
1618 | Res1 = cos(x: opr0); |
1619 | return true; |
1620 | } |
1621 | |
1622 | return false; |
1623 | } |
1624 | |
1625 | bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) { |
1626 | int numArgs = (int)aCI->arg_size(); |
1627 | if (numArgs > 3) |
1628 | return false; |
1629 | |
1630 | Constant *copr0 = nullptr; |
1631 | Constant *copr1 = nullptr; |
1632 | if (numArgs > 0) { |
1633 | if ((copr0 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: 0))) == nullptr) |
1634 | return false; |
1635 | } |
1636 | |
1637 | if (numArgs > 1) { |
1638 | if ((copr1 = dyn_cast<Constant>(Val: aCI->getArgOperand(i: 1))) == nullptr) { |
1639 | if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS) |
1640 | return false; |
1641 | } |
1642 | } |
1643 | |
1644 | // At this point, all arguments to aCI are constants. |
1645 | |
1646 | // max vector size is 16, and sincos will generate two results. |
1647 | double DVal0[16], DVal1[16]; |
1648 | int FuncVecSize = getVecSize(FInfo); |
1649 | bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS); |
1650 | if (FuncVecSize == 1) { |
1651 | if (!evaluateScalarMathFunc(FInfo, Res0&: DVal0[0], Res1&: DVal1[0], copr0, copr1)) { |
1652 | return false; |
1653 | } |
1654 | } else { |
1655 | ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(Val: copr0); |
1656 | ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(Val: copr1); |
1657 | for (int i = 0; i < FuncVecSize; ++i) { |
1658 | Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr; |
1659 | Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr; |
1660 | if (!evaluateScalarMathFunc(FInfo, Res0&: DVal0[i], Res1&: DVal1[i], copr0: celt0, copr1: celt1)) { |
1661 | return false; |
1662 | } |
1663 | } |
1664 | } |
1665 | |
1666 | LLVMContext &context = aCI->getContext(); |
1667 | Constant *nval0, *nval1; |
1668 | if (FuncVecSize == 1) { |
1669 | nval0 = ConstantFP::get(Ty: aCI->getType(), V: DVal0[0]); |
1670 | if (hasTwoResults) |
1671 | nval1 = ConstantFP::get(Ty: aCI->getType(), V: DVal1[0]); |
1672 | } else { |
1673 | if (getArgType(FInfo) == AMDGPULibFunc::F32) { |
1674 | SmallVector <float, 0> FVal0, FVal1; |
1675 | for (int i = 0; i < FuncVecSize; ++i) |
1676 | FVal0.push_back(Elt: (float)DVal0[i]); |
1677 | ArrayRef<float> tmp0(FVal0); |
1678 | nval0 = ConstantDataVector::get(Context&: context, Elts: tmp0); |
1679 | if (hasTwoResults) { |
1680 | for (int i = 0; i < FuncVecSize; ++i) |
1681 | FVal1.push_back(Elt: (float)DVal1[i]); |
1682 | ArrayRef<float> tmp1(FVal1); |
1683 | nval1 = ConstantDataVector::get(Context&: context, Elts: tmp1); |
1684 | } |
1685 | } else { |
1686 | ArrayRef<double> tmp0(DVal0); |
1687 | nval0 = ConstantDataVector::get(Context&: context, Elts: tmp0); |
1688 | if (hasTwoResults) { |
1689 | ArrayRef<double> tmp1(DVal1); |
1690 | nval1 = ConstantDataVector::get(Context&: context, Elts: tmp1); |
1691 | } |
1692 | } |
1693 | } |
1694 | |
1695 | if (hasTwoResults) { |
1696 | // sincos |
1697 | assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS && |
1698 | "math function with ptr arg not supported yet" ); |
1699 | new StoreInst(nval1, aCI->getArgOperand(i: 1), aCI->getIterator()); |
1700 | } |
1701 | |
1702 | replaceCall(I: aCI, With: nval0); |
1703 | return true; |
1704 | } |
1705 | |
1706 | PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F, |
1707 | FunctionAnalysisManager &AM) { |
1708 | AMDGPULibCalls Simplifier; |
1709 | Simplifier.initNativeFuncs(); |
1710 | Simplifier.initFunction(F, FAM&: AM); |
1711 | |
1712 | bool Changed = false; |
1713 | |
1714 | LLVM_DEBUG(dbgs() << "AMDIC: process function " ; |
1715 | F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';); |
1716 | |
1717 | for (auto &BB : F) { |
1718 | for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) { |
1719 | // Ignore non-calls. |
1720 | CallInst *CI = dyn_cast<CallInst>(Val&: I); |
1721 | ++I; |
1722 | |
1723 | if (CI) { |
1724 | if (Simplifier.fold(CI)) |
1725 | Changed = true; |
1726 | } |
1727 | } |
1728 | } |
1729 | return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); |
1730 | } |
1731 | |
1732 | PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F, |
1733 | FunctionAnalysisManager &AM) { |
1734 | if (UseNative.empty()) |
1735 | return PreservedAnalyses::all(); |
1736 | |
1737 | AMDGPULibCalls Simplifier; |
1738 | Simplifier.initNativeFuncs(); |
1739 | Simplifier.initFunction(F, FAM&: AM); |
1740 | |
1741 | bool Changed = false; |
1742 | for (auto &BB : F) { |
1743 | for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;) { |
1744 | // Ignore non-calls. |
1745 | CallInst *CI = dyn_cast<CallInst>(Val&: I); |
1746 | ++I; |
1747 | if (CI && Simplifier.useNative(aCI: CI)) |
1748 | Changed = true; |
1749 | } |
1750 | } |
1751 | return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); |
1752 | } |
1753 | |