1 | //===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "NVPTXTargetTransformInfo.h" |
10 | #include "NVPTXUtilities.h" |
11 | #include "llvm/Analysis/LoopInfo.h" |
12 | #include "llvm/Analysis/TargetTransformInfo.h" |
13 | #include "llvm/Analysis/ValueTracking.h" |
14 | #include "llvm/CodeGen/BasicTTIImpl.h" |
15 | #include "llvm/CodeGen/CostTable.h" |
16 | #include "llvm/CodeGen/TargetLowering.h" |
17 | #include "llvm/IR/IntrinsicsNVPTX.h" |
18 | #include "llvm/Support/Debug.h" |
19 | #include <optional> |
20 | using namespace llvm; |
21 | |
22 | #define DEBUG_TYPE "NVPTXtti" |
23 | |
24 | // Whether the given intrinsic reads threadIdx.x/y/z. |
25 | static bool readsThreadIndex(const IntrinsicInst *II) { |
26 | switch (II->getIntrinsicID()) { |
27 | default: return false; |
28 | case Intrinsic::nvvm_read_ptx_sreg_tid_x: |
29 | case Intrinsic::nvvm_read_ptx_sreg_tid_y: |
30 | case Intrinsic::nvvm_read_ptx_sreg_tid_z: |
31 | return true; |
32 | } |
33 | } |
34 | |
35 | static bool readsLaneId(const IntrinsicInst *II) { |
36 | return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid; |
37 | } |
38 | |
39 | // Whether the given intrinsic is an atomic instruction in PTX. |
40 | static bool isNVVMAtomic(const IntrinsicInst *II) { |
41 | switch (II->getIntrinsicID()) { |
42 | default: return false; |
43 | case Intrinsic::nvvm_atomic_load_inc_32: |
44 | case Intrinsic::nvvm_atomic_load_dec_32: |
45 | |
46 | case Intrinsic::nvvm_atomic_add_gen_f_cta: |
47 | case Intrinsic::nvvm_atomic_add_gen_f_sys: |
48 | case Intrinsic::nvvm_atomic_add_gen_i_cta: |
49 | case Intrinsic::nvvm_atomic_add_gen_i_sys: |
50 | case Intrinsic::nvvm_atomic_and_gen_i_cta: |
51 | case Intrinsic::nvvm_atomic_and_gen_i_sys: |
52 | case Intrinsic::nvvm_atomic_cas_gen_i_cta: |
53 | case Intrinsic::nvvm_atomic_cas_gen_i_sys: |
54 | case Intrinsic::nvvm_atomic_dec_gen_i_cta: |
55 | case Intrinsic::nvvm_atomic_dec_gen_i_sys: |
56 | case Intrinsic::nvvm_atomic_inc_gen_i_cta: |
57 | case Intrinsic::nvvm_atomic_inc_gen_i_sys: |
58 | case Intrinsic::nvvm_atomic_max_gen_i_cta: |
59 | case Intrinsic::nvvm_atomic_max_gen_i_sys: |
60 | case Intrinsic::nvvm_atomic_min_gen_i_cta: |
61 | case Intrinsic::nvvm_atomic_min_gen_i_sys: |
62 | case Intrinsic::nvvm_atomic_or_gen_i_cta: |
63 | case Intrinsic::nvvm_atomic_or_gen_i_sys: |
64 | case Intrinsic::nvvm_atomic_exch_gen_i_cta: |
65 | case Intrinsic::nvvm_atomic_exch_gen_i_sys: |
66 | case Intrinsic::nvvm_atomic_xor_gen_i_cta: |
67 | case Intrinsic::nvvm_atomic_xor_gen_i_sys: |
68 | return true; |
69 | } |
70 | } |
71 | |
72 | bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { |
73 | // Without inter-procedural analysis, we conservatively assume that arguments |
74 | // to __device__ functions are divergent. |
  if (const Argument *Arg = dyn_cast<Argument>(V))
76 | return !isKernelFunction(*Arg->getParent()); |
77 | |
  if (const Instruction *I = dyn_cast<Instruction>(V)) {
79 | // Without pointer analysis, we conservatively assume values loaded from |
80 | // generic or local address space are divergent. |
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
82 | unsigned AS = LI->getPointerAddressSpace(); |
83 | return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL; |
84 | } |
85 | // Atomic instructions may cause divergence. Atomic instructions are |
86 | // executed sequentially across all threads in a warp. Therefore, an earlier |
87 | // executed thread may see different memory inputs than a later executed |
88 | // thread. For example, suppose *a = 0 initially. |
89 | // |
90 | // atom.global.add.s32 d, [a], 1 |
91 | // |
92 | // returns 0 for the first thread that enters the critical region, and 1 for |
93 | // the second thread. |
94 | if (I->isAtomic()) |
95 | return true; |
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
97 | // Instructions that read threadIdx are obviously divergent. |
98 | if (readsThreadIndex(II) || readsLaneId(II)) |
99 | return true; |
100 | // Handle the NVPTX atomic intrinsics that cannot be represented as an |
101 | // atomic IR instruction. |
102 | if (isNVVMAtomic(II)) |
103 | return true; |
104 | } |
105 | // Conservatively consider the return value of function calls as divergent. |
106 | // We could analyze callees with bodies more precisely using |
107 | // inter-procedural analysis. |
    if (isa<CallInst>(I))
109 | return true; |
110 | } |
111 | |
112 | return false; |
113 | } |
114 | |
115 | // Convert NVVM intrinsics to target-generic LLVM code where possible. |
116 | static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { |
117 | // Each NVVM intrinsic we can simplify can be replaced with one of: |
118 | // |
119 | // * an LLVM intrinsic, |
120 | // * an LLVM cast operation, |
121 | // * an LLVM binary operation, or |
122 | // * ad-hoc LLVM IR for the particular operation. |
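  //
  // As a sketch of what this looks like in IR (the actual rewrite is also
  // gated on the ftz checks below), a call such as
  //
  //   %r = call double @llvm.nvvm.ceil.d(double %x)
  //
  // is replaced with
  //
  //   %r = call double @llvm.ceil.f64(double %x)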
123 | |
124 | // Some transformations are only valid when the module's |
125 | // flush-denormals-to-zero (ftz) setting is true/false, whereas other |
126 | // transformations are valid regardless of the module's ftz setting. |
127 | enum FtzRequirementTy { |
128 | FTZ_Any, // Any ftz setting is ok. |
129 | FTZ_MustBeOn, // Transformation is valid only if ftz is on. |
130 | FTZ_MustBeOff, // Transformation is valid only if ftz is off. |
131 | }; |
132 | // Classes of NVVM intrinsics that can't be replaced one-to-one with a |
133 | // target-generic intrinsic, cast op, or binary op but that we can nonetheless |
134 | // simplify. |
135 | enum SpecialCase { |
136 | SPC_Reciprocal, |
137 | }; |
138 | |
139 | // SimplifyAction is a poor-man's variant (plus an additional flag) that |
140 | // represents how to replace an NVVM intrinsic with target-generic LLVM IR. |
141 | struct SimplifyAction { |
142 | // Invariant: At most one of these Optionals has a value. |
143 | std::optional<Intrinsic::ID> IID; |
144 | std::optional<Instruction::CastOps> CastOp; |
145 | std::optional<Instruction::BinaryOps> BinaryOp; |
146 | std::optional<SpecialCase> Special; |
147 | |
148 | FtzRequirementTy FtzRequirement = FTZ_Any; |
149 | // Denormal handling is guarded by different attributes depending on the |
150 | // type (denormal-fp-math vs denormal-fp-math-f32), take note of halfs. |
151 | bool IsHalfTy = false; |
152 | |
153 | SimplifyAction() = default; |
154 | |
155 | SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq, |
156 | bool IsHalfTy = false) |
157 | : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {} |
158 | |
159 | // Cast operations don't have anything to do with FTZ, so we skip that |
160 | // argument. |
161 | SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {} |
162 | |
163 | SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq) |
164 | : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {} |
165 | |
166 | SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq) |
167 | : Special(Special), FtzRequirement(FtzReq) {} |
168 | }; |
169 | |
170 | // Try to generate a SimplifyAction describing how to replace our |
171 | // IntrinsicInstr with target-generic LLVM IR. |
172 | const SimplifyAction Action = [II]() -> SimplifyAction { |
173 | switch (II->getIntrinsicID()) { |
174 | // NVVM intrinsics that map directly to LLVM intrinsics. |
175 | case Intrinsic::nvvm_ceil_d: |
176 | return {Intrinsic::ceil, FTZ_Any}; |
177 | case Intrinsic::nvvm_ceil_f: |
178 | return {Intrinsic::ceil, FTZ_MustBeOff}; |
179 | case Intrinsic::nvvm_ceil_ftz_f: |
180 | return {Intrinsic::ceil, FTZ_MustBeOn}; |
181 | case Intrinsic::nvvm_fabs_d: |
182 | return {Intrinsic::fabs, FTZ_Any}; |
183 | case Intrinsic::nvvm_floor_d: |
184 | return {Intrinsic::floor, FTZ_Any}; |
185 | case Intrinsic::nvvm_floor_f: |
186 | return {Intrinsic::floor, FTZ_MustBeOff}; |
187 | case Intrinsic::nvvm_floor_ftz_f: |
188 | return {Intrinsic::floor, FTZ_MustBeOn}; |
189 | case Intrinsic::nvvm_fma_rn_d: |
190 | return {Intrinsic::fma, FTZ_Any}; |
191 | case Intrinsic::nvvm_fma_rn_f: |
192 | return {Intrinsic::fma, FTZ_MustBeOff}; |
193 | case Intrinsic::nvvm_fma_rn_ftz_f: |
194 | return {Intrinsic::fma, FTZ_MustBeOn}; |
195 | case Intrinsic::nvvm_fma_rn_f16: |
196 | return {Intrinsic::fma, FTZ_MustBeOff, true}; |
197 | case Intrinsic::nvvm_fma_rn_ftz_f16: |
198 | return {Intrinsic::fma, FTZ_MustBeOn, true}; |
199 | case Intrinsic::nvvm_fma_rn_f16x2: |
200 | return {Intrinsic::fma, FTZ_MustBeOff, true}; |
201 | case Intrinsic::nvvm_fma_rn_ftz_f16x2: |
202 | return {Intrinsic::fma, FTZ_MustBeOn, true}; |
203 | case Intrinsic::nvvm_fma_rn_bf16: |
204 | return {Intrinsic::fma, FTZ_MustBeOff, true}; |
205 | case Intrinsic::nvvm_fma_rn_ftz_bf16: |
206 | return {Intrinsic::fma, FTZ_MustBeOn, true}; |
207 | case Intrinsic::nvvm_fma_rn_bf16x2: |
208 | return {Intrinsic::fma, FTZ_MustBeOff, true}; |
209 | case Intrinsic::nvvm_fma_rn_ftz_bf16x2: |
210 | return {Intrinsic::fma, FTZ_MustBeOn, true}; |
211 | case Intrinsic::nvvm_fmax_d: |
212 | return {Intrinsic::maxnum, FTZ_Any}; |
213 | case Intrinsic::nvvm_fmax_f: |
214 | return {Intrinsic::maxnum, FTZ_MustBeOff}; |
215 | case Intrinsic::nvvm_fmax_ftz_f: |
216 | return {Intrinsic::maxnum, FTZ_MustBeOn}; |
217 | case Intrinsic::nvvm_fmax_nan_f: |
218 | return {Intrinsic::maximum, FTZ_MustBeOff}; |
219 | case Intrinsic::nvvm_fmax_ftz_nan_f: |
220 | return {Intrinsic::maximum, FTZ_MustBeOn}; |
221 | case Intrinsic::nvvm_fmax_f16: |
222 | return {Intrinsic::maxnum, FTZ_MustBeOff, true}; |
223 | case Intrinsic::nvvm_fmax_ftz_f16: |
224 | return {Intrinsic::maxnum, FTZ_MustBeOn, true}; |
225 | case Intrinsic::nvvm_fmax_f16x2: |
226 | return {Intrinsic::maxnum, FTZ_MustBeOff, true}; |
227 | case Intrinsic::nvvm_fmax_ftz_f16x2: |
228 | return {Intrinsic::maxnum, FTZ_MustBeOn, true}; |
229 | case Intrinsic::nvvm_fmax_nan_f16: |
230 | return {Intrinsic::maximum, FTZ_MustBeOff, true}; |
231 | case Intrinsic::nvvm_fmax_ftz_nan_f16: |
232 | return {Intrinsic::maximum, FTZ_MustBeOn, true}; |
233 | case Intrinsic::nvvm_fmax_nan_f16x2: |
234 | return {Intrinsic::maximum, FTZ_MustBeOff, true}; |
235 | case Intrinsic::nvvm_fmax_ftz_nan_f16x2: |
236 | return {Intrinsic::maximum, FTZ_MustBeOn, true}; |
237 | case Intrinsic::nvvm_fmin_d: |
238 | return {Intrinsic::minnum, FTZ_Any}; |
239 | case Intrinsic::nvvm_fmin_f: |
240 | return {Intrinsic::minnum, FTZ_MustBeOff}; |
241 | case Intrinsic::nvvm_fmin_ftz_f: |
242 | return {Intrinsic::minnum, FTZ_MustBeOn}; |
243 | case Intrinsic::nvvm_fmin_nan_f: |
244 | return {Intrinsic::minimum, FTZ_MustBeOff}; |
245 | case Intrinsic::nvvm_fmin_ftz_nan_f: |
246 | return {Intrinsic::minimum, FTZ_MustBeOn}; |
247 | case Intrinsic::nvvm_fmin_f16: |
248 | return {Intrinsic::minnum, FTZ_MustBeOff, true}; |
249 | case Intrinsic::nvvm_fmin_ftz_f16: |
250 | return {Intrinsic::minnum, FTZ_MustBeOn, true}; |
251 | case Intrinsic::nvvm_fmin_f16x2: |
252 | return {Intrinsic::minnum, FTZ_MustBeOff, true}; |
253 | case Intrinsic::nvvm_fmin_ftz_f16x2: |
254 | return {Intrinsic::minnum, FTZ_MustBeOn, true}; |
255 | case Intrinsic::nvvm_fmin_nan_f16: |
256 | return {Intrinsic::minimum, FTZ_MustBeOff, true}; |
257 | case Intrinsic::nvvm_fmin_ftz_nan_f16: |
258 | return {Intrinsic::minimum, FTZ_MustBeOn, true}; |
259 | case Intrinsic::nvvm_fmin_nan_f16x2: |
260 | return {Intrinsic::minimum, FTZ_MustBeOff, true}; |
261 | case Intrinsic::nvvm_fmin_ftz_nan_f16x2: |
262 | return {Intrinsic::minimum, FTZ_MustBeOn, true}; |
263 | case Intrinsic::nvvm_sqrt_rn_d: |
264 | return {Intrinsic::sqrt, FTZ_Any}; |
265 | case Intrinsic::nvvm_sqrt_f: |
266 | // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the |
267 | // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts |
268 | // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are |
269 | // the versions with explicit ftz-ness. |
270 | return {Intrinsic::sqrt, FTZ_Any}; |
271 | case Intrinsic::nvvm_trunc_d: |
272 | return {Intrinsic::trunc, FTZ_Any}; |
273 | case Intrinsic::nvvm_trunc_f: |
274 | return {Intrinsic::trunc, FTZ_MustBeOff}; |
275 | case Intrinsic::nvvm_trunc_ftz_f: |
276 | return {Intrinsic::trunc, FTZ_MustBeOn}; |
277 | |
278 | // NVVM intrinsics that map to LLVM cast operations. |
279 | // |
280 | // Note that llvm's target-generic conversion operators correspond to the rz |
281 | // (round to zero) versions of the nvvm conversion intrinsics, even though |
282 | // most everything else here uses the rn (round to nearest even) nvvm ops. |
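    //
    // For instance (a sketch), nvvm_f2i_rz:
    //
    //   %i = call i32 @llvm.nvvm.f2i.rz(float %x)
    //
    // is replaced with the equivalent cast
    //
    //   %i = fptosi float %x to i32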
283 | case Intrinsic::nvvm_d2i_rz: |
284 | case Intrinsic::nvvm_f2i_rz: |
285 | case Intrinsic::nvvm_d2ll_rz: |
286 | case Intrinsic::nvvm_f2ll_rz: |
287 | return {Instruction::FPToSI}; |
288 | case Intrinsic::nvvm_d2ui_rz: |
289 | case Intrinsic::nvvm_f2ui_rz: |
290 | case Intrinsic::nvvm_d2ull_rz: |
291 | case Intrinsic::nvvm_f2ull_rz: |
292 | return {Instruction::FPToUI}; |
293 | case Intrinsic::nvvm_i2d_rz: |
294 | case Intrinsic::nvvm_i2f_rz: |
295 | case Intrinsic::nvvm_ll2d_rz: |
296 | case Intrinsic::nvvm_ll2f_rz: |
297 | return {Instruction::SIToFP}; |
298 | case Intrinsic::nvvm_ui2d_rz: |
299 | case Intrinsic::nvvm_ui2f_rz: |
300 | case Intrinsic::nvvm_ull2d_rz: |
301 | case Intrinsic::nvvm_ull2f_rz: |
302 | return {Instruction::UIToFP}; |
303 | |
304 | // NVVM intrinsics that map to LLVM binary ops. |
305 | case Intrinsic::nvvm_div_rn_d: |
306 | return {Instruction::FDiv, FTZ_Any}; |
307 | |
308 | // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but |
309 | // need special handling. |
310 | // |
311 | // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just |
312 | // as well. |
313 | case Intrinsic::nvvm_rcp_rn_d: |
314 | return {SPC_Reciprocal, FTZ_Any}; |
315 | |
316 | // We do not currently simplify intrinsics that give an approximate |
317 | // answer. These include: |
318 | // |
319 | // - nvvm_cos_approx_{f,ftz_f} |
320 | // - nvvm_ex2_approx_{d,f,ftz_f} |
321 | // - nvvm_lg2_approx_{d,f,ftz_f} |
322 | // - nvvm_sin_approx_{f,ftz_f} |
323 | // - nvvm_sqrt_approx_{f,ftz_f} |
324 | // - nvvm_rsqrt_approx_{d,f,ftz_f} |
325 | // - nvvm_div_approx_{ftz_d,ftz_f,f} |
326 | // - nvvm_rcp_approx_ftz_d |
327 | // |
328 | // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast" |
329 | // means that fastmath is enabled in the intrinsic. Unfortunately only |
330 | // binary operators (currently) have a fastmath bit in SelectionDAG, so |
331 | // this information gets lost and we can't select on it. |
332 | // |
333 | // TODO: div and rcp are lowered to a binary op, so these we could in |
334 | // theory lower them to "fast fdiv". |
335 | |
336 | default: |
337 | return {}; |
338 | } |
339 | }(); |
340 | |
341 | // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we |
342 | // can bail out now. (Notice that in the case that IID is not an NVVM |
343 | // intrinsic, we don't have to look up any module metadata, as |
344 | // FtzRequirementTy will be FTZ_Any.) |
345 | if (Action.FtzRequirement != FTZ_Any) { |
346 | // FIXME: Broken for f64 |
347 | DenormalMode Mode = II->getFunction()->getDenormalMode( |
        Action.IsHalfTy ? APFloat::IEEEhalf() : APFloat::IEEEsingle());
349 | bool FtzEnabled = Mode.Output == DenormalMode::PreserveSign; |
350 | |
351 | if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn)) |
352 | return nullptr; |
353 | } |
354 | |
355 | // Simplify to target-generic intrinsic. |
356 | if (Action.IID) { |
357 | SmallVector<Value *, 4> Args(II->args()); |
358 | // All the target-generic intrinsics currently of interest to us have one |
359 | // type argument, equal to that of the nvvm intrinsic's argument. |
    Type *Tys[] = {II->getArgOperand(0)->getType()};
    return CallInst::Create(
        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
363 | } |
364 | |
365 | // Simplify to target-generic binary op. |
366 | if (Action.BinaryOp) |
    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
                                  II->getArgOperand(1), II->getName());
369 | |
370 | // Simplify to target-generic cast op. |
371 | if (Action.CastOp) |
    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
                            II->getName());
374 | |
375 | // All that's left are the special cases. |
376 | if (!Action.Special) |
377 | return nullptr; |
378 | |
379 | switch (*Action.Special) { |
380 | case SPC_Reciprocal: |
381 | // Simplify reciprocal. |
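    // A sketch of the rewrite: %r = call double @llvm.nvvm.rcp.rn.d(double %x)
    // becomes                  %r = fdiv double 1.0, %x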
    return BinaryOperator::Create(
        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
        II->getArgOperand(0), II->getName());
385 | } |
  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
387 | } |
388 | |
389 | std::optional<Instruction *> |
390 | NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { |
  if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) {
392 | return I; |
393 | } |
394 | return std::nullopt; |
395 | } |
396 | |
397 | InstructionCost NVPTXTTIImpl::getArithmeticInstrCost( |
398 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
399 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
400 | ArrayRef<const Value *> Args, |
401 | const Instruction *CxtI) { |
402 | // Legalize the type. |
403 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
404 | |
405 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
406 | |
407 | switch (ISD) { |
408 | default: |
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
411 | case ISD::ADD: |
412 | case ISD::MUL: |
413 | case ISD::XOR: |
414 | case ISD::OR: |
415 | case ISD::AND: |
416 | // The machine code (SASS) simulates an i64 with two i32. Therefore, we |
417 | // estimate that arithmetic operations on i64 are twice as expensive as |
418 | // those on types that can fit into one machine register. |
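    // For example, an 'add i64' is modeled here as costing 2, whereas an
    // 'add i32' falls through to the base implementation's (typically cheaper)
    // estimate.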
419 | if (LT.second.SimpleTy == MVT::i64) |
420 | return 2 * LT.first; |
421 | // Delegate other cases to the basic TTI. |
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info);
424 | } |
425 | } |
426 | |
void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
430 | BaseT::getUnrollingPreferences(L, SE, UP, ORE); |
431 | |
432 | // Enable partial unrolling and runtime unrolling, but reduce the |
433 | // threshold. This partially unrolls small loops which are often |
434 | // unrolled by the PTX to SASS compiler and unrolling earlier can be |
435 | // beneficial. |
436 | UP.Partial = UP.Runtime = true; |
437 | UP.PartialThreshold = UP.Threshold / 4; |
438 | } |
439 | |
440 | void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
441 | TTI::PeelingPreferences &PP) { |
442 | BaseT::getPeelingPreferences(L, SE, PP); |
443 | } |
444 | |