1 | //===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements a TargetTransformInfo analysis pass specific to the |
10 | // SystemZ target machine. It uses the target's detailed information to provide |
11 | // more precise answers to certain TTI queries, while letting the target |
12 | // independent and default TTI implementations handle the rest. |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #include "SystemZTargetTransformInfo.h" |
17 | #include "llvm/Analysis/TargetTransformInfo.h" |
18 | #include "llvm/CodeGen/BasicTTIImpl.h" |
19 | #include "llvm/CodeGen/TargetLowering.h" |
20 | #include "llvm/IR/DerivedTypes.h" |
21 | #include "llvm/IR/InstIterator.h" |
22 | #include "llvm/IR/IntrinsicInst.h" |
23 | #include "llvm/IR/Intrinsics.h" |
24 | #include "llvm/Support/Debug.h" |
25 | #include "llvm/Support/InstructionCost.h" |
26 | #include "llvm/Support/MathExtras.h" |
27 | |
28 | using namespace llvm; |
29 | |
30 | #define DEBUG_TYPE "systemztti" |
31 | |
32 | //===----------------------------------------------------------------------===// |
33 | // |
34 | // SystemZ cost model. |
35 | // |
36 | //===----------------------------------------------------------------------===// |
37 | |
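// Return true if V is used as the source of a non-volatile memcpy, looking
// through bitcasts and GEPs. OtherUse is set to true if V (or a bitcast/GEP
// of it) has any other kind of use.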
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
  bool UsedAsMemCpySource = false;
  for (const User *U : V->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
        UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
        continue;
      }
      if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
        if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
          UsedAsMemCpySource = true;
          continue;
        }
      }
      OtherUse = true;
    }
  return UsedAsMemCpySource;
}
56 | |
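// Count the number of non-volatile loads and stores of Ptr (looking through
// GEPs) that occur inside function F.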
static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
                                unsigned &NumLoads, const Function *F) {
  if (!isa<PointerType>(Ptr->getType()))
    return;
  for (const User *U : Ptr->users())
    if (const Instruction *User = dyn_cast<Instruction>(U)) {
      if (User->getParent()->getParent() == F) {
        if (const auto *SI = dyn_cast<StoreInst>(User)) {
          if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
            NumStores++;
        } else if (const auto *LI = dyn_cast<LoadInst>(User)) {
          if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
            NumLoads++;
        } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
          if (GEP->getPointerOperand() == Ptr)
            countNumMemAccesses(GEP, NumStores, NumLoads, F);
        }
      }
    }
}
77 | |
78 | unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const { |
79 | unsigned Bonus = 0; |
80 | const Function *Caller = CB->getParent()->getParent(); |
81 | const Function *Callee = CB->getCalledFunction(); |
82 | if (!Callee) |
83 | return 0; |
84 | |
85 | // Increase the threshold if an incoming argument is used only as a memcpy |
86 | // source. |
87 | for (const Argument &Arg : Callee->args()) { |
88 | bool OtherUse = false; |
89 | if (isUsedAsMemCpySource(V: &Arg, OtherUse) && !OtherUse) { |
90 | Bonus = 1000; |
91 | break; |
92 | } |
93 | } |
94 | |
  // Give a bonus for globals that are used heavily in both the caller and a
  // relatively small callee.
97 | unsigned InstrCount = 0; |
98 | SmallDenseMap<const Value *, unsigned> Ptr2NumUses; |
99 | for (auto &I : instructions(F: Callee)) { |
100 | if (++InstrCount == 200) { |
101 | Ptr2NumUses.clear(); |
102 | break; |
103 | } |
104 | if (const auto *SI = dyn_cast<StoreInst>(Val: &I)) { |
105 | if (!SI->isVolatile()) |
106 | if (auto *GV = dyn_cast<GlobalVariable>(Val: SI->getPointerOperand())) |
107 | Ptr2NumUses[GV]++; |
108 | } else if (const auto *LI = dyn_cast<LoadInst>(Val: &I)) { |
109 | if (!LI->isVolatile()) |
110 | if (auto *GV = dyn_cast<GlobalVariable>(Val: LI->getPointerOperand())) |
111 | Ptr2NumUses[GV]++; |
112 | } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) { |
113 | if (auto *GV = dyn_cast<GlobalVariable>(Val: GEP->getPointerOperand())) { |
114 | unsigned NumStores = 0, NumLoads = 0; |
115 | countNumMemAccesses(Ptr: GEP, NumStores, NumLoads, F: Callee); |
116 | Ptr2NumUses[GV] += NumLoads + NumStores; |
117 | } |
118 | } |
119 | } |
120 | |
121 | for (auto [Ptr, NumCalleeUses] : Ptr2NumUses) |
122 | if (NumCalleeUses > 10) { |
123 | unsigned CallerStores = 0, CallerLoads = 0; |
124 | countNumMemAccesses(Ptr, NumStores&: CallerStores, NumLoads&: CallerLoads, F: Caller); |
125 | if (CallerStores + CallerLoads > 10) { |
126 | Bonus = 1000; |
127 | break; |
128 | } |
129 | } |
130 | |
131 | // Give bonus when Callee accesses an Alloca of Caller heavily. |
132 | unsigned NumStores = 0; |
133 | unsigned NumLoads = 0; |
134 | for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) { |
135 | Value *CallerArg = CB->getArgOperand(i: OpIdx); |
136 | Argument *CalleeArg = Callee->getArg(i: OpIdx); |
137 | if (isa<AllocaInst>(Val: CallerArg)) |
138 | countNumMemAccesses(Ptr: CalleeArg, NumStores, NumLoads, F: Callee); |
139 | } |
140 | if (NumLoads > 10) |
141 | Bonus += NumLoads * 50; |
142 | if (NumStores > 10) |
143 | Bonus += NumStores * 50; |
  Bonus = std::min(Bonus, unsigned(1000));

  LLVM_DEBUG(if (Bonus)
               dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
148 | return Bonus; |
149 | } |
150 | |
151 | InstructionCost |
152 | SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, |
153 | TTI::TargetCostKind CostKind) const { |
154 | assert(Ty->isIntegerTy()); |
155 | |
156 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
157 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
158 | // here, so that constant hoisting will ignore this constant. |
159 | if (BitSize == 0) |
160 | return TTI::TCC_Free; |
161 | // No cost model for operations on integers larger than 128 bit implemented yet. |
162 | if ((!ST->hasVector() && BitSize > 64) || BitSize > 128) |
163 | return TTI::TCC_Free; |
164 | |
165 | if (Imm == 0) |
166 | return TTI::TCC_Free; |
167 | |
168 | if (Imm.getBitWidth() <= 64) { |
169 | // Constants loaded via lgfi. |
170 | if (isInt<32>(x: Imm.getSExtValue())) |
171 | return TTI::TCC_Basic; |
172 | // Constants loaded via llilf. |
173 | if (isUInt<32>(x: Imm.getZExtValue())) |
174 | return TTI::TCC_Basic; |
175 | // Constants loaded via llihf: |
176 | if ((Imm.getZExtValue() & 0xffffffff) == 0) |
177 | return TTI::TCC_Basic; |
178 | |
179 | return 2 * TTI::TCC_Basic; |
180 | } |
181 | |
  // i128 immediates are loaded from the constant pool.
183 | return 2 * TTI::TCC_Basic; |
184 | } |
185 | |
186 | InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, |
187 | const APInt &Imm, Type *Ty, |
188 | TTI::TargetCostKind CostKind, |
189 | Instruction *Inst) const { |
190 | assert(Ty->isIntegerTy()); |
191 | |
192 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
193 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
194 | // here, so that constant hoisting will ignore this constant. |
195 | if (BitSize == 0) |
196 | return TTI::TCC_Free; |
197 | // No cost model for operations on integers larger than 64 bit implemented yet. |
198 | if (BitSize > 64) |
199 | return TTI::TCC_Free; |
200 | |
201 | switch (Opcode) { |
202 | default: |
203 | return TTI::TCC_Free; |
204 | case Instruction::GetElementPtr: |
205 | // Always hoist the base address of a GetElementPtr. This prevents the |
206 | // creation of new constants for every base constant that gets constant |
207 | // folded with the offset. |
208 | if (Idx == 0) |
209 | return 2 * TTI::TCC_Basic; |
210 | return TTI::TCC_Free; |
211 | case Instruction::Store: |
212 | if (Idx == 0 && Imm.getBitWidth() <= 64) { |
      // Any 8-bit immediate store can be implemented via mvi.
      if (BitSize == 8)
        return TTI::TCC_Free;
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
      if (isInt<16>(Imm.getSExtValue()))
        return TTI::TCC_Free;
219 | } |
220 | break; |
221 | case Instruction::ICmp: |
222 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
223 | // Comparisons against signed 32-bit immediates implemented via cgfi. |
224 | if (isInt<32>(x: Imm.getSExtValue())) |
225 | return TTI::TCC_Free; |
226 | // Comparisons against unsigned 32-bit immediates implemented via clgfi. |
227 | if (isUInt<32>(x: Imm.getZExtValue())) |
228 | return TTI::TCC_Free; |
229 | } |
230 | break; |
231 | case Instruction::Add: |
232 | case Instruction::Sub: |
233 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
234 | // We use algfi/slgfi to add/subtract 32-bit unsigned immediates. |
235 | if (isUInt<32>(x: Imm.getZExtValue())) |
236 | return TTI::TCC_Free; |
237 | // Or their negation, by swapping addition vs. subtraction. |
238 | if (isUInt<32>(x: -Imm.getSExtValue())) |
239 | return TTI::TCC_Free; |
240 | } |
241 | break; |
242 | case Instruction::Mul: |
243 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
244 | // We use msgfi to multiply by 32-bit signed immediates. |
245 | if (isInt<32>(x: Imm.getSExtValue())) |
246 | return TTI::TCC_Free; |
247 | } |
248 | break; |
249 | case Instruction::Or: |
250 | case Instruction::Xor: |
251 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
252 | // Masks supported by oilf/xilf. |
253 | if (isUInt<32>(x: Imm.getZExtValue())) |
254 | return TTI::TCC_Free; |
255 | // Masks supported by oihf/xihf. |
256 | if ((Imm.getZExtValue() & 0xffffffff) == 0) |
257 | return TTI::TCC_Free; |
258 | } |
259 | break; |
260 | case Instruction::And: |
261 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
      // Any 32-bit AND operation can be implemented via nilf.
263 | if (BitSize <= 32) |
264 | return TTI::TCC_Free; |
265 | // 64-bit masks supported by nilf. |
266 | if (isUInt<32>(x: ~Imm.getZExtValue())) |
267 | return TTI::TCC_Free; |
268 | // 64-bit masks supported by nilh. |
269 | if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff) |
270 | return TTI::TCC_Free; |
271 | // Some 64-bit AND operations can be implemented via risbg. |
272 | const SystemZInstrInfo *TII = ST->getInstrInfo(); |
273 | unsigned Start, End; |
274 | if (TII->isRxSBGMask(Mask: Imm.getZExtValue(), BitSize, Start, End)) |
275 | return TTI::TCC_Free; |
276 | } |
277 | break; |
278 | case Instruction::Shl: |
279 | case Instruction::LShr: |
280 | case Instruction::AShr: |
281 | // Always return TCC_Free for the shift value of a shift instruction. |
282 | if (Idx == 1) |
283 | return TTI::TCC_Free; |
284 | break; |
285 | case Instruction::UDiv: |
286 | case Instruction::SDiv: |
287 | case Instruction::URem: |
288 | case Instruction::SRem: |
289 | case Instruction::Trunc: |
290 | case Instruction::ZExt: |
291 | case Instruction::SExt: |
292 | case Instruction::IntToPtr: |
293 | case Instruction::PtrToInt: |
294 | case Instruction::BitCast: |
295 | case Instruction::PHI: |
296 | case Instruction::Call: |
297 | case Instruction::Select: |
298 | case Instruction::Ret: |
299 | case Instruction::Load: |
300 | break; |
301 | } |
302 | |
303 | return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind); |
304 | } |
305 | |
306 | InstructionCost |
307 | SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
308 | const APInt &Imm, Type *Ty, |
309 | TTI::TargetCostKind CostKind) const { |
310 | assert(Ty->isIntegerTy()); |
311 | |
312 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
313 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
314 | // here, so that constant hoisting will ignore this constant. |
315 | if (BitSize == 0) |
316 | return TTI::TCC_Free; |
317 | // No cost model for operations on integers larger than 64 bit implemented yet. |
318 | if (BitSize > 64) |
319 | return TTI::TCC_Free; |
320 | |
321 | switch (IID) { |
322 | default: |
323 | return TTI::TCC_Free; |
324 | case Intrinsic::sadd_with_overflow: |
325 | case Intrinsic::uadd_with_overflow: |
326 | case Intrinsic::ssub_with_overflow: |
327 | case Intrinsic::usub_with_overflow: |
328 | // These get expanded to include a normal addition/subtraction. |
329 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
330 | if (isUInt<32>(x: Imm.getZExtValue())) |
331 | return TTI::TCC_Free; |
332 | if (isUInt<32>(x: -Imm.getSExtValue())) |
333 | return TTI::TCC_Free; |
334 | } |
335 | break; |
336 | case Intrinsic::smul_with_overflow: |
337 | case Intrinsic::umul_with_overflow: |
338 | // These get expanded to include a normal multiplication. |
339 | if (Idx == 1 && Imm.getBitWidth() <= 64) { |
340 | if (isInt<32>(x: Imm.getSExtValue())) |
341 | return TTI::TCC_Free; |
342 | } |
343 | break; |
344 | case Intrinsic::experimental_stackmap: |
345 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
346 | return TTI::TCC_Free; |
347 | break; |
348 | case Intrinsic::experimental_patchpoint_void: |
349 | case Intrinsic::experimental_patchpoint: |
350 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
351 | return TTI::TCC_Free; |
352 | break; |
353 | } |
354 | return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind); |
355 | } |
356 | |
357 | TargetTransformInfo::PopcntSupportKind |
358 | SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) const { |
359 | assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2" ); |
360 | if (ST->hasPopulationCount() && TyWidth <= 64) |
361 | return TTI::PSK_FastHardware; |
362 | return TTI::PSK_Software; |
363 | } |
364 | |
void SystemZTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
368 | // Find out if L contains a call, what the machine instruction count |
369 | // estimate is, and how many stores there are. |
370 | bool HasCall = false; |
371 | InstructionCost NumStores = 0; |
372 | for (auto &BB : L->blocks()) |
373 | for (auto &I : *BB) { |
374 | if (isa<CallInst>(Val: &I) || isa<InvokeInst>(Val: &I)) { |
375 | if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) { |
376 | if (isLoweredToCall(F)) |
377 | HasCall = true; |
378 | if (F->getIntrinsicID() == Intrinsic::memcpy || |
379 | F->getIntrinsicID() == Intrinsic::memset) |
380 | NumStores++; |
381 | } else { // indirect call. |
382 | HasCall = true; |
383 | } |
384 | } |
385 | if (isa<StoreInst>(Val: &I)) { |
386 | Type *MemAccessTy = I.getOperand(i: 0)->getType(); |
387 | NumStores += getMemoryOpCost(Opcode: Instruction::Store, Src: MemAccessTy, Alignment: Align(), |
388 | AddressSpace: 0, CostKind: TTI::TCK_RecipThroughput); |
389 | } |
390 | } |
391 | |
392 | // The z13 processor will run out of store tags if too many stores |
393 | // are fed into it too quickly. Therefore make sure there are not |
394 | // too many stores in the resulting unrolled loop. |
395 | unsigned const NumStoresVal = NumStores.getValue(); |
396 | unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX); |
397 | |
398 | if (HasCall) { |
    // If the loop has calls, only allow full unrolling (no partial unrolling),
    // still limited by the store count above.
400 | UP.FullUnrollMaxCount = Max; |
401 | UP.MaxCount = 1; |
402 | return; |
403 | } |
404 | |
405 | UP.MaxCount = Max; |
406 | if (UP.MaxCount <= 1) |
407 | return; |
408 | |
409 | // Allow partial and runtime trip count unrolling. |
410 | UP.Partial = UP.Runtime = true; |
411 | |
412 | UP.PartialThreshold = 75; |
413 | UP.DefaultUnrollRuntimeCount = 4; |
414 | |
415 | // Allow expensive instructions in the pre-header of the loop. |
416 | UP.AllowExpensiveTripCount = true; |
417 | |
418 | UP.Force = true; |
419 | } |
420 | |
421 | void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
422 | TTI::PeelingPreferences &PP) const { |
423 | BaseT::getPeelingPreferences(L, SE, PP); |
424 | } |
425 | |
bool SystemZTTIImpl::isLSRCostLess(
    const TargetTransformInfo::LSRCost &C1,
    const TargetTransformInfo::LSRCost &C2) const {
  // SystemZ specific: check instruction count (first), and don't care about
  // ImmCost, since offsets are checked explicitly.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.SetupCost);
}
438 | |
439 | bool SystemZTTIImpl::areInlineCompatible(const Function *Caller, |
440 | const Function *Callee) const { |
441 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
442 | |
443 | const FeatureBitset &CallerBits = |
444 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
445 | const FeatureBitset &CalleeBits = |
446 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
447 | |
448 | // Support only equal feature bitsets. Restriction should be relaxed in the |
449 | // future to allow inlining when callee's bits are subset of the caller's. |
450 | return CallerBits == CalleeBits; |
451 | } |
452 | |
453 | unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const { |
454 | bool Vector = (ClassID == 1); |
455 | if (!Vector) |
456 | // Discount the stack pointer. Also leave out %r0, since it can't |
457 | // be used in an address. |
458 | return 14; |
459 | if (ST->hasVector()) |
460 | return 32; |
461 | return 0; |
462 | } |
463 | |
464 | TypeSize |
465 | SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
466 | switch (K) { |
467 | case TargetTransformInfo::RGK_Scalar: |
468 | return TypeSize::getFixed(ExactSize: 64); |
469 | case TargetTransformInfo::RGK_FixedWidthVector: |
470 | return TypeSize::getFixed(ExactSize: ST->hasVector() ? 128 : 0); |
471 | case TargetTransformInfo::RGK_ScalableVector: |
472 | return TypeSize::getScalable(MinimumSize: 0); |
473 | } |
474 | |
475 | llvm_unreachable("Unsupported register kind" ); |
476 | } |
477 | |
478 | unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses, |
479 | unsigned NumStridedMemAccesses, |
480 | unsigned NumPrefetches, |
481 | bool HasCall) const { |
482 | // Don't prefetch a loop with many far apart accesses. |
483 | if (NumPrefetches > 16) |
484 | return UINT_MAX; |
485 | |
486 | // Emit prefetch instructions for smaller strides in cases where we think |
487 | // the hardware prefetcher might not be able to keep up. |
488 | if (NumStridedMemAccesses > 32 && !HasCall && |
489 | (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses) |
490 | return 1; |
491 | |
492 | return ST->hasMiscellaneousExtensions3() ? 8192 : 2048; |
493 | } |
494 | |
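// SystemZ's divide instructions produce both the quotient and the remainder,
// so report native div/rem support for legal scalar integer types.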
bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
  EVT VT = TLI->getValueType(DL, DataType);
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
}
499 | |
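// Return true if Op is a single-use load whose user is not a store: such an
// element load can be done directly with VLE at no extra cost (a stored
// element is instead better handled with MVC).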
static bool isFreeEltLoad(const Value *Op) {
  if (isa<LoadInst>(Op) && Op->hasOneUse()) {
    const Instruction *UserI = cast<Instruction>(*Op->user_begin());
    return !isa<StoreInst>(UserI); // Prefer MVC
  }
  return false;
}
507 | |
InstructionCost SystemZTTIImpl::getScalarizationOverhead(
    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
    TTI::TargetCostKind CostKind, bool ForPoisonSrc,
    ArrayRef<Value *> VL) const {
  unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
  InstructionCost Cost = 0;

  if (Insert && Ty->isIntOrIntVectorTy(64)) {
    // VLVGP will insert two GPRs with one instruction, while VLE will load
    // an element directly with no extra cost.
    assert((VL.empty() || VL.size() == NumElts) &&
           "Type does not match the number of values.");
    InstructionCost CurrVectorCost = 0;
    for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
      if (DemandedElts[Idx] && !(VL.size() && isFreeEltLoad(VL[Idx])))
        ++CurrVectorCost;
      if (Idx % 2 == 1) {
        Cost += std::min(InstructionCost(1), CurrVectorCost);
        CurrVectorCost = 0;
      }
    }
    Insert = false;
  }

  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                          CostKind, ForPoisonSrc, VL);
  return Cost;
}
536 | |
537 | // Return the bit size for the scalar type or vector element |
538 | // type. getScalarSizeInBits() returns 0 for a pointer type. |
539 | static unsigned getScalarSizeInBits(Type *Ty) { |
540 | unsigned Size = |
541 | (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits()); |
542 | assert(Size > 0 && "Element must have non-zero size." ); |
543 | return Size; |
544 | } |
545 | |
546 | // getNumberOfParts() calls getTypeLegalizationCost() which splits the vector |
547 | // type until it is legal. This would e.g. return 4 for <6 x i64>, instead of |
548 | // 3. |
static unsigned getNumVectorRegs(Type *Ty) {
  auto *VTy = cast<FixedVectorType>(Ty);
  unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
  assert(WideBits > 0 && "Could not compute size of vector");
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}
555 | |
556 | InstructionCost SystemZTTIImpl::getArithmeticInstrCost( |
557 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
558 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
559 | ArrayRef<const Value *> Args, const Instruction *CxtI) const { |
560 | |
561 | // TODO: Handle more cost kinds. |
562 | if (CostKind != TTI::TCK_RecipThroughput) |
563 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
564 | Opd2Info: Op2Info, Args, CxtI); |
565 | |
566 | // TODO: return a good value for BB-VECTORIZER that includes the |
567 | // immediate loads, which we do not want to count for the loop |
568 | // vectorizer, since they are hopefully hoisted out of the loop. This |
569 | // would require a new parameter 'InLoop', but not sure if constant |
570 | // args are common enough to motivate this. |
571 | |
572 | unsigned ScalarBits = Ty->getScalarSizeInBits(); |
573 | |
  // There are three cases of division and remainder: dividing by a register
  // needs a divide instruction. A divisor which is a power of two constant
  // can be implemented with a sequence of shifts. Any other constant needs a
  // multiply and shifts.
578 | const unsigned DivInstrCost = 20; |
579 | const unsigned DivMulSeqCost = 10; |
580 | const unsigned SDivPow2Cost = 4; |
581 | |
582 | bool SignedDivRem = |
583 | Opcode == Instruction::SDiv || Opcode == Instruction::SRem; |
584 | bool UnsignedDivRem = |
585 | Opcode == Instruction::UDiv || Opcode == Instruction::URem; |
586 | |
587 | // Check for a constant divisor. |
588 | bool DivRemConst = false; |
589 | bool DivRemConstPow2 = false; |
590 | if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) { |
591 | if (const Constant *C = dyn_cast<Constant>(Val: Args[1])) { |
592 | const ConstantInt *CVal = |
593 | (C->getType()->isVectorTy() |
594 | ? dyn_cast_or_null<const ConstantInt>(Val: C->getSplatValue()) |
595 | : dyn_cast<const ConstantInt>(Val: C)); |
596 | if (CVal && (CVal->getValue().isPowerOf2() || |
597 | CVal->getValue().isNegatedPowerOf2())) |
598 | DivRemConstPow2 = true; |
599 | else |
600 | DivRemConst = true; |
601 | } |
602 | } |
603 | |
604 | if (!Ty->isVectorTy()) { |
605 | // These FP operations are supported with a dedicated instruction for |
606 | // float, double and fp128 (base implementation assumes float generally |
607 | // costs 2). |
608 | if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub || |
609 | Opcode == Instruction::FMul || Opcode == Instruction::FDiv) |
610 | return 1; |
611 | |
612 | // There is no native support for FRem. |
613 | if (Opcode == Instruction::FRem) |
614 | return LIBCALL_COST; |
615 | |
616 | // Give discount for some combined logical operations if supported. |
617 | if (Args.size() == 2) { |
618 | if (Opcode == Instruction::Xor) { |
619 | for (const Value *A : Args) { |
620 | if (const Instruction *I = dyn_cast<Instruction>(Val: A)) |
621 | if (I->hasOneUse() && |
622 | (I->getOpcode() == Instruction::Or || |
623 | I->getOpcode() == Instruction::And || |
624 | I->getOpcode() == Instruction::Xor)) |
625 | if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) || |
626 | (isInt128InVR(Ty) && |
627 | (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1()))) |
628 | return 0; |
629 | } |
630 | } |
631 | else if (Opcode == Instruction::And || Opcode == Instruction::Or) { |
632 | for (const Value *A : Args) { |
633 | if (const Instruction *I = dyn_cast<Instruction>(Val: A)) |
634 | if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) && |
635 | ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) || |
636 | (isInt128InVR(Ty) && |
637 | (Opcode == Instruction::And || ST->hasVectorEnhancements1())))) |
638 | return 0; |
639 | } |
640 | } |
641 | } |
642 | |
643 | // Or requires one instruction, although it has custom handling for i64. |
644 | if (Opcode == Instruction::Or) |
645 | return 1; |
646 | |
647 | if (Opcode == Instruction::Xor && ScalarBits == 1) { |
648 | if (ST->hasLoadStoreOnCond2()) |
649 | return 5; // 2 * (li 0; loc 1); xor |
650 | return 7; // 2 * ipm sequences ; xor ; shift ; compare |
651 | } |
652 | |
653 | if (DivRemConstPow2) |
654 | return (SignedDivRem ? SDivPow2Cost : 1); |
655 | if (DivRemConst) |
656 | return DivMulSeqCost; |
657 | if (SignedDivRem || UnsignedDivRem) |
658 | return DivInstrCost; |
659 | } |
660 | else if (ST->hasVector()) { |
661 | auto *VTy = cast<FixedVectorType>(Val: Ty); |
662 | unsigned VF = VTy->getNumElements(); |
663 | unsigned NumVectors = getNumVectorRegs(Ty); |
664 | |
665 | // These vector operations are custom handled, but are still supported |
666 | // with one instruction per vector, regardless of element size. |
667 | if (Opcode == Instruction::Shl || Opcode == Instruction::LShr || |
668 | Opcode == Instruction::AShr) { |
669 | return NumVectors; |
670 | } |
671 | |
672 | if (DivRemConstPow2) |
673 | return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1)); |
674 | if (DivRemConst) { |
675 | SmallVector<Type *> Tys(Args.size(), Ty); |
676 | return VF * DivMulSeqCost + |
677 | BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind); |
678 | } |
679 | if (SignedDivRem || UnsignedDivRem) { |
680 | if (ST->hasVectorEnhancements3() && ScalarBits >= 32) |
681 | return NumVectors * DivInstrCost; |
682 | else if (VF > 4) |
683 | // Temporary hack: disable high vectorization factors with integer |
684 | // division/remainder, which will get scalarized and handled with |
685 | // GR128 registers. The mischeduler is not clever enough to avoid |
686 | // spilling yet. |
687 | return 1000; |
688 | } |
689 | |
690 | // These FP operations are supported with a single vector instruction for |
691 | // double (base implementation assumes float generally costs 2). For |
692 | // FP128, the scalar cost is 1, and there is no overhead since the values |
693 | // are already in scalar registers. |
694 | if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub || |
695 | Opcode == Instruction::FMul || Opcode == Instruction::FDiv) { |
696 | switch (ScalarBits) { |
697 | case 32: { |
698 | // The vector enhancements facility 1 provides v4f32 instructions. |
699 | if (ST->hasVectorEnhancements1()) |
700 | return NumVectors; |
701 | // Return the cost of multiple scalar invocation plus the cost of |
702 | // inserting and extracting the values. |
703 | InstructionCost ScalarCost = |
704 | getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind); |
705 | SmallVector<Type *> Tys(Args.size(), Ty); |
706 | InstructionCost Cost = |
707 | (VF * ScalarCost) + |
708 | BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind); |
709 | // FIXME: VF 2 for these FP operations are currently just as |
710 | // expensive as for VF 4. |
711 | if (VF == 2) |
712 | Cost *= 2; |
713 | return Cost; |
714 | } |
715 | case 64: |
716 | case 128: |
717 | return NumVectors; |
718 | default: |
719 | break; |
720 | } |
721 | } |
722 | |
723 | // There is no native support for FRem. |
724 | if (Opcode == Instruction::FRem) { |
725 | SmallVector<Type *> Tys(Args.size(), Ty); |
726 | InstructionCost Cost = |
727 | (VF * LIBCALL_COST) + |
728 | BaseT::getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind); |
729 | // FIXME: VF 2 for float is currently just as expensive as for VF 4. |
730 | if (VF == 2 && ScalarBits == 32) |
731 | Cost *= 2; |
732 | return Cost; |
733 | } |
734 | } |
735 | |
736 | // Fallback to the default implementation. |
737 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
738 | Args, CxtI); |
739 | } |
740 | |
741 | InstructionCost |
742 | SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, |
743 | VectorType *SrcTy, ArrayRef<int> Mask, |
744 | TTI::TargetCostKind CostKind, int Index, |
745 | VectorType *SubTp, ArrayRef<const Value *> Args, |
746 | const Instruction *CxtI) const { |
747 | Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp); |
748 | if (ST->hasVector()) { |
749 | unsigned NumVectors = getNumVectorRegs(Ty: SrcTy); |
750 | |
751 | // TODO: Since fp32 is expanded, the shuffle cost should always be 0. |
752 | |
753 | // FP128 values are always in scalar registers, so there is no work |
754 | // involved with a shuffle, except for broadcast. In that case register |
755 | // moves are done with a single instruction per element. |
756 | if (SrcTy->getScalarType()->isFP128Ty()) |
757 | return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0); |
758 | |
759 | switch (Kind) { |
760 | case TargetTransformInfo::SK_ExtractSubvector: |
761 | // ExtractSubvector Index indicates start offset. |
762 | |
763 | // Extracting a subvector from first index is a noop. |
764 | return (Index == 0 ? 0 : NumVectors); |
765 | |
766 | case TargetTransformInfo::SK_Broadcast: |
767 | // Loop vectorizer calls here to figure out the extra cost of |
768 | // broadcasting a loaded value to all elements of a vector. Since vlrep |
769 | // loads and replicates with a single instruction, adjust the returned |
770 | // value. |
771 | return NumVectors - 1; |
772 | |
773 | default: |
774 | |
775 | // SystemZ supports single instruction permutation / replication. |
776 | return NumVectors; |
777 | } |
778 | } |
779 | |
780 | return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, |
781 | SubTp); |
782 | } |
783 | |
784 | // Return the log2 difference of the element sizes of the two vector types. |
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
  unsigned Bits0 = Ty0->getScalarSizeInBits();
  unsigned Bits1 = Ty1->getScalarSizeInBits();

  if (Bits1 > Bits0)
    return (Log2_32(Bits1) - Log2_32(Bits0));

  return (Log2_32(Bits0) - Log2_32(Bits1));
}
794 | |
795 | // Return the number of instructions needed to truncate SrcTy to DstTy. |
796 | unsigned SystemZTTIImpl::getVectorTruncCost(Type *SrcTy, Type *DstTy) const { |
797 | assert (SrcTy->isVectorTy() && DstTy->isVectorTy()); |
798 | assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() > |
799 | DstTy->getPrimitiveSizeInBits().getFixedValue() && |
800 | "Packing must reduce size of vector type." ); |
801 | assert(cast<FixedVectorType>(SrcTy)->getNumElements() == |
802 | cast<FixedVectorType>(DstTy)->getNumElements() && |
803 | "Packing should not change number of elements." ); |
804 | |
805 | // TODO: Since fp32 is expanded, the extract cost should always be 0. |
806 | |
807 | unsigned NumParts = getNumVectorRegs(Ty: SrcTy); |
808 | if (NumParts <= 2) |
809 | // Up to 2 vector registers can be truncated efficiently with pack or |
810 | // permute. The latter requires an immediate mask to be loaded, which |
811 | // typically gets hoisted out of a loop. TODO: return a good value for |
812 | // BB-VECTORIZER that includes the immediate loads, which we do not want |
813 | // to count for the loop vectorizer. |
814 | return 1; |
815 | |
816 | unsigned Cost = 0; |
817 | unsigned Log2Diff = getElSizeLog2Diff(Ty0: SrcTy, Ty1: DstTy); |
818 | unsigned VF = cast<FixedVectorType>(Val: SrcTy)->getNumElements(); |
819 | for (unsigned P = 0; P < Log2Diff; ++P) { |
820 | if (NumParts > 1) |
821 | NumParts /= 2; |
822 | Cost += NumParts; |
823 | } |
824 | |
825 | // Currently, a general mix of permutes and pack instructions is output by |
826 | // isel, which follow the cost computation above except for this case which |
827 | // is one instruction less: |
828 | if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 && |
829 | DstTy->getScalarSizeInBits() == 8) |
830 | Cost--; |
831 | |
832 | return Cost; |
833 | } |
834 | |
835 | // Return the cost of converting a vector bitmask produced by a compare |
836 | // (SrcTy), to the type of the select or extend instruction (DstTy). |
837 | unsigned SystemZTTIImpl::getVectorBitmaskConversionCost(Type *SrcTy, |
838 | Type *DstTy) const { |
839 | assert (SrcTy->isVectorTy() && DstTy->isVectorTy() && |
840 | "Should only be called with vector types." ); |
841 | |
842 | unsigned PackCost = 0; |
843 | unsigned SrcScalarBits = SrcTy->getScalarSizeInBits(); |
844 | unsigned DstScalarBits = DstTy->getScalarSizeInBits(); |
845 | unsigned Log2Diff = getElSizeLog2Diff(Ty0: SrcTy, Ty1: DstTy); |
846 | if (SrcScalarBits > DstScalarBits) |
847 | // The bitmask will be truncated. |
848 | PackCost = getVectorTruncCost(SrcTy, DstTy); |
849 | else if (SrcScalarBits < DstScalarBits) { |
850 | unsigned DstNumParts = getNumVectorRegs(Ty: DstTy); |
851 | // Each vector select needs its part of the bitmask unpacked. |
852 | PackCost = Log2Diff * DstNumParts; |
853 | // Extra cost for moving part of mask before unpacking. |
854 | PackCost += DstNumParts - 1; |
855 | } |
856 | |
857 | return PackCost; |
858 | } |
859 | |
860 | // Return the type of the compared operands. This is needed to compute the |
861 | // cost for a Select / ZExt or SExt instruction. |
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
  Type *OpTy = nullptr;
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
    OpTy = CI->getOperand(0)->getType();
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
    if (LogicI->getNumOperands() == 2)
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
        if (isa<CmpInst>(LogicI->getOperand(1)))
          OpTy = CI0->getOperand(0)->getType();

  if (OpTy != nullptr) {
    if (VF == 1) {
      assert(!OpTy->isVectorTy() && "Expected scalar type");
      return OpTy;
    }
    // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
    // be either scalar or already vectorized with a same or lesser VF.
    Type *ElTy = OpTy->getScalarType();
    return FixedVectorType::get(ElTy, VF);
  }

  return nullptr;
}
885 | |
886 | // Get the cost of converting a boolean vector to a vector with same width |
887 | // and element size as Dst, plus the cost of zero extending if needed. |
888 | unsigned |
889 | SystemZTTIImpl::getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, |
890 | const Instruction *I) const { |
891 | auto *DstVTy = cast<FixedVectorType>(Val: Dst); |
892 | unsigned VF = DstVTy->getNumElements(); |
893 | unsigned Cost = 0; |
  // If we know the widths of the compared operands, get any cost of
  // converting the bitmask to match Dst. Otherwise assume the same widths.
896 | Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr); |
897 | if (CmpOpTy != nullptr) |
898 | Cost = getVectorBitmaskConversionCost(SrcTy: CmpOpTy, DstTy: Dst); |
899 | if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP) |
900 | // One 'vn' per dst vector with an immediate mask. |
901 | Cost += getNumVectorRegs(Ty: Dst); |
902 | return Cost; |
903 | } |
904 | |
905 | InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
906 | Type *Src, |
907 | TTI::CastContextHint CCH, |
908 | TTI::TargetCostKind CostKind, |
909 | const Instruction *I) const { |
910 | // FIXME: Can the logic below also be used for these cost kinds? |
911 | if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) { |
912 | auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
913 | return BaseCost == 0 ? BaseCost : 1; |
914 | } |
915 | |
916 | unsigned DstScalarBits = Dst->getScalarSizeInBits(); |
917 | unsigned SrcScalarBits = Src->getScalarSizeInBits(); |
918 | |
919 | if (!Src->isVectorTy()) { |
920 | if (Dst->isVectorTy()) |
921 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
922 | |
923 | if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) { |
924 | if (Src->isIntegerTy(Bitwidth: 128)) |
925 | return LIBCALL_COST; |
926 | if (SrcScalarBits >= 32 || |
927 | (I != nullptr && isa<LoadInst>(Val: I->getOperand(i: 0)))) |
928 | return 1; |
929 | return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/; |
930 | } |
931 | |
932 | if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) && |
933 | Dst->isIntegerTy(Bitwidth: 128)) |
934 | return LIBCALL_COST; |
935 | |
936 | if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) { |
937 | if (Src->isIntegerTy(Bitwidth: 1)) { |
938 | if (DstScalarBits == 128) { |
939 | if (Opcode == Instruction::SExt && ST->hasVectorEnhancements3()) |
940 | return 0;/*VCEQQ*/ |
941 | return 5 /*branch seq.*/; |
942 | } |
943 | |
944 | if (ST->hasLoadStoreOnCond2()) |
945 | return 2; // li 0; loc 1 |
946 | |
947 | // This should be extension of a compare i1 result, which is done with |
948 | // ipm and a varying sequence of instructions. |
949 | unsigned Cost = 0; |
950 | if (Opcode == Instruction::SExt) |
951 | Cost = (DstScalarBits < 64 ? 3 : 4); |
952 | if (Opcode == Instruction::ZExt) |
953 | Cost = 3; |
954 | Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr); |
955 | if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy()) |
956 | // If operands of an fp-type was compared, this costs +1. |
957 | Cost++; |
958 | return Cost; |
959 | } |
      else if (isInt128InVR(Dst)) {
        // Extensions from GPR to i128 (in VR) typically cost two instructions,
        // but a zero-extending load would be just one extra instruction.
        if (Opcode == Instruction::ZExt && I != nullptr)
          if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
            if (Ld->hasOneUse())
              return 1;
        return 2;
      }
969 | } |
970 | |
971 | if (Opcode == Instruction::Trunc && isInt128InVR(Ty: Src) && I != nullptr) { |
972 | if (LoadInst *Ld = dyn_cast<LoadInst>(Val: I->getOperand(i: 0))) |
973 | if (Ld->hasOneUse()) |
974 | return 0; // Will be converted to GPR load. |
975 | bool OnlyTruncatingStores = true; |
976 | for (const User *U : I->users()) |
977 | if (!isa<StoreInst>(Val: U)) { |
978 | OnlyTruncatingStores = false; |
979 | break; |
980 | } |
981 | if (OnlyTruncatingStores) |
982 | return 0; |
983 | return 2; // Vector element extraction. |
984 | } |
985 | } |
986 | else if (ST->hasVector()) { |
987 | // Vector to scalar cast. |
988 | auto *SrcVecTy = cast<FixedVectorType>(Val: Src); |
989 | auto *DstVecTy = dyn_cast<FixedVectorType>(Val: Dst); |
990 | if (!DstVecTy) { |
991 | // TODO: tune vector-to-scalar cast. |
992 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
993 | } |
994 | unsigned VF = SrcVecTy->getNumElements(); |
995 | unsigned NumDstVectors = getNumVectorRegs(Ty: Dst); |
996 | unsigned NumSrcVectors = getNumVectorRegs(Ty: Src); |
997 | |
998 | if (Opcode == Instruction::Trunc) { |
999 | if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits()) |
1000 | return 0; // Check for NOOP conversions. |
1001 | return getVectorTruncCost(SrcTy: Src, DstTy: Dst); |
1002 | } |
1003 | |
1004 | if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) { |
1005 | if (SrcScalarBits >= 8) { |
1006 | // ZExt will use either a single unpack or a vector permute. |
1007 | if (Opcode == Instruction::ZExt) |
1008 | return NumDstVectors; |
1009 | |
1010 | // SExt will be handled with one unpack per doubling of width. |
1011 | unsigned NumUnpacks = getElSizeLog2Diff(Ty0: Src, Ty1: Dst); |
1012 | |
1013 | // For types that spans multiple vector registers, some additional |
1014 | // instructions are used to setup the unpacking. |
1015 | unsigned NumSrcVectorOps = |
1016 | (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors) |
1017 | : (NumDstVectors / 2)); |
1018 | |
1019 | return (NumUnpacks * NumDstVectors) + NumSrcVectorOps; |
1020 | } |
1021 | else if (SrcScalarBits == 1) |
1022 | return getBoolVecToIntConversionCost(Opcode, Dst, I); |
1023 | } |
1024 | |
1025 | if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP || |
1026 | Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) { |
1027 | // TODO: Fix base implementation which could simplify things a bit here |
1028 | // (seems to miss on differentiating on scalar/vector types). |
1029 | |
1030 | // Only 64 bit vector conversions are natively supported before z15. |
1031 | if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) { |
1032 | if (SrcScalarBits == DstScalarBits) |
1033 | return NumDstVectors; |
1034 | |
1035 | if (SrcScalarBits == 1) |
1036 | return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors; |
1037 | } |
1038 | |
1039 | // Return the cost of multiple scalar invocation plus the cost of |
1040 | // inserting and extracting the values. Base implementation does not |
1041 | // realize float->int gets scalarized. |
1042 | InstructionCost ScalarCost = getCastInstrCost( |
1043 | Opcode, Dst: Dst->getScalarType(), Src: Src->getScalarType(), CCH, CostKind); |
1044 | InstructionCost TotCost = VF * ScalarCost; |
      bool NeedsInserts = true, NeedsExtracts = true;
      // FP128 registers do not get inserted or extracted.
      if (DstScalarBits == 128 &&
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
        NeedsInserts = false;
      if (SrcScalarBits == 128 &&
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
        NeedsExtracts = false;

      TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
                                                 NeedsExtracts, CostKind);
      TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
                                                 /*Extract*/ false, CostKind);
1058 | |
1059 | // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4. |
1060 | if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32) |
1061 | TotCost *= 2; |
1062 | |
1063 | return TotCost; |
1064 | } |
1065 | |
1066 | if (Opcode == Instruction::FPTrunc) { |
1067 | if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements. |
1068 | return VF /*ldxbr/lexbr*/ + |
1069 | BaseT::getScalarizationOverhead(InTy: DstVecTy, /*Insert*/ true, |
1070 | /*Extract*/ false, CostKind); |
1071 | else // double -> float |
1072 | return VF / 2 /*vledb*/ + std::max(a: 1U, b: VF / 4 /*vperm*/); |
1073 | } |
1074 | |
1075 | if (Opcode == Instruction::FPExt) { |
1076 | if (SrcScalarBits == 32 && DstScalarBits == 64) { |
1077 | // float -> double is very rare and currently unoptimized. Instead of |
1078 | // using vldeb, which can do two at a time, all conversions are |
1079 | // scalarized. |
1080 | return VF * 2; |
1081 | } |
1082 | // -> fp128. VF * lxdb/lxeb + extraction of elements. |
1083 | return VF + BaseT::getScalarizationOverhead(InTy: SrcVecTy, /*Insert*/ false, |
1084 | /*Extract*/ true, CostKind); |
1085 | } |
1086 | } |
1087 | |
1088 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
1089 | } |
1090 | |
1091 | // Scalar i8 / i16 operations will typically be made after first extending |
1092 | // the operands to i32. |
1093 | static unsigned getOperandsExtensionCost(const Instruction *I) { |
1094 | unsigned ExtCost = 0; |
1095 | for (Value *Op : I->operands()) |
1096 | // A load of i8 or i16 sign/zero extends to i32. |
1097 | if (!isa<LoadInst>(Val: Op) && !isa<ConstantInt>(Val: Op)) |
1098 | ExtCost++; |
1099 | |
1100 | return ExtCost; |
1101 | } |
1102 | |
1103 | InstructionCost SystemZTTIImpl::getCmpSelInstrCost( |
1104 | unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, |
1105 | TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, |
1106 | TTI::OperandValueInfo Op2Info, const Instruction *I) const { |
1107 | if (CostKind != TTI::TCK_RecipThroughput) |
1108 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1109 | Op1Info, Op2Info); |
1110 | |
1111 | if (!ValTy->isVectorTy()) { |
1112 | switch (Opcode) { |
1113 | case Instruction::ICmp: { |
1114 | // A loaded value compared with 0 with multiple users becomes Load and |
1115 | // Test. The load is then not foldable, so return 0 cost for the ICmp. |
1116 | unsigned ScalarBits = ValTy->getScalarSizeInBits(); |
1117 | if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64)) |
1118 | if (LoadInst *Ld = dyn_cast<LoadInst>(Val: I->getOperand(i: 0))) |
1119 | if (const ConstantInt *C = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1))) |
1120 | if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() && |
1121 | C->isZero()) |
1122 | return 0; |
1123 | |
1124 | unsigned Cost = 1; |
1125 | if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16) |
1126 | Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2); |
1127 | return Cost; |
1128 | } |
1129 | case Instruction::Select: |
1130 | if (ValTy->isFloatingPointTy()) |
1131 | return 4; // No LOC for FP - costs a conditional jump. |
1132 | |
1133 | // When selecting based on an i128 comparison, LOC / VSEL is possible |
1134 | // if i128 comparisons are directly supported. |
1135 | if (I != nullptr) |
1136 | if (ICmpInst *CI = dyn_cast<ICmpInst>(Val: I->getOperand(i: 0))) |
1137 | if (CI->getOperand(i_nocapture: 0)->getType()->isIntegerTy(Bitwidth: 128)) |
1138 | return ST->hasVectorEnhancements3() ? 1 : 4; |
1139 | |
1140 | // Load On Condition / Select Register available, except for i128. |
1141 | return !isInt128InVR(Ty: ValTy) ? 1 : 4; |
1142 | } |
1143 | } |
1144 | else if (ST->hasVector()) { |
1145 | unsigned VF = cast<FixedVectorType>(Val: ValTy)->getNumElements(); |
1146 | |
1147 | // Called with a compare instruction. |
1148 | if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { |
      unsigned PredicateExtraCost = 0;
      if (I != nullptr) {
        // Some predicates cost one or two extra instructions.
        switch (cast<CmpInst>(I)->getPredicate()) {
1153 | case CmpInst::Predicate::ICMP_NE: |
1154 | case CmpInst::Predicate::ICMP_UGE: |
1155 | case CmpInst::Predicate::ICMP_ULE: |
1156 | case CmpInst::Predicate::ICMP_SGE: |
1157 | case CmpInst::Predicate::ICMP_SLE: |
1158 | PredicateExtraCost = 1; |
1159 | break; |
1160 | case CmpInst::Predicate::FCMP_ONE: |
1161 | case CmpInst::Predicate::FCMP_ORD: |
1162 | case CmpInst::Predicate::FCMP_UEQ: |
1163 | case CmpInst::Predicate::FCMP_UNO: |
1164 | PredicateExtraCost = 2; |
1165 | break; |
1166 | default: |
1167 | break; |
1168 | } |
1169 | } |
1170 | |
1171 | // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of |
1172 | // floats. FIXME: <2 x float> generates same code as <4 x float>. |
1173 | unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1); |
1174 | unsigned NumVecs_cmp = getNumVectorRegs(Ty: ValTy); |
1175 | |
1176 | unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost)); |
1177 | return Cost; |
1178 | } |
1179 | else { // Called with a select instruction. |
1180 | assert (Opcode == Instruction::Select); |
1181 | |
1182 | // We can figure out the extra cost of packing / unpacking if the |
1183 | // instruction was passed and the compare instruction is found. |
1184 | unsigned PackCost = 0; |
1185 | Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr); |
1186 | if (CmpOpTy != nullptr) |
1187 | PackCost = |
1188 | getVectorBitmaskConversionCost(SrcTy: CmpOpTy, DstTy: ValTy); |
1189 | |
1190 | return getNumVectorRegs(Ty: ValTy) /*vsel*/ + PackCost; |
1191 | } |
1192 | } |
1193 | |
1194 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1195 | Op1Info, Op2Info); |
1196 | } |
1197 | |
1198 | InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
1199 | TTI::TargetCostKind CostKind, |
1200 | unsigned Index, |
1201 | const Value *Op0, |
1202 | const Value *Op1) const { |
1203 | if (Opcode == Instruction::InsertElement) { |
1204 | // Vector Element Load. |
1205 | if (Op1 != nullptr && isFreeEltLoad(Op: Op1)) |
1206 | return 0; |
1207 | |
1208 | // vlvgp will insert two grs into a vector register, so count half the |
1209 | // number of instructions as an estimate when we don't have the full |
1210 | // picture (as in getScalarizationOverhead()). |
1211 | if (Val->isIntOrIntVectorTy(BitWidth: 64)) |
1212 | return ((Index % 2 == 0) ? 1 : 0); |
1213 | } |
1214 | |
1215 | if (Opcode == Instruction::ExtractElement) { |
1216 | int Cost = ((getScalarSizeInBits(Ty: Val) == 1) ? 2 /*+test-under-mask*/ : 1); |
1217 | |
1218 | // Give a slight penalty for moving out of vector pipeline to FXU unit. |
1219 | if (Index == 0 && Val->isIntOrIntVectorTy()) |
1220 | Cost += 1; |
1221 | |
1222 | return Cost; |
1223 | } |
1224 | |
1225 | return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); |
1226 | } |
1227 | |
1228 | // Check if a load may be folded as a memory operand in its user. |
1229 | bool SystemZTTIImpl::isFoldableLoad(const LoadInst *Ld, |
1230 | const Instruction *&FoldedValue) const { |
1231 | if (!Ld->hasOneUse()) |
1232 | return false; |
1233 | FoldedValue = Ld; |
1234 | const Instruction *UserI = cast<Instruction>(Val: *Ld->user_begin()); |
1235 | unsigned LoadedBits = getScalarSizeInBits(Ty: Ld->getType()); |
1236 | unsigned TruncBits = 0; |
1237 | unsigned SExtBits = 0; |
1238 | unsigned ZExtBits = 0; |
1239 | if (UserI->hasOneUse()) { |
1240 | unsigned UserBits = UserI->getType()->getScalarSizeInBits(); |
1241 | if (isa<TruncInst>(Val: UserI)) |
1242 | TruncBits = UserBits; |
1243 | else if (isa<SExtInst>(Val: UserI)) |
1244 | SExtBits = UserBits; |
1245 | else if (isa<ZExtInst>(Val: UserI)) |
1246 | ZExtBits = UserBits; |
1247 | } |
1248 | if (TruncBits || SExtBits || ZExtBits) { |
1249 | FoldedValue = UserI; |
1250 | UserI = cast<Instruction>(Val: *UserI->user_begin()); |
1251 | // Load (single use) -> trunc/extend (single use) -> UserI |
1252 | } |
1253 | if ((UserI->getOpcode() == Instruction::Sub || |
1254 | UserI->getOpcode() == Instruction::SDiv || |
1255 | UserI->getOpcode() == Instruction::UDiv) && |
1256 | UserI->getOperand(i: 1) != FoldedValue) |
1257 | return false; // Not commutative, only RHS foldable. |
1258 | // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an |
1259 | // extension was made of the load. |
1260 | unsigned LoadOrTruncBits = |
1261 | ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits)); |
1262 | switch (UserI->getOpcode()) { |
1263 | case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64 |
1264 | case Instruction::Sub: |
1265 | case Instruction::ICmp: |
1266 | if (LoadedBits == 32 && ZExtBits == 64) |
1267 | return true; |
1268 | [[fallthrough]]; |
1269 | case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64 |
1270 | if (UserI->getOpcode() != Instruction::ICmp) { |
1271 | if (LoadedBits == 16 && |
1272 | (SExtBits == 32 || |
1273 | (SExtBits == 64 && ST->hasMiscellaneousExtensions2()))) |
1274 | return true; |
1275 | if (LoadOrTruncBits == 16) |
1276 | return true; |
1277 | } |
1278 | [[fallthrough]]; |
1279 | case Instruction::SDiv:// SE: 32->64 |
1280 | if (LoadedBits == 32 && SExtBits == 64) |
1281 | return true; |
1282 | [[fallthrough]]; |
1283 | case Instruction::UDiv: |
1284 | case Instruction::And: |
1285 | case Instruction::Or: |
1286 | case Instruction::Xor: |
1287 | // This also makes sense for float operations, but disabled for now due |
1288 | // to regressions. |
1289 | // case Instruction::FCmp: |
1290 | // case Instruction::FAdd: |
1291 | // case Instruction::FSub: |
1292 | // case Instruction::FMul: |
1293 | // case Instruction::FDiv: |
1294 | |
1295 | // All possible extensions of memory checked above. |
1296 | |
1297 | // Comparison between memory and immediate. |
1298 | if (UserI->getOpcode() == Instruction::ICmp) |
1299 | if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: UserI->getOperand(i: 1))) |
1300 | if (CI->getValue().isIntN(N: 16)) |
1301 | return true; |
1302 | return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64); |
1303 | break; |
1304 | } |
1305 | return false; |
1306 | } |
1307 | |
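// Return true if V is a call to the llvm.bswap intrinsic.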
static bool isBswapIntrinsicCall(const Value *V) {
  if (const Instruction *I = dyn_cast<Instruction>(V))
    if (auto *CI = dyn_cast<CallInst>(I))
      if (auto *F = CI->getCalledFunction())
        if (F->getIntrinsicID() == Intrinsic::bswap)
          return true;
  return false;
}
1316 | |
1317 | InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, |
1318 | Align Alignment, |
1319 | unsigned AddressSpace, |
1320 | TTI::TargetCostKind CostKind, |
1321 | TTI::OperandValueInfo OpInfo, |
1322 | const Instruction *I) const { |
1323 | assert(!Src->isVoidTy() && "Invalid type" ); |
1324 | |
1325 | // TODO: Handle other cost kinds. |
1326 | if (CostKind != TTI::TCK_RecipThroughput) |
1327 | return 1; |
1328 | |
1329 | if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) { |
1330 | // Store the load or its truncated or extended value in FoldedValue. |
1331 | const Instruction *FoldedValue = nullptr; |
1332 | if (isFoldableLoad(Ld: cast<LoadInst>(Val: I), FoldedValue)) { |
1333 | const Instruction *UserI = cast<Instruction>(Val: *FoldedValue->user_begin()); |
1334 | assert (UserI->getNumOperands() == 2 && "Expected a binop." ); |
1335 | |
1336 | // UserI can't fold two loads, so in that case return 0 cost only |
1337 | // half of the time. |
1338 | for (unsigned i = 0; i < 2; ++i) { |
1339 | if (UserI->getOperand(i) == FoldedValue) |
1340 | continue; |
1341 | |
1342 | if (Instruction *OtherOp = dyn_cast<Instruction>(Val: UserI->getOperand(i))){ |
1343 | LoadInst *OtherLoad = dyn_cast<LoadInst>(Val: OtherOp); |
1344 | if (!OtherLoad && |
1345 | (isa<TruncInst>(Val: OtherOp) || isa<SExtInst>(Val: OtherOp) || |
1346 | isa<ZExtInst>(Val: OtherOp))) |
1347 | OtherLoad = dyn_cast<LoadInst>(Val: OtherOp->getOperand(i: 0)); |
1348 | if (OtherLoad && isFoldableLoad(Ld: OtherLoad, FoldedValue/*dummy*/)) |
1349 | return i == 0; // Both operands foldable. |
1350 | } |
1351 | } |
1352 | |
1353 | return 0; // Only I is foldable in user. |
1354 | } |
1355 | } |
1356 | |
1357 | // Type legalization (via getNumberOfParts) can't handle structs |
1358 | if (TLI->getValueType(DL, Ty: Src, AllowUnknown: true) == MVT::Other) |
1359 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
1360 | CostKind); |
1361 | |
1362 | // FP128 is a legal type but kept in a register pair on older CPUs. |
1363 | if (Src->isFP128Ty() && !ST->hasVectorEnhancements1()) |
1364 | return 2; |
1365 | |
1366 | unsigned NumOps = |
1367 | (Src->isVectorTy() ? getNumVectorRegs(Ty: Src) : getNumberOfParts(Tp: Src)); |
1368 | |
1369 | // Store/Load reversed saves one instruction. |
  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
      I != nullptr) {
    if (Opcode == Instruction::Load && I->hasOneUse()) {
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
      // In case of load -> bswap -> store, return normal cost for the load.
      if (isBswapIntrinsicCall(LdUser) &&
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
        return 0;
    }
    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
      const Value *StoredVal = SI->getValueOperand();
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
        return 0;
    }
  }

  return NumOps;
}

// The generic implementation of getInterleavedMemoryOpCost() is based on
// adding costs of the memory operations plus all the extracts and inserts
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
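//
// For example, deinterleaving a factor-2 load of <8 x i32> (two 128-bit
// vector registers, both indices used) is modeled as 2 vector loads plus
// 2 permutes, i.e. a total cost of 4.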
InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) const {
  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);
  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
  unsigned VF = NumElts / Factor;
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
  unsigned NumPermutes = 0;

  if (Opcode == Instruction::Load) {
    // Loading interleave groups may have gaps, which may mean fewer
    // loads. Find out how many vectors are loaded in total, and how many
    // of them each extracted value (index) appears in.
    BitVector UsedInsts(NumVectorMemOps, false);
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
    for (unsigned Index : Indices)
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
        UsedInsts.set(Vec);
        ValueVecs[Index].set(Vec);
      }
    NumVectorMemOps = UsedInsts.count();

    for (unsigned Index : Indices) {
      // Estimate one permute per loaded source vector containing this Index,
      // except that the first vperm producing each destination vector can
      // combine two input registers.
      unsigned NumSrcVecs = ValueVecs[Index].count();
      unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
      assert(NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
    }
  } else {
    // Estimate the permutes for each stored vector as the smaller of the
    // number of elements and the number of source vectors. Subtract one per
    // destination vector since vperm combines two inputs (see above).
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
    unsigned NumDstVecs = NumVectorMemOps;
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
  }

  // Cost of load/store operations and the permutations needed.
  return NumVectorMemOps + NumPermutes;
}

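// Cost of an unordered integer add reduction: a binary tree of vector adds
// over the input vector registers, followed by a VSUM-based reduction of the
// final vector (one extra step for element types narrower than 32 bits).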
static InstructionCost getIntAddReductionCost(unsigned NumVec,
                                              unsigned ScalarBits) {
  InstructionCost Cost = 0;
  // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
  Cost += NumVec - 1;
  // For integer adds, VSUM creates shorter reductions on the final vector.
  Cost += (ScalarBits < 32) ? 3 : 2;
  return Cost;
}

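// Cost of the remaining unordered reductions (FAdd, FMul, Mul): a binary tree
// of vector operations over the input registers, plus a shuffle + op pair for
// each halving step of the final vector.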
static InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems,
                                            unsigned ScalarBits) {
  unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
  InstructionCost Cost = 0;
  // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
  Cost += NumVec - 1;
  // For each shuffle / arithmetic layer, we need 2 instructions, and we need
  // log2(Elements in Last Vector) layers.
  Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg));
  return Cost;
}

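// Reduction opcodes for which getArithmeticReductionCost() below returns a
// custom cost instead of deferring to the generic implementation.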
static bool customCostReductions(unsigned Opcode) {
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::Add || Opcode == Instruction::Mul;
}

InstructionCost
SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                           std::optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) const {
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  // The following is only for subtargets with vector math, non-ordered
  // reductions, and reasonable scalar sizes for int and fp add/mul.
  if (customCostReductions(Opcode) && ST->hasVector() &&
      !TTI::requiresOrderedReduction(FMF) &&
      ScalarBits <= SystemZ::VectorBits) {
    unsigned NumVectors = getNumVectorRegs(Ty);
    unsigned NumElems = cast<FixedVectorType>(Ty)->getNumElements();
    // Integer Add uses custom code generation, which needs to be accounted
    // for.
    if (Opcode == Instruction::Add)
      return getIntAddReductionCost(NumVectors, ScalarBits);
    // The base cost is the same across all other arithmetic instructions.
    InstructionCost Cost =
        getFastReductionCost(NumVectors, NumElems, ScalarBits);
    // But we need to account for the final op involving the scalar operand.
    if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul))
      Cost += 1;
    return Cost;
  }
  // Otherwise, fall back to the standard implementation.
  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
}

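// Min/max reductions are modeled as a binary tree of vector min/max
// instructions over the input registers, followed by (#elements - 1)
// shuffle + min/max pairs on the final vector.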
InstructionCost
SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                       FastMathFlags FMF,
                                       TTI::TargetCostKind CostKind) const {
  // Return custom costs only on subtargets with vector enhancements.
  if (ST->hasVectorEnhancements1()) {
    unsigned NumVectors = getNumVectorRegs(Ty);
    unsigned NumElems = cast<FixedVectorType>(Ty)->getNumElements();
    unsigned ScalarBits = Ty->getScalarSizeInBits();
    InstructionCost Cost = 0;
    // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
    Cost += NumVectors - 1;
    // For the final vector, we need shuffle + min/max operations, and
    // we need #Elements - 1 of them.
    Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1);
    return Cost;
  }
  // For other subtargets, fall back to the standard implementation.
  return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
}

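// Return the cost of a vector intrinsic with a known SystemZ lowering
// (currently only vector bswap, which becomes one VPERM per vector register),
// or -1 to defer to the generic implementation.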
static int
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                            const SmallVectorImpl<Type *> &ParamTys) {
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
    return getNumVectorRegs(RetTy); // VPERM

  return -1;
}

InstructionCost
SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) const {
  InstructionCost Cost = getVectorIntrinsicInstrCost(
      ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
  if (Cost != -1)
    return Cost;
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

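// For example, @llvm.vector.reduce.add.v4i32 on a full 128-bit vector is kept
// as an intrinsic (the backend handles it directly), while i64 element types
// and partial vectors are expanded by the generic code.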
bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  // Always expand on Subtargets without vector instructions.
  if (!ST->hasVector())
    return true;

  // Whether or not to expand is a per-intrinsic decision.
  switch (II->getIntrinsicID()) {
  default:
    return true;
  // Do not expand vector.reduce.add...
  case Intrinsic::vector_reduce_add:
    auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
    // ...unless the scalar size is i64 or larger,
    // or the operand vector is not full, since the
    // performance benefit is dubious in those cases.
    return VType->getScalarSizeInBits() >= 64 ||
           VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;
  }
}
