1 | //===- R600TargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // \file |
10 | // This file implements a TargetTransformInfo analysis pass specific to the |
11 | // R600 target machine. It uses the target's detailed information to provide |
12 | // more precise answers to certain TTI queries, while letting the target |
13 | // independent and default TTI implementations handle the rest. |
14 | // |
15 | //===----------------------------------------------------------------------===// |
16 | |
17 | #include "R600TargetTransformInfo.h" |
18 | #include "AMDGPU.h" |
19 | #include "AMDGPUTargetMachine.h" |
20 | #include "R600Subtarget.h" |
21 | |
22 | using namespace llvm; |
23 | |
24 | #define DEBUG_TYPE "R600tti" |
25 | |
26 | R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F) |
27 | : BaseT(TM, F.getDataLayout()), |
28 | ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))), |
29 | TLI(ST->getTargetLowering()), CommonTTI(TM, F) {} |
30 | |
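// The R600 register file exposes up to 128 GPRs, each with four 32-bit
// channels (x, y, z, w); count every channel as a separate register here.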
31 | unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const { |
32 | return 4 * 128; // XXX - 4 channels. Should these count as vector instead? |
33 | } |
34 | |
35 | unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const { |
36 | return getHardwareNumberOfRegisters(Vec); |
37 | } |
38 | |
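// Every register kind is reported as 32 bits wide, since R600 operates on
// 32-bit channels.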
39 | TypeSize |
40 | R600TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
return TypeSize::getFixed(32);
42 | } |
43 | |
44 | unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const { return 32; } |
45 | |
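// Widest load/store, in bits, that may be formed when vectorizing memory
// accesses in the given address space.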
46 | unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { |
47 | if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || |
48 | AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) |
49 | return 128; |
50 | if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || |
51 | AddrSpace == AMDGPUAS::REGION_ADDRESS) |
52 | return 64; |
53 | if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) |
54 | return 32; |
55 | |
56 | if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS || |
57 | AddrSpace == AMDGPUAS::PARAM_I_ADDRESS || |
58 | (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 && |
59 | AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15))) |
60 | return 128; |
llvm_unreachable("unhandled address space");
62 | } |
63 | |
64 | bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, |
65 | Align Alignment, |
66 | unsigned AddrSpace) const { |
67 | // We allow vectorization of flat stores, even though we may need to decompose |
68 | // them later if they may access private memory. We don't have enough context |
69 | // here, and legalization can handle it. |
70 | return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS); |
71 | } |
72 | |
73 | bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, |
74 | Align Alignment, |
75 | unsigned AddrSpace) const { |
76 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
77 | } |
78 | |
79 | bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, |
80 | Align Alignment, |
81 | unsigned AddrSpace) const { |
82 | return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); |
83 | } |
84 | |
85 | unsigned R600TTIImpl::getMaxInterleaveFactor(ElementCount VF) { |
86 | // Disable unrolling if the loop is not vectorized. |
87 | // TODO: Enable this again. |
88 | if (VF.isScalar()) |
89 | return 1; |
90 | |
91 | return 8; |
92 | } |
93 | |
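// For size and latency queries, PHIs are free and any other control flow
// costs one unit; otherwise branches and returns get a high fixed cost and
// everything else defers to the generic model.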
94 | InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode, |
95 | TTI::TargetCostKind CostKind, |
96 | const Instruction *I) { |
97 | if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) |
98 | return Opcode == Instruction::PHI ? 0 : 1; |
99 | |
100 | // XXX - For some reason this isn't called for switch. |
101 | switch (Opcode) { |
102 | case Instruction::Br: |
103 | case Instruction::Ret: |
104 | return 10; |
105 | default: |
106 | return BaseT::getCFInstrCost(Opcode, CostKind, I); |
107 | } |
108 | } |
109 | |
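// Inserts and extracts of 32-bit (or wider) elements are modeled as plain
// subregister accesses; narrower element types fall back to the generic cost.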
110 | InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, |
111 | TTI::TargetCostKind CostKind, |
112 | unsigned Index, Value *Op0, |
113 | Value *Op1) { |
114 | switch (Opcode) { |
115 | case Instruction::ExtractElement: |
116 | case Instruction::InsertElement: { |
117 | unsigned EltSize = |
DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
119 | if (EltSize < 32) { |
return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
121 | Op1); |
122 | } |
123 | |
124 | // Extracts are just reads of a subregister, so are free. Inserts are |
125 | // considered free because we don't want to have any cost for scalarizing |
126 | // operations, and we don't have to copy into a different register class. |
127 | |
128 | // Dynamic indexing isn't free and is best avoided. |
129 | return Index == ~0u ? 2 : 0; |
130 | } |
131 | default: |
return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
133 | } |
134 | } |
135 | |
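// Unrolling and peeling preferences are shared with the common AMDGPU TTI
// implementation.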
void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
137 | TTI::UnrollingPreferences &UP, |
138 | OptimizationRemarkEmitter *ORE) { |
139 | CommonTTI.getUnrollingPreferences(L, SE, UP, ORE); |
140 | } |
141 | |
142 | void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
143 | TTI::PeelingPreferences &PP) { |
144 | CommonTTI.getPeelingPreferences(L, SE, PP); |
145 | } |
146 | |