//===- R600TargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// R600 target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "R600TargetTransformInfo.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "R600Subtarget.h"

using namespace llvm;

#define DEBUG_TYPE "R600tti"

R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
    : BaseT(TM, F.getDataLayout()),
      ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}

unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

unsigned R600TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
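  // ClassID 1 is treated as the vector register class; R600 reports the same
  // hardware register count for scalar and vector queries.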
  bool Vec = ClassID == 1;
  return getHardwareNumberOfRegisters(Vec);
}

TypeSize
R600TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  return TypeSize::getFixed(32);
}

unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const { return 32; }

unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
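  // Widest load/store, in bits, that the vectorizers may form for each
  // address space.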
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
    return 128;
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS)
    return 64;
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 32;

  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
       AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
       (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
        AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
    return 128;
  llvm_unreachable("unhandled address space");
}

bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
  // We allow vectorization of flat stores, even though we may need to decompose
  // them later if they may access private memory. We don't have enough context
  // here, and legalization can handle it.
  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
}

bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
                                              Align Alignment,
                                              unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
                                               Align Alignment,
                                               unsigned AddrSpace) const {
  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}

unsigned R600TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF.isScalar())
    return 1;

  return 8;
}

InstructionCost R600TTIImpl::getCFInstrCost(unsigned Opcode,
                                            TTI::TargetCostKind CostKind,
                                            const Instruction *I) const {
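  // For size/latency cost kinds, PHIs are modeled as free and all other
  // control flow as a single instruction.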
  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
    return Opcode == Instruction::PHI ? 0 : 1;

  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode, CostKind, I);
  }
}

InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                                TTI::TargetCostKind CostKind,
                                                unsigned Index,
                                                const Value *Op0,
                                                const Value *Op1) const {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize =
        DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
                                       Op1);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
  }
}

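// Loop unrolling and peeling heuristics are delegated to the common AMDGPU
// TTI implementation (CommonTTI).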
void R600TTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
}

void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                        TTI::PeelingPreferences &PP) const {
  CommonTTI.getPeelingPreferences(L, SE, PP);
}