| 1 | //===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===// | 
|---|
| 2 | // | 
|---|
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|---|
| 4 | // See https://llvm.org/LICENSE.txt for license information. | 
|---|
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|---|
| 6 | // | 
|---|
| 7 | //===----------------------------------------------------------------------===// | 
|---|
| 8 | // | 
|---|
| 9 | /// \file This file contains the ARM definition DAG scheduling mutations which | 
|---|
| 10 | /// change inter-instruction latencies | 
|---|
| 11 | // | 
|---|
| 12 | //===----------------------------------------------------------------------===// | 
|---|
| 13 |  | 
|---|
| 14 | #include "ARMLatencyMutations.h" | 
|---|
| 15 | #include "ARMSubtarget.h" | 
|---|
| 16 | #include "Thumb2InstrInfo.h" | 
|---|
| 17 | #include "llvm/Analysis/AliasAnalysis.h" | 
|---|
| 18 | #include "llvm/CodeGen/ScheduleDAG.h" | 
|---|
| 19 | #include "llvm/CodeGen/ScheduleDAGMutation.h" | 
|---|
| 20 | #include "llvm/CodeGen/TargetInstrInfo.h" | 
|---|
| 21 | #include <algorithm> | 
|---|
| 22 | #include <array> | 
|---|
| 23 | #include <initializer_list> | 
|---|
| 24 | #include <memory> | 
|---|
| 25 |  | 
|---|
| 26 | namespace llvm { | 
|---|
| 27 |  | 
|---|
| 28 | namespace { | 
|---|
| 29 |  | 
|---|
| 30 | // Precompute information about opcodes to speed up pass | 
|---|
| 31 |  | 
|---|
| 32 | class InstructionInformation { | 
|---|
| 33 | protected: | 
|---|
| 34 | struct IInfo { | 
|---|
| 35 | bool HasBRegAddr : 1;      // B-side of addr gen is a register | 
|---|
| 36 | bool HasBRegAddrShift : 1; // B-side of addr gen has a shift | 
|---|
| 37 | bool IsDivide : 1;         // Some form of integer divide | 
|---|
| 38 | bool IsInlineShiftALU : 1; // Inline shift+ALU | 
|---|
| 39 | bool IsMultiply : 1;       // Some form of integer multiply | 
|---|
| 40 | bool IsMVEIntMAC : 1;      // MVE 8/16/32-bit integer MAC operation | 
|---|
| 41 | bool IsNonSubwordLoad : 1; // Load which is a word or larger | 
|---|
| 42 | bool IsShift : 1;          // Shift operation | 
|---|
| 43 | bool IsRev : 1;            // REV operation | 
|---|
| 44 | bool ProducesQP : 1;       // Produces a vector register result | 
|---|
| 45 | bool ProducesDP : 1;       // Produces a double-precision register result | 
|---|
| 46 | bool ProducesSP : 1;       // Produces a single-precision register result | 
|---|
| 47 | bool ConsumesQP : 1;       // Consumes a vector register result | 
|---|
| 48 | bool ConsumesDP : 1;       // Consumes a double-precision register result | 
|---|
| 49 | bool ConsumesSP : 1;       // Consumes a single-precision register result | 
|---|
| 50 | unsigned MVEIntMACMatched; // Matched operand type (for MVE) | 
|---|
| 51 | unsigned AddressOpMask;    // Mask indicating which operands go into AGU | 
|---|
| 52 | IInfo() | 
|---|
| 53 | : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false), | 
|---|
| 54 | IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false), | 
|---|
| 55 | IsNonSubwordLoad(false), IsShift(false), IsRev(false), | 
|---|
| 56 | ProducesQP(false), ProducesDP(false), ProducesSP(false), | 
|---|
| 57 | ConsumesQP(false), ConsumesDP(false), ConsumesSP(false), | 
|---|
| 58 | MVEIntMACMatched(0), AddressOpMask(0) {} | 
|---|
| 59 | }; | 
|---|
| 60 | typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray; | 
|---|
| 61 | IInfoArray Info; | 
|---|
| 62 |  | 
|---|
| 63 | public: | 
|---|
| 64 | // Always available information | 
|---|
| 65 | unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; } | 
|---|
| 66 | bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; } | 
|---|
| 67 | bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; } | 
|---|
| 68 | bool isDivide(unsigned Op) { return Info[Op].IsDivide; } | 
|---|
| 69 | bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; } | 
|---|
| 70 | bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; } | 
|---|
| 71 | bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; } | 
|---|
| 72 | bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; } | 
|---|
| 73 | bool isRev(unsigned Op) { return Info[Op].IsRev; } | 
|---|
| 74 | bool isShift(unsigned Op) { return Info[Op].IsShift; } | 
|---|
| 75 |  | 
|---|
| 76 | // information available if markDPConsumers is called. | 
|---|
| 77 | bool producesQP(unsigned Op) { return Info[Op].ProducesQP; } | 
|---|
| 78 | bool producesDP(unsigned Op) { return Info[Op].ProducesDP; } | 
|---|
| 79 | bool producesSP(unsigned Op) { return Info[Op].ProducesSP; } | 
|---|
| 80 | bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; } | 
|---|
| 81 | bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; } | 
|---|
| 82 | bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; } | 
|---|
| 83 |  | 
|---|
| 84 | bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) { | 
|---|
| 85 | return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp; | 
|---|
| 86 | } | 
|---|
| 87 |  | 
|---|
| 88 | InstructionInformation(const ARMBaseInstrInfo *TII); | 
|---|
| 89 |  | 
|---|
| 90 | protected: | 
|---|
| 91 | void markDPProducersConsumers(const ARMBaseInstrInfo *TII); | 
|---|
| 92 | }; | 
|---|
| 93 |  | 
|---|
| 94 | InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) { | 
|---|
| 95 | using namespace ARM; | 
|---|
| 96 |  | 
|---|
| 97 | std::initializer_list<unsigned> hasBRegAddrList = { | 
|---|
| 98 | t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs, | 
|---|
| 99 | tLDRr,  tLDRBr,  tLDRHr,  tSTRr,  tSTRBr,  tSTRHr, | 
|---|
| 100 | }; | 
|---|
| 101 | for (auto op : hasBRegAddrList) { | 
|---|
| 102 | Info[op].HasBRegAddr = true; | 
|---|
| 103 | } | 
|---|
| 104 |  | 
|---|
| 105 | std::initializer_list<unsigned> hasBRegAddrShiftList = { | 
|---|
| 106 | t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs, | 
|---|
| 107 | }; | 
|---|
| 108 | for (auto op : hasBRegAddrShiftList) { | 
|---|
| 109 | Info[op].HasBRegAddrShift = true; | 
|---|
| 110 | } | 
|---|
| 111 |  | 
|---|
| 112 | Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true; | 
|---|
| 113 |  | 
|---|
| 114 | std::initializer_list<unsigned> isInlineShiftALUList = { | 
|---|
| 115 | t2ADCrs,  t2ADDSrs, t2ADDrs,  t2BICrs, t2EORrs, | 
|---|
| 116 | t2ORNrs,  t2RSBSrs, t2RSBrs,  t2SBCrs, t2SUBrs, | 
|---|
| 117 | t2SUBSrs, t2CMPrs,  t2CMNzrs, t2TEQrs, t2TSTrs, | 
|---|
| 118 | }; | 
|---|
| 119 | for (auto op : isInlineShiftALUList) { | 
|---|
| 120 | Info[op].IsInlineShiftALU = true; | 
|---|
| 121 | } | 
|---|
| 122 |  | 
|---|
| 123 | Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true; | 
|---|
| 124 |  | 
|---|
| 125 | std::initializer_list<unsigned> isMultiplyList = { | 
|---|
| 126 | t2MUL,    t2MLA,     t2MLS,     t2SMLABB, t2SMLABT,  t2SMLAD,   t2SMLADX, | 
|---|
| 127 | t2SMLAL,  t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT, | 
|---|
| 128 | t2SMLATB, t2SMLATT,  t2SMLAWT,  t2SMLSD,  t2SMLSDX,  t2SMLSLD,  t2SMLSLDX, | 
|---|
| 129 | t2SMMLA,  t2SMMLAR,  t2SMMLS,   t2SMMLSR, t2SMMUL,   t2SMMULR,  t2SMUAD, | 
|---|
| 130 | t2SMUADX, t2SMULBB,  t2SMULBT,  t2SMULL,  t2SMULTB,  t2SMULTT,  t2SMULWT, | 
|---|
| 131 | t2SMUSD,  t2SMUSDX,  t2UMAAL,   t2UMLAL,  t2UMULL,   tMUL, | 
|---|
| 132 | }; | 
|---|
| 133 | for (auto op : isMultiplyList) { | 
|---|
| 134 | Info[op].IsMultiply = true; | 
|---|
| 135 | } | 
|---|
| 136 |  | 
|---|
| 137 | std::initializer_list<unsigned> isMVEIntMACList = { | 
|---|
| 138 | MVE_VMLAS_qr_i16,    MVE_VMLAS_qr_i32,    MVE_VMLAS_qr_i8, | 
|---|
| 139 | MVE_VMLA_qr_i16,     MVE_VMLA_qr_i32,     MVE_VMLA_qr_i8, | 
|---|
| 140 | MVE_VQDMLAH_qrs16,   MVE_VQDMLAH_qrs32,   MVE_VQDMLAH_qrs8, | 
|---|
| 141 | MVE_VQDMLASH_qrs16,  MVE_VQDMLASH_qrs32,  MVE_VQDMLASH_qrs8, | 
|---|
| 142 | MVE_VQRDMLAH_qrs16,  MVE_VQRDMLAH_qrs32,  MVE_VQRDMLAH_qrs8, | 
|---|
| 143 | MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8, | 
|---|
| 144 | MVE_VQDMLADHXs16,    MVE_VQDMLADHXs32,    MVE_VQDMLADHXs8, | 
|---|
| 145 | MVE_VQDMLADHs16,     MVE_VQDMLADHs32,     MVE_VQDMLADHs8, | 
|---|
| 146 | MVE_VQDMLSDHXs16,    MVE_VQDMLSDHXs32,    MVE_VQDMLSDHXs8, | 
|---|
| 147 | MVE_VQDMLSDHs16,     MVE_VQDMLSDHs32,     MVE_VQDMLSDHs8, | 
|---|
| 148 | MVE_VQRDMLADHXs16,   MVE_VQRDMLADHXs32,   MVE_VQRDMLADHXs8, | 
|---|
| 149 | MVE_VQRDMLADHs16,    MVE_VQRDMLADHs32,    MVE_VQRDMLADHs8, | 
|---|
| 150 | MVE_VQRDMLSDHXs16,   MVE_VQRDMLSDHXs32,   MVE_VQRDMLSDHXs8, | 
|---|
| 151 | MVE_VQRDMLSDHs16,    MVE_VQRDMLSDHs32,    MVE_VQRDMLSDHs8, | 
|---|
| 152 | }; | 
|---|
| 153 | for (auto op : isMVEIntMACList) { | 
|---|
| 154 | Info[op].IsMVEIntMAC = true; | 
|---|
| 155 | } | 
|---|
| 156 |  | 
|---|
| 157 | std::initializer_list<unsigned> isNonSubwordLoadList = { | 
|---|
| 158 | t2LDRi12, t2LDRi8,  t2LDR_POST,  t2LDR_PRE,  t2LDRpci, | 
|---|
| 159 | t2LDRs,   t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi, | 
|---|
| 160 | tLDRpci,  tLDRr,    tLDRspi, | 
|---|
| 161 | }; | 
|---|
| 162 | for (auto op : isNonSubwordLoadList) { | 
|---|
| 163 | Info[op].IsNonSubwordLoad = true; | 
|---|
| 164 | } | 
|---|
| 165 |  | 
|---|
| 166 | std::initializer_list<unsigned> isRevList = { | 
|---|
| 167 | t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH, | 
|---|
| 168 | }; | 
|---|
| 169 | for (auto op : isRevList) { | 
|---|
| 170 | Info[op].IsRev = true; | 
|---|
| 171 | } | 
|---|
| 172 |  | 
|---|
| 173 | std::initializer_list<unsigned> isShiftList = { | 
|---|
| 174 | t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr, | 
|---|
| 175 | tASRri,  tASRrr,  tLSLSri, tLSLri,  tLSLrr,  tLSRri,  tLSRrr,  tROR, | 
|---|
| 176 | }; | 
|---|
| 177 | for (auto op : isShiftList) { | 
|---|
| 178 | Info[op].IsShift = true; | 
|---|
| 179 | } | 
|---|
| 180 |  | 
|---|
| 181 | std::initializer_list<unsigned> Address1List = { | 
|---|
| 182 | t2LDRBi12, | 
|---|
| 183 | t2LDRBi8, | 
|---|
| 184 | t2LDRBpci, | 
|---|
| 185 | t2LDRBs, | 
|---|
| 186 | t2LDRHi12, | 
|---|
| 187 | t2LDRHi8, | 
|---|
| 188 | t2LDRHpci, | 
|---|
| 189 | t2LDRHs, | 
|---|
| 190 | t2LDRSBi12, | 
|---|
| 191 | t2LDRSBi8, | 
|---|
| 192 | t2LDRSBpci, | 
|---|
| 193 | t2LDRSBs, | 
|---|
| 194 | t2LDRSHi12, | 
|---|
| 195 | t2LDRSHi8, | 
|---|
| 196 | t2LDRSHpci, | 
|---|
| 197 | t2LDRSHs, | 
|---|
| 198 | t2LDRi12, | 
|---|
| 199 | t2LDRi8, | 
|---|
| 200 | t2LDRpci, | 
|---|
| 201 | t2LDRs, | 
|---|
| 202 | tLDRBi, | 
|---|
| 203 | tLDRBr, | 
|---|
| 204 | tLDRHi, | 
|---|
| 205 | tLDRHr, | 
|---|
| 206 | tLDRSB, | 
|---|
| 207 | tLDRSH, | 
|---|
| 208 | tLDRi, | 
|---|
| 209 | tLDRpci, | 
|---|
| 210 | tLDRr, | 
|---|
| 211 | tLDRspi, | 
|---|
| 212 | t2STRBi12, | 
|---|
| 213 | t2STRBi8, | 
|---|
| 214 | t2STRBs, | 
|---|
| 215 | t2STRHi12, | 
|---|
| 216 | t2STRHi8, | 
|---|
| 217 | t2STRHs, | 
|---|
| 218 | t2STRi12, | 
|---|
| 219 | t2STRi8, | 
|---|
| 220 | t2STRs, | 
|---|
| 221 | tSTRBi, | 
|---|
| 222 | tSTRBr, | 
|---|
| 223 | tSTRHi, | 
|---|
| 224 | tSTRHr, | 
|---|
| 225 | tSTRi, | 
|---|
| 226 | tSTRr, | 
|---|
| 227 | tSTRspi, | 
|---|
| 228 | VLDRD, | 
|---|
| 229 | VLDRH, | 
|---|
| 230 | VLDRS, | 
|---|
| 231 | VSTRD, | 
|---|
| 232 | VSTRH, | 
|---|
| 233 | VSTRS, | 
|---|
| 234 | MVE_VLD20_16, | 
|---|
| 235 | MVE_VLD20_32, | 
|---|
| 236 | MVE_VLD20_8, | 
|---|
| 237 | MVE_VLD21_16, | 
|---|
| 238 | MVE_VLD21_32, | 
|---|
| 239 | MVE_VLD21_8, | 
|---|
| 240 | MVE_VLD40_16, | 
|---|
| 241 | MVE_VLD40_32, | 
|---|
| 242 | MVE_VLD40_8, | 
|---|
| 243 | MVE_VLD41_16, | 
|---|
| 244 | MVE_VLD41_32, | 
|---|
| 245 | MVE_VLD41_8, | 
|---|
| 246 | MVE_VLD42_16, | 
|---|
| 247 | MVE_VLD42_32, | 
|---|
| 248 | MVE_VLD42_8, | 
|---|
| 249 | MVE_VLD43_16, | 
|---|
| 250 | MVE_VLD43_32, | 
|---|
| 251 | MVE_VLD43_8, | 
|---|
| 252 | MVE_VLDRBS16, | 
|---|
| 253 | MVE_VLDRBS16_rq, | 
|---|
| 254 | MVE_VLDRBS32, | 
|---|
| 255 | MVE_VLDRBS32_rq, | 
|---|
| 256 | MVE_VLDRBU16, | 
|---|
| 257 | MVE_VLDRBU16_rq, | 
|---|
| 258 | MVE_VLDRBU32, | 
|---|
| 259 | MVE_VLDRBU32_rq, | 
|---|
| 260 | MVE_VLDRBU8, | 
|---|
| 261 | MVE_VLDRBU8_rq, | 
|---|
| 262 | MVE_VLDRDU64_qi, | 
|---|
| 263 | MVE_VLDRDU64_rq, | 
|---|
| 264 | MVE_VLDRDU64_rq_u, | 
|---|
| 265 | MVE_VLDRHS32, | 
|---|
| 266 | MVE_VLDRHS32_rq, | 
|---|
| 267 | MVE_VLDRHS32_rq_u, | 
|---|
| 268 | MVE_VLDRHU16, | 
|---|
| 269 | MVE_VLDRHU16_rq, | 
|---|
| 270 | MVE_VLDRHU16_rq_u, | 
|---|
| 271 | MVE_VLDRHU32, | 
|---|
| 272 | MVE_VLDRHU32_rq, | 
|---|
| 273 | MVE_VLDRHU32_rq_u, | 
|---|
| 274 | MVE_VLDRWU32, | 
|---|
| 275 | MVE_VLDRWU32_qi, | 
|---|
| 276 | MVE_VLDRWU32_rq, | 
|---|
| 277 | MVE_VLDRWU32_rq_u, | 
|---|
| 278 | MVE_VST20_16, | 
|---|
| 279 | MVE_VST20_32, | 
|---|
| 280 | MVE_VST20_8, | 
|---|
| 281 | MVE_VST21_16, | 
|---|
| 282 | MVE_VST21_32, | 
|---|
| 283 | MVE_VST21_8, | 
|---|
| 284 | MVE_VST40_16, | 
|---|
| 285 | MVE_VST40_32, | 
|---|
| 286 | MVE_VST40_8, | 
|---|
| 287 | MVE_VST41_16, | 
|---|
| 288 | MVE_VST41_32, | 
|---|
| 289 | MVE_VST41_8, | 
|---|
| 290 | MVE_VST42_16, | 
|---|
| 291 | MVE_VST42_32, | 
|---|
| 292 | MVE_VST42_8, | 
|---|
| 293 | MVE_VST43_16, | 
|---|
| 294 | MVE_VST43_32, | 
|---|
| 295 | MVE_VST43_8, | 
|---|
| 296 | MVE_VSTRB16, | 
|---|
| 297 | MVE_VSTRB16_rq, | 
|---|
| 298 | MVE_VSTRB32, | 
|---|
| 299 | MVE_VSTRB32_rq, | 
|---|
| 300 | MVE_VSTRBU8, | 
|---|
| 301 | MVE_VSTRB8_rq, | 
|---|
| 302 | MVE_VSTRD64_qi, | 
|---|
| 303 | MVE_VSTRD64_rq, | 
|---|
| 304 | MVE_VSTRD64_rq_u, | 
|---|
| 305 | MVE_VSTRH32, | 
|---|
| 306 | MVE_VSTRH32_rq, | 
|---|
| 307 | MVE_VSTRH32_rq_u, | 
|---|
| 308 | MVE_VSTRHU16, | 
|---|
| 309 | MVE_VSTRH16_rq, | 
|---|
| 310 | MVE_VSTRH16_rq_u, | 
|---|
| 311 | MVE_VSTRWU32, | 
|---|
| 312 | MVE_VSTRW32_qi, | 
|---|
| 313 | MVE_VSTRW32_rq, | 
|---|
| 314 | MVE_VSTRW32_rq_u, | 
|---|
| 315 | }; | 
|---|
| 316 | std::initializer_list<unsigned> Address2List = { | 
|---|
| 317 | t2LDRB_POST, | 
|---|
| 318 | t2LDRB_PRE, | 
|---|
| 319 | t2LDRDi8, | 
|---|
| 320 | t2LDRH_POST, | 
|---|
| 321 | t2LDRH_PRE, | 
|---|
| 322 | t2LDRSB_POST, | 
|---|
| 323 | t2LDRSB_PRE, | 
|---|
| 324 | t2LDRSH_POST, | 
|---|
| 325 | t2LDRSH_PRE, | 
|---|
| 326 | t2LDR_POST, | 
|---|
| 327 | t2LDR_PRE, | 
|---|
| 328 | t2STRB_POST, | 
|---|
| 329 | t2STRB_PRE, | 
|---|
| 330 | t2STRDi8, | 
|---|
| 331 | t2STRH_POST, | 
|---|
| 332 | t2STRH_PRE, | 
|---|
| 333 | t2STR_POST, | 
|---|
| 334 | t2STR_PRE, | 
|---|
| 335 | MVE_VLD20_16_wb, | 
|---|
| 336 | MVE_VLD20_32_wb, | 
|---|
| 337 | MVE_VLD20_8_wb, | 
|---|
| 338 | MVE_VLD21_16_wb, | 
|---|
| 339 | MVE_VLD21_32_wb, | 
|---|
| 340 | MVE_VLD21_8_wb, | 
|---|
| 341 | MVE_VLD40_16_wb, | 
|---|
| 342 | MVE_VLD40_32_wb, | 
|---|
| 343 | MVE_VLD40_8_wb, | 
|---|
| 344 | MVE_VLD41_16_wb, | 
|---|
| 345 | MVE_VLD41_32_wb, | 
|---|
| 346 | MVE_VLD41_8_wb, | 
|---|
| 347 | MVE_VLD42_16_wb, | 
|---|
| 348 | MVE_VLD42_32_wb, | 
|---|
| 349 | MVE_VLD42_8_wb, | 
|---|
| 350 | MVE_VLD43_16_wb, | 
|---|
| 351 | MVE_VLD43_32_wb, | 
|---|
| 352 | MVE_VLD43_8_wb, | 
|---|
| 353 | MVE_VLDRBS16_post, | 
|---|
| 354 | MVE_VLDRBS16_pre, | 
|---|
| 355 | MVE_VLDRBS32_post, | 
|---|
| 356 | MVE_VLDRBS32_pre, | 
|---|
| 357 | MVE_VLDRBU16_post, | 
|---|
| 358 | MVE_VLDRBU16_pre, | 
|---|
| 359 | MVE_VLDRBU32_post, | 
|---|
| 360 | MVE_VLDRBU32_pre, | 
|---|
| 361 | MVE_VLDRBU8_post, | 
|---|
| 362 | MVE_VLDRBU8_pre, | 
|---|
| 363 | MVE_VLDRDU64_qi_pre, | 
|---|
| 364 | MVE_VLDRHS32_post, | 
|---|
| 365 | MVE_VLDRHS32_pre, | 
|---|
| 366 | MVE_VLDRHU16_post, | 
|---|
| 367 | MVE_VLDRHU16_pre, | 
|---|
| 368 | MVE_VLDRHU32_post, | 
|---|
| 369 | MVE_VLDRHU32_pre, | 
|---|
| 370 | MVE_VLDRWU32_post, | 
|---|
| 371 | MVE_VLDRWU32_pre, | 
|---|
| 372 | MVE_VLDRWU32_qi_pre, | 
|---|
| 373 | MVE_VST20_16_wb, | 
|---|
| 374 | MVE_VST20_32_wb, | 
|---|
| 375 | MVE_VST20_8_wb, | 
|---|
| 376 | MVE_VST21_16_wb, | 
|---|
| 377 | MVE_VST21_32_wb, | 
|---|
| 378 | MVE_VST21_8_wb, | 
|---|
| 379 | MVE_VST40_16_wb, | 
|---|
| 380 | MVE_VST40_32_wb, | 
|---|
| 381 | MVE_VST40_8_wb, | 
|---|
| 382 | MVE_VST41_16_wb, | 
|---|
| 383 | MVE_VST41_32_wb, | 
|---|
| 384 | MVE_VST41_8_wb, | 
|---|
| 385 | MVE_VST42_16_wb, | 
|---|
| 386 | MVE_VST42_32_wb, | 
|---|
| 387 | MVE_VST42_8_wb, | 
|---|
| 388 | MVE_VST43_16_wb, | 
|---|
| 389 | MVE_VST43_32_wb, | 
|---|
| 390 | MVE_VST43_8_wb, | 
|---|
| 391 | MVE_VSTRB16_post, | 
|---|
| 392 | MVE_VSTRB16_pre, | 
|---|
| 393 | MVE_VSTRB32_post, | 
|---|
| 394 | MVE_VSTRB32_pre, | 
|---|
| 395 | MVE_VSTRBU8_post, | 
|---|
| 396 | MVE_VSTRBU8_pre, | 
|---|
| 397 | MVE_VSTRD64_qi_pre, | 
|---|
| 398 | MVE_VSTRH32_post, | 
|---|
| 399 | MVE_VSTRH32_pre, | 
|---|
| 400 | MVE_VSTRHU16_post, | 
|---|
| 401 | MVE_VSTRHU16_pre, | 
|---|
| 402 | MVE_VSTRWU32_post, | 
|---|
| 403 | MVE_VSTRWU32_pre, | 
|---|
| 404 | MVE_VSTRW32_qi_pre, | 
|---|
| 405 | }; | 
|---|
| 406 | std::initializer_list<unsigned> Address3List = { | 
|---|
| 407 | t2LDRD_POST, | 
|---|
| 408 | t2LDRD_PRE, | 
|---|
| 409 | t2STRD_POST, | 
|---|
| 410 | t2STRD_PRE, | 
|---|
| 411 | }; | 
|---|
| 412 | // Compute a mask of which operands are involved in address computation | 
|---|
| 413 | for (auto &op : Address1List) { | 
|---|
| 414 | Info[op].AddressOpMask = 0x6; | 
|---|
| 415 | } | 
|---|
| 416 | for (auto &op : Address2List) { | 
|---|
| 417 | Info[op].AddressOpMask = 0xc; | 
|---|
| 418 | } | 
|---|
| 419 | for (auto &op : Address3List) { | 
|---|
| 420 | Info[op].AddressOpMask = 0x18; | 
|---|
| 421 | } | 
|---|
| 422 | for (auto &op : hasBRegAddrShiftList) { | 
|---|
| 423 | Info[op].AddressOpMask |= 0x8; | 
|---|
| 424 | } | 
|---|
| 425 | } | 
|---|
| 426 |  | 
|---|
| 427 | void InstructionInformation::markDPProducersConsumers( | 
|---|
| 428 | const ARMBaseInstrInfo *TII) { | 
|---|
| 429 | // Learn about all instructions which have FP source/dest registers | 
|---|
| 430 | for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) { | 
|---|
| 431 | const MCInstrDesc &MID = TII->get(Opcode: MI); | 
|---|
| 432 | auto Operands = MID.operands(); | 
|---|
| 433 | for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) { | 
|---|
| 434 | bool MarkQP = false, MarkDP = false, MarkSP = false; | 
|---|
| 435 | switch (Operands[OI].RegClass) { | 
|---|
| 436 | case ARM::MQPRRegClassID: | 
|---|
| 437 | case ARM::DPRRegClassID: | 
|---|
| 438 | case ARM::DPR_8RegClassID: | 
|---|
| 439 | case ARM::DPR_VFP2RegClassID: | 
|---|
| 440 | case ARM::DPairRegClassID: | 
|---|
| 441 | case ARM::DPairSpcRegClassID: | 
|---|
| 442 | case ARM::DQuadRegClassID: | 
|---|
| 443 | case ARM::DQuadSpcRegClassID: | 
|---|
| 444 | case ARM::DTripleRegClassID: | 
|---|
| 445 | case ARM::DTripleSpcRegClassID: | 
|---|
| 446 | MarkDP = true; | 
|---|
| 447 | break; | 
|---|
| 448 | case ARM::QPRRegClassID: | 
|---|
| 449 | case ARM::QPR_8RegClassID: | 
|---|
| 450 | case ARM::QPR_VFP2RegClassID: | 
|---|
| 451 | case ARM::QQPRRegClassID: | 
|---|
| 452 | case ARM::QQQQPRRegClassID: | 
|---|
| 453 | MarkQP = true; | 
|---|
| 454 | break; | 
|---|
| 455 | case ARM::SPRRegClassID: | 
|---|
| 456 | case ARM::SPR_8RegClassID: | 
|---|
| 457 | case ARM::FPWithVPRRegClassID: | 
|---|
| 458 | MarkSP = true; | 
|---|
| 459 | break; | 
|---|
| 460 | default: | 
|---|
| 461 | break; | 
|---|
| 462 | } | 
|---|
| 463 | if (MarkQP) { | 
|---|
| 464 | if (OI < MID.getNumDefs()) | 
|---|
| 465 | Info[MI].ProducesQP = true; | 
|---|
| 466 | else | 
|---|
| 467 | Info[MI].ConsumesQP = true; | 
|---|
| 468 | } | 
|---|
| 469 | if (MarkDP) { | 
|---|
| 470 | if (OI < MID.getNumDefs()) | 
|---|
| 471 | Info[MI].ProducesDP = true; | 
|---|
| 472 | else | 
|---|
| 473 | Info[MI].ConsumesDP = true; | 
|---|
| 474 | } | 
|---|
| 475 | if (MarkSP) { | 
|---|
| 476 | if (OI < MID.getNumDefs()) | 
|---|
| 477 | Info[MI].ProducesSP = true; | 
|---|
| 478 | else | 
|---|
| 479 | Info[MI].ConsumesSP = true; | 
|---|
| 480 | } | 
|---|
| 481 | } | 
|---|
| 482 | } | 
|---|
| 483 | } | 
|---|
| 484 |  | 
|---|
| 485 | } // anonymous namespace | 
|---|
| 486 |  | 
|---|
| 487 | static bool hasImplicitCPSRUse(const MachineInstr *MI) { | 
|---|
| 488 | return MI->getDesc().hasImplicitUseOfPhysReg(Reg: ARM::CPSR); | 
|---|
| 489 | } | 
|---|
| 490 |  | 
|---|
| 491 | void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep, | 
|---|
| 492 | unsigned latency) { | 
|---|
| 493 | SDep Reverse = SrcDep; | 
|---|
| 494 | Reverse.setSUnit(&SrcSU); | 
|---|
| 495 | for (SDep &PDep : SrcDep.getSUnit()->Preds) { | 
|---|
| 496 | if (PDep == Reverse) { | 
|---|
| 497 | PDep.setLatency(latency); | 
|---|
| 498 | SrcDep.getSUnit()->setDepthDirty(); | 
|---|
| 499 | break; | 
|---|
| 500 | } | 
|---|
| 501 | } | 
|---|
| 502 | SrcDep.setLatency(latency); | 
|---|
| 503 | SrcSU.setHeightDirty(); | 
|---|
| 504 | } | 
|---|
| 505 |  | 
|---|
| 506 | static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) { | 
|---|
| 507 | return (a & 0xe) != (b & 0xe); | 
|---|
| 508 | } | 
|---|
| 509 |  | 
|---|
| 510 | // Set output dependences to zero latency for processors which can | 
|---|
| 511 | // simultaneously issue to the same register.  Returns true if a change | 
|---|
| 512 | // was made. | 
|---|
| 513 | bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) { | 
|---|
| 514 | if (Dep.getKind() == SDep::Output) { | 
|---|
| 515 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: 0); | 
|---|
| 516 | return true; | 
|---|
| 517 | } | 
|---|
| 518 | return false; | 
|---|
| 519 | } | 
|---|
| 520 |  | 
|---|
| 521 | // The graph doesn't look inside of bundles to determine their | 
|---|
| 522 | // scheduling boundaries and reports zero latency into and out of them | 
|---|
| 523 | // (except for CPSR into the bundle, which has latency 1). | 
|---|
| 524 | // Make some better scheduling assumptions: | 
|---|
| 525 | // 1) CPSR uses have zero latency; other uses have incoming latency 1 | 
|---|
| 526 | // 2) CPSR defs retain a latency of zero; others have a latency of 1. | 
|---|
| 527 | // | 
|---|
| 528 | // Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise | 
|---|
| 529 | unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) { | 
|---|
| 530 |  | 
|---|
| 531 | SUnit &DepSU = *Dep.getSUnit(); | 
|---|
| 532 | const MachineInstr *SrcMI = ISU.getInstr(); | 
|---|
| 533 | unsigned SrcOpcode = SrcMI->getOpcode(); | 
|---|
| 534 | const MachineInstr *DstMI = DepSU.getInstr(); | 
|---|
| 535 | unsigned DstOpcode = DstMI->getOpcode(); | 
|---|
| 536 |  | 
|---|
| 537 | if (DstOpcode == ARM::BUNDLE && TII->isPredicated(MI: *DstMI)) { | 
|---|
| 538 | setBidirLatencies( | 
|---|
| 539 | SrcSU&: ISU, SrcDep&: Dep, | 
|---|
| 540 | latency: (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1); | 
|---|
| 541 | return 1; | 
|---|
| 542 | } | 
|---|
| 543 | if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(MI: *SrcMI) && | 
|---|
| 544 | Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) { | 
|---|
| 545 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: 1); | 
|---|
| 546 | return 2; | 
|---|
| 547 | } | 
|---|
| 548 | return 0; | 
|---|
| 549 | } | 
|---|
| 550 |  | 
|---|
| 551 | // Determine whether there is a memory RAW hazard here and set up latency | 
|---|
| 552 | // accordingly | 
|---|
| 553 | bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep, | 
|---|
| 554 | unsigned latency) { | 
|---|
| 555 | if (!Dep.isNormalMemory()) | 
|---|
| 556 | return false; | 
|---|
| 557 | auto &SrcInst = *ISU.getInstr(); | 
|---|
| 558 | auto &DstInst = *Dep.getSUnit()->getInstr(); | 
|---|
| 559 | if (!SrcInst.mayStore() || !DstInst.mayLoad()) | 
|---|
| 560 | return false; | 
|---|
| 561 |  | 
|---|
| 562 | auto SrcMO = *SrcInst.memoperands().begin(); | 
|---|
| 563 | auto DstMO = *DstInst.memoperands().begin(); | 
|---|
| 564 | auto SrcVal = SrcMO->getValue(); | 
|---|
| 565 | auto DstVal = DstMO->getValue(); | 
|---|
| 566 | auto SrcPseudoVal = SrcMO->getPseudoValue(); | 
|---|
| 567 | auto DstPseudoVal = DstMO->getPseudoValue(); | 
|---|
| 568 | if (SrcVal && DstVal && AA->alias(V1: SrcVal, V2: DstVal) == AliasResult::MustAlias && | 
|---|
| 569 | SrcMO->getOffset() == DstMO->getOffset()) { | 
|---|
| 570 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency); | 
|---|
| 571 | return true; | 
|---|
| 572 | } else if (SrcPseudoVal && DstPseudoVal && | 
|---|
| 573 | SrcPseudoVal->kind() == DstPseudoVal->kind() && | 
|---|
| 574 | SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) { | 
|---|
| 575 | // Spills/fills | 
|---|
| 576 | auto FS0 = cast<FixedStackPseudoSourceValue>(Val: SrcPseudoVal); | 
|---|
| 577 | auto FS1 = cast<FixedStackPseudoSourceValue>(Val: DstPseudoVal); | 
|---|
| 578 | if (FS0 == FS1) { | 
|---|
| 579 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency); | 
|---|
| 580 | return true; | 
|---|
| 581 | } | 
|---|
| 582 | } | 
|---|
| 583 | return false; | 
|---|
| 584 | } | 
|---|
| 585 |  | 
|---|
| 586 | namespace { | 
|---|
| 587 |  | 
|---|
| 588 | std::unique_ptr<InstructionInformation> II; | 
|---|
| 589 |  | 
|---|
| 590 | class CortexM7InstructionInformation : public InstructionInformation { | 
|---|
| 591 | public: | 
|---|
| 592 | CortexM7InstructionInformation(const ARMBaseInstrInfo *TII) | 
|---|
| 593 | : InstructionInformation(TII) {} | 
|---|
| 594 | }; | 
|---|
| 595 |  | 
|---|
| 596 | class CortexM7Overrides : public ARMOverrideBypasses { | 
|---|
| 597 | public: | 
|---|
| 598 | CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA) | 
|---|
| 599 | : ARMOverrideBypasses(TII, AA) { | 
|---|
| 600 | if (!II) | 
|---|
| 601 | II.reset(p: new CortexM7InstructionInformation(TII)); | 
|---|
| 602 | } | 
|---|
| 603 |  | 
|---|
| 604 | void modifyBypasses(SUnit &) override; | 
|---|
| 605 | }; | 
|---|
| 606 |  | 
|---|
| 607 | void CortexM7Overrides::modifyBypasses(SUnit &ISU) { | 
|---|
| 608 | const MachineInstr *SrcMI = ISU.getInstr(); | 
|---|
| 609 | unsigned SrcOpcode = SrcMI->getOpcode(); | 
|---|
| 610 | bool isNSWload = II->isNonSubwordLoad(Op: SrcOpcode); | 
|---|
| 611 |  | 
|---|
| 612 | // Walk the successors looking for latency overrides that are needed | 
|---|
| 613 | for (SDep &Dep : ISU.Succs) { | 
|---|
| 614 |  | 
|---|
| 615 | // Output dependences should have 0 latency, as M7 is able to | 
|---|
| 616 | // schedule writers to the same register for simultaneous issue. | 
|---|
| 617 | if (zeroOutputDependences(ISU, Dep)) | 
|---|
| 618 | continue; | 
|---|
| 619 |  | 
|---|
| 620 | if (memoryRAWHazard(ISU, Dep, latency: 4)) | 
|---|
| 621 | continue; | 
|---|
| 622 |  | 
|---|
| 623 | // Ignore dependencies other than data | 
|---|
| 624 | if (Dep.getKind() != SDep::Data) | 
|---|
| 625 | continue; | 
|---|
| 626 |  | 
|---|
| 627 | SUnit &DepSU = *Dep.getSUnit(); | 
|---|
| 628 | if (DepSU.isBoundaryNode()) | 
|---|
| 629 | continue; | 
|---|
| 630 |  | 
|---|
| 631 | if (makeBundleAssumptions(ISU, Dep) == 1) | 
|---|
| 632 | continue; | 
|---|
| 633 |  | 
|---|
| 634 | const MachineInstr *DstMI = DepSU.getInstr(); | 
|---|
| 635 | unsigned DstOpcode = DstMI->getOpcode(); | 
|---|
| 636 |  | 
|---|
| 637 | // Word loads into any multiply or divide instruction are considered | 
|---|
| 638 | // cannot bypass their scheduling stage. Didn't do this in the .td file | 
|---|
| 639 | // because we cannot easily create a read advance that is 0 from certain | 
|---|
| 640 | // writer classes and 1 from all the rest. | 
|---|
| 641 | // (The other way around would have been easy.) | 
|---|
| 642 | if (isNSWload && (II->isMultiply(Op: DstOpcode) || II->isDivide(Op: DstOpcode))) | 
|---|
| 643 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: Dep.getLatency() + 1); | 
|---|
| 644 |  | 
|---|
| 645 | // Word loads into B operand of a load/store are considered cannot bypass | 
|---|
| 646 | // their scheduling stage. Cannot do in the .td file because | 
|---|
| 647 | // need to decide between -1 and -2 for ReadAdvance | 
|---|
| 648 | if (isNSWload && II->hasBRegAddr(Op: DstOpcode) && | 
|---|
| 649 | DstMI->getOperand(i: 2).getReg() == Dep.getReg()) | 
|---|
| 650 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: Dep.getLatency() + 1); | 
|---|
| 651 |  | 
|---|
| 652 | // Multiplies into any address generation cannot bypass from EX3.  Cannot do | 
|---|
| 653 | // in the .td file because need to decide between -1 and -2 for ReadAdvance | 
|---|
| 654 | if (II->isMultiply(Op: SrcOpcode)) { | 
|---|
| 655 | unsigned OpMask = II->getAddressOpMask(Op: DstOpcode) >> 1; | 
|---|
| 656 | for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) { | 
|---|
| 657 | if ((OpMask & 1) && DstMI->getOperand(i).isReg() && | 
|---|
| 658 | DstMI->getOperand(i).getReg() == Dep.getReg()) { | 
|---|
| 659 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: 4); // first legal bypass is EX4->EX1 | 
|---|
| 660 | break; | 
|---|
| 661 | } | 
|---|
| 662 | } | 
|---|
| 663 | } | 
|---|
| 664 |  | 
|---|
| 665 | // Mismatched conditional producers take longer on M7; they end up looking | 
|---|
| 666 | // like they were produced at EX3 and read at IS. | 
|---|
| 667 | if (TII->isPredicated(MI: *SrcMI) && Dep.isAssignedRegDep() && | 
|---|
| 668 | (SrcOpcode == ARM::BUNDLE || | 
|---|
| 669 | mismatchedPred(a: TII->getPredicate(MI: *SrcMI), | 
|---|
| 670 | b: TII->getPredicate(MI: *DstMI)))) { | 
|---|
| 671 | unsigned Lat = 1; | 
|---|
| 672 | // Operand A of shift+ALU is treated as an EX1 read instead of EX2. | 
|---|
| 673 | if (II->isInlineShiftALU(Op: DstOpcode) && DstMI->getOperand(i: 3).getImm() && | 
|---|
| 674 | DstMI->getOperand(i: 1).getReg() == Dep.getReg()) | 
|---|
| 675 | Lat = 2; | 
|---|
| 676 | Lat = std::min(a: 3u, b: Dep.getLatency() + Lat); | 
|---|
| 677 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: std::max(a: Dep.getLatency(), b: Lat)); | 
|---|
| 678 | } | 
|---|
| 679 |  | 
|---|
| 680 | // CC setter into conditional producer shouldn't have a latency of more | 
|---|
| 681 | // than 1 unless it's due to an implicit read. (All the "true" readers | 
|---|
| 682 | // of the condition code use an implicit read, and predicates use an | 
|---|
| 683 | // explicit.) | 
|---|
| 684 | if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR && | 
|---|
| 685 | TII->isPredicated(MI: *DstMI) && !hasImplicitCPSRUse(MI: DstMI)) | 
|---|
| 686 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: 1); | 
|---|
| 687 |  | 
|---|
| 688 | // REV instructions cannot bypass directly into the EX1 shifter.  The | 
|---|
| 689 | // code is slightly inexact as it doesn't attempt to ensure that the bypass | 
|---|
| 690 | // is to the shifter operands. | 
|---|
| 691 | if (II->isRev(Op: SrcOpcode)) { | 
|---|
| 692 | if (II->isInlineShiftALU(Op: DstOpcode)) | 
|---|
| 693 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: 2); | 
|---|
| 694 | else if (II->isShift(Op: DstOpcode)) | 
|---|
| 695 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: 1); | 
|---|
| 696 | } | 
|---|
| 697 | } | 
|---|
| 698 | } | 
|---|
| 699 |  | 
|---|
| 700 | class M85InstructionInformation : public InstructionInformation { | 
|---|
| 701 | public: | 
|---|
| 702 | M85InstructionInformation(const ARMBaseInstrInfo *t) | 
|---|
| 703 | : InstructionInformation(t) { | 
|---|
| 704 | markDPProducersConsumers(TII: t); | 
|---|
| 705 | } | 
|---|
| 706 | }; | 
|---|
| 707 |  | 
|---|
| 708 | class M85Overrides : public ARMOverrideBypasses { | 
|---|
| 709 | public: | 
|---|
| 710 | M85Overrides(const ARMBaseInstrInfo *t, AAResults *a) | 
|---|
| 711 | : ARMOverrideBypasses(t, a) { | 
|---|
| 712 | if (!II) | 
|---|
| 713 | II.reset(p: new M85InstructionInformation(t)); | 
|---|
| 714 | } | 
|---|
| 715 |  | 
|---|
| 716 | void modifyBypasses(SUnit &) override; | 
|---|
| 717 |  | 
|---|
| 718 | private: | 
|---|
| 719 | unsigned computeBypassStage(const MCSchedClassDesc *SCD); | 
|---|
| 720 | signed modifyMixedWidthFP(const MachineInstr *SrcMI, | 
|---|
| 721 | const MachineInstr *DstMI, unsigned RegID, | 
|---|
| 722 | const MCSchedClassDesc *SCD); | 
|---|
| 723 | }; | 
|---|
| 724 |  | 
|---|
| 725 | unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) { | 
|---|
| 726 | auto SM = DAG->getSchedModel(); | 
|---|
| 727 | unsigned DefIdx = 0; // just look for the first output's timing | 
|---|
| 728 | if (DefIdx < SCDesc->NumWriteLatencyEntries) { | 
|---|
| 729 | // Lookup the definition's write latency in SubtargetInfo. | 
|---|
| 730 | const MCWriteLatencyEntry *WLEntry = | 
|---|
| 731 | SM->getSubtargetInfo()->getWriteLatencyEntry(SC: SCDesc, DefIdx); | 
|---|
| 732 | unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000; | 
|---|
| 733 | if (Latency == 4) | 
|---|
| 734 | return 2; | 
|---|
| 735 | else if (Latency == 5) | 
|---|
| 736 | return 3; | 
|---|
| 737 | else if (Latency > 3) | 
|---|
| 738 | return 3; | 
|---|
| 739 | else | 
|---|
| 740 | return Latency; | 
|---|
| 741 | } | 
|---|
| 742 | return 2; | 
|---|
| 743 | } | 
|---|
| 744 |  | 
|---|
| 745 | // Latency changes for bypassing between FP registers of different sizes: | 
|---|
| 746 | // | 
|---|
| 747 | // Note that mixed DP/SP are unlikely because of the semantics | 
|---|
| 748 | // of C.  Mixed MVE/SP are quite common when MVE intrinsics are used. | 
|---|
| 749 | signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI, | 
|---|
| 750 | const MachineInstr *DstMI, | 
|---|
| 751 | unsigned RegID, | 
|---|
| 752 | const MCSchedClassDesc *SCD) { | 
|---|
| 753 |  | 
|---|
| 754 | if (!II->producesSP(Op: SrcMI->getOpcode()) && | 
|---|
| 755 | !II->producesDP(Op: SrcMI->getOpcode()) && | 
|---|
| 756 | !II->producesQP(Op: SrcMI->getOpcode())) | 
|---|
| 757 | return 0; | 
|---|
| 758 |  | 
|---|
| 759 | if (Register::isVirtualRegister(Reg: RegID)) { | 
|---|
| 760 | if (II->producesSP(Op: SrcMI->getOpcode()) && | 
|---|
| 761 | II->consumesDP(Op: DstMI->getOpcode())) { | 
|---|
| 762 | for (auto &OP : SrcMI->operands()) | 
|---|
| 763 | if (OP.isReg() && OP.isDef() && OP.getReg() == RegID && | 
|---|
| 764 | OP.getSubReg() == ARM::ssub_1) | 
|---|
| 765 | return 5 - computeBypassStage(SCDesc: SCD); | 
|---|
| 766 | } else if (II->producesSP(Op: SrcMI->getOpcode()) && | 
|---|
| 767 | II->consumesQP(Op: DstMI->getOpcode())) { | 
|---|
| 768 | for (auto &OP : SrcMI->operands()) | 
|---|
| 769 | if (OP.isReg() && OP.isDef() && OP.getReg() == RegID && | 
|---|
| 770 | (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3)) | 
|---|
| 771 | return 5 - computeBypassStage(SCDesc: SCD) - | 
|---|
| 772 | ((OP.getSubReg() == ARM::ssub_2 || | 
|---|
| 773 | OP.getSubReg() == ARM::ssub_3) | 
|---|
| 774 | ? 1 | 
|---|
| 775 | : 0); | 
|---|
| 776 | } else if (II->producesDP(Op: SrcMI->getOpcode()) && | 
|---|
| 777 | II->consumesQP(Op: DstMI->getOpcode())) { | 
|---|
| 778 | for (auto &OP : SrcMI->operands()) | 
|---|
| 779 | if (OP.isReg() && OP.isDef() && OP.getReg() == RegID && | 
|---|
| 780 | OP.getSubReg() == ARM::ssub_1) | 
|---|
| 781 | return -1; | 
|---|
| 782 | } else if (II->producesDP(Op: SrcMI->getOpcode()) && | 
|---|
| 783 | II->consumesSP(Op: DstMI->getOpcode())) { | 
|---|
| 784 | for (auto &OP : DstMI->operands()) | 
|---|
| 785 | if (OP.isReg() && OP.isUse() && OP.getReg() == RegID && | 
|---|
| 786 | OP.getSubReg() == ARM::ssub_1) | 
|---|
| 787 | return 5 - computeBypassStage(SCDesc: SCD); | 
|---|
| 788 | } else if (II->producesQP(Op: SrcMI->getOpcode()) && | 
|---|
| 789 | II->consumesSP(Op: DstMI->getOpcode())) { | 
|---|
| 790 | for (auto &OP : DstMI->operands()) | 
|---|
| 791 | if (OP.isReg() && OP.isUse() && OP.getReg() == RegID && | 
|---|
| 792 | (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3)) | 
|---|
| 793 | return 5 - computeBypassStage(SCDesc: SCD) + | 
|---|
| 794 | ((OP.getSubReg() == ARM::ssub_2 || | 
|---|
| 795 | OP.getSubReg() == ARM::ssub_3) | 
|---|
| 796 | ? 1 | 
|---|
| 797 | : 0); | 
|---|
| 798 | } else if (II->producesQP(Op: SrcMI->getOpcode()) && | 
|---|
| 799 | II->consumesDP(Op: DstMI->getOpcode())) { | 
|---|
| 800 | for (auto &OP : DstMI->operands()) | 
|---|
| 801 | if (OP.isReg() && OP.isUse() && OP.getReg() == RegID && | 
|---|
| 802 | OP.getSubReg() == ARM::ssub_1) | 
|---|
| 803 | return 1; | 
|---|
| 804 | } | 
|---|
| 805 | } else if (Register::isPhysicalRegister(Reg: RegID)) { | 
|---|
| 806 | // Note that when the producer is narrower, not all of the producers | 
|---|
| 807 | // may be present in the scheduling graph; somewhere earlier in the | 
|---|
| 808 | // compiler, an implicit def/use of the aliased full register gets | 
|---|
| 809 | // added to the producer, and so only that producer is seen as *the* | 
|---|
| 810 | // single producer.  This behavior also has the unfortunate effect of | 
|---|
| 811 | // serializing the producers in the compiler's view of things. | 
|---|
| 812 | if (II->producesSP(Op: SrcMI->getOpcode()) && | 
|---|
| 813 | II->consumesDP(Op: DstMI->getOpcode())) { | 
|---|
| 814 | for (auto &OP : SrcMI->operands()) | 
|---|
| 815 | if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 && | 
|---|
| 816 | OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 && | 
|---|
| 817 | (OP.getReg() == RegID || | 
|---|
| 818 | (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID || | 
|---|
| 819 | (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID)) | 
|---|
| 820 | return 5 - computeBypassStage(SCDesc: SCD); | 
|---|
| 821 | } else if (II->producesSP(Op: SrcMI->getOpcode()) && | 
|---|
| 822 | II->consumesQP(Op: DstMI->getOpcode())) { | 
|---|
| 823 | for (auto &OP : SrcMI->operands()) | 
|---|
| 824 | if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 && | 
|---|
| 825 | OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 && | 
|---|
| 826 | (OP.getReg() == RegID || | 
|---|
| 827 | (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID || | 
|---|
| 828 | (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID)) | 
|---|
| 829 | return 5 - computeBypassStage(SCDesc: SCD) - | 
|---|
| 830 | (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0); | 
|---|
| 831 | } else if (II->producesDP(Op: SrcMI->getOpcode()) && | 
|---|
| 832 | II->consumesQP(Op: DstMI->getOpcode())) { | 
|---|
| 833 | for (auto &OP : SrcMI->operands()) | 
|---|
| 834 | if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 && | 
|---|
| 835 | OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 && | 
|---|
| 836 | (OP.getReg() == RegID || | 
|---|
| 837 | (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID)) | 
|---|
| 838 | return -1; | 
|---|
| 839 | } else if (II->producesDP(Op: SrcMI->getOpcode()) && | 
|---|
| 840 | II->consumesSP(Op: DstMI->getOpcode())) { | 
|---|
| 841 | if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2) | 
|---|
| 842 | return 5 - computeBypassStage(SCDesc: SCD); | 
|---|
| 843 | } else if (II->producesQP(Op: SrcMI->getOpcode()) && | 
|---|
| 844 | II->consumesSP(Op: DstMI->getOpcode())) { | 
|---|
| 845 | if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2) | 
|---|
| 846 | return 5 - computeBypassStage(SCDesc: SCD) + | 
|---|
| 847 | (((RegID - ARM::S0) / 2) % 2 ? 1 : 0); | 
|---|
| 848 | } else if (II->producesQP(Op: SrcMI->getOpcode()) && | 
|---|
| 849 | II->consumesDP(Op: DstMI->getOpcode())) { | 
|---|
| 850 | if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2) | 
|---|
| 851 | return 1; | 
|---|
| 852 | } | 
|---|
| 853 | } | 
|---|
| 854 | return 0; | 
|---|
| 855 | } | 
|---|
| 856 |  | 
|---|
| 857 | void M85Overrides::modifyBypasses(SUnit &ISU) { | 
|---|
| 858 | const MachineInstr *SrcMI = ISU.getInstr(); | 
|---|
| 859 | unsigned SrcOpcode = SrcMI->getOpcode(); | 
|---|
| 860 | bool isNSWload = II->isNonSubwordLoad(Op: SrcOpcode); | 
|---|
| 861 |  | 
|---|
| 862 | // Walk the successors looking for latency overrides that are needed | 
|---|
| 863 | for (SDep &Dep : ISU.Succs) { | 
|---|
| 864 |  | 
|---|
| 865 | // Output dependences should have 0 latency, as CortexM85 is able to | 
|---|
| 866 | // schedule writers to the same register for simultaneous issue. | 
|---|
| 867 | if (zeroOutputDependences(ISU, Dep)) | 
|---|
| 868 | continue; | 
|---|
| 869 |  | 
|---|
| 870 | if (memoryRAWHazard(ISU, Dep, latency: 3)) | 
|---|
| 871 | continue; | 
|---|
| 872 |  | 
|---|
| 873 | // Ignore dependencies other than data or strong ordering. | 
|---|
| 874 | if (Dep.getKind() != SDep::Data) | 
|---|
| 875 | continue; | 
|---|
| 876 |  | 
|---|
| 877 | SUnit &DepSU = *Dep.getSUnit(); | 
|---|
| 878 | if (DepSU.isBoundaryNode()) | 
|---|
| 879 | continue; | 
|---|
| 880 |  | 
|---|
| 881 | if (makeBundleAssumptions(ISU, Dep) == 1) | 
|---|
| 882 | continue; | 
|---|
| 883 |  | 
|---|
| 884 | const MachineInstr *DstMI = DepSU.getInstr(); | 
|---|
| 885 | unsigned DstOpcode = DstMI->getOpcode(); | 
|---|
| 886 |  | 
|---|
| 887 | // Word loads into B operand of a load/store with cannot bypass their | 
|---|
| 888 | // scheduling stage. Cannot do in the .td file because need to decide | 
|---|
| 889 | // between -1 and -2 for ReadAdvance | 
|---|
| 890 |  | 
|---|
| 891 | if (isNSWload && II->hasBRegAddrShift(Op: DstOpcode) && | 
|---|
| 892 | DstMI->getOperand(i: 3).getImm() != 0 && // shift operand | 
|---|
| 893 | DstMI->getOperand(i: 2).getReg() == Dep.getReg()) | 
|---|
| 894 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: Dep.getLatency() + 1); | 
|---|
| 895 |  | 
|---|
| 896 | if (isNSWload && isMVEVectorInstruction(MI: DstMI)) { | 
|---|
| 897 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: Dep.getLatency() + 1); | 
|---|
| 898 | } | 
|---|
| 899 |  | 
|---|
| 900 | if (II->isMVEIntMAC(Op: DstOpcode) && | 
|---|
| 901 | II->isMVEIntMACMatched(SrcOp: SrcOpcode, DstOp: DstOpcode) && | 
|---|
| 902 | DstMI->getOperand(i: 0).isReg() && | 
|---|
| 903 | DstMI->getOperand(i: 0).getReg() == Dep.getReg()) | 
|---|
| 904 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: Dep.getLatency() - 1); | 
|---|
| 905 |  | 
|---|
| 906 | // CC setter into conditional producer shouldn't have a latency of more | 
|---|
| 907 | // than 0 unless it's due to an implicit read. | 
|---|
| 908 | if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR && | 
|---|
| 909 | TII->isPredicated(MI: *DstMI) && !hasImplicitCPSRUse(MI: DstMI)) | 
|---|
| 910 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: 0); | 
|---|
| 911 |  | 
|---|
| 912 | if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, RegID: Dep.getReg(), | 
|---|
| 913 | SCD: DAG->getSchedClass(SU: &ISU))) | 
|---|
| 914 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: std::max(a: 0, b: signed(Dep.getLatency()) + ALat)); | 
|---|
| 915 |  | 
|---|
| 916 | if (II->isRev(Op: SrcOpcode)) { | 
|---|
| 917 | if (II->isInlineShiftALU(Op: DstOpcode)) | 
|---|
| 918 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: 1); | 
|---|
| 919 | else if (II->isShift(Op: DstOpcode)) | 
|---|
| 920 | setBidirLatencies(SrcSU&: ISU, SrcDep&: Dep, latency: 1); | 
|---|
| 921 | } | 
|---|
| 922 | } | 
|---|
| 923 | } | 
|---|
| 924 |  | 
|---|
| 925 | // Add M55 specific overrides for latencies between instructions. Currently it: | 
|---|
| 926 | //  - Adds an extra cycle latency between MVE VMLAV and scalar instructions. | 
|---|
| 927 | class CortexM55Overrides : public ARMOverrideBypasses { | 
|---|
| 928 | public: | 
|---|
| 929 | CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA) | 
|---|
| 930 | : ARMOverrideBypasses(TII, AA) {} | 
|---|
| 931 |  | 
|---|
| 932 | void modifyBypasses(SUnit &SU) override { | 
|---|
| 933 | MachineInstr *SrcMI = SU.getInstr(); | 
|---|
| 934 | if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction)) | 
|---|
| 935 | return; | 
|---|
| 936 |  | 
|---|
| 937 | for (SDep &Dep : SU.Succs) { | 
|---|
| 938 | if (Dep.getKind() != SDep::Data) | 
|---|
| 939 | continue; | 
|---|
| 940 | SUnit &DepSU = *Dep.getSUnit(); | 
|---|
| 941 | if (DepSU.isBoundaryNode()) | 
|---|
| 942 | continue; | 
|---|
| 943 | MachineInstr *DstMI = DepSU.getInstr(); | 
|---|
| 944 |  | 
|---|
| 945 | if (!isMVEVectorInstruction(MI: DstMI) && !DstMI->mayStore()) | 
|---|
| 946 | setBidirLatencies(SrcSU&: SU, SrcDep&: Dep, latency: 3); | 
|---|
| 947 | } | 
|---|
| 948 | } | 
|---|
| 949 | }; | 
|---|
| 950 |  | 
|---|
| 951 | } // end anonymous namespace | 
|---|
| 952 |  | 
|---|
| 953 | void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) { | 
|---|
| 954 | DAG = DAGInstrs; | 
|---|
| 955 | for (SUnit &ISU : DAGInstrs->SUnits) { | 
|---|
| 956 | if (ISU.isBoundaryNode()) | 
|---|
| 957 | continue; | 
|---|
| 958 | modifyBypasses(ISU); | 
|---|
| 959 | } | 
|---|
| 960 | if (DAGInstrs->ExitSU.getInstr()) | 
|---|
| 961 | modifyBypasses(DAGInstrs->ExitSU); | 
|---|
| 962 | } | 
|---|
| 963 |  | 
|---|
| 964 | std::unique_ptr<ScheduleDAGMutation> | 
|---|
| 965 | createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) { | 
|---|
| 966 | if (ST.isCortexM85()) | 
|---|
| 967 | return std::make_unique<M85Overrides>(args: ST.getInstrInfo(), args&: AA); | 
|---|
| 968 | else if (ST.isCortexM7()) | 
|---|
| 969 | return std::make_unique<CortexM7Overrides>(args: ST.getInstrInfo(), args&: AA); | 
|---|
| 970 | else if (ST.isCortexM55()) | 
|---|
| 971 | return std::make_unique<CortexM55Overrides>(args: ST.getInstrInfo(), args&: AA); | 
|---|
| 972 |  | 
|---|
| 973 | return nullptr; | 
|---|
| 974 | } | 
|---|
| 975 |  | 
|---|
| 976 | } // end namespace llvm | 
|---|
| 977 |  | 
|---|