//===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains the ARM definition DAG scheduling mutations which
/// change inter-instruction latencies.
//
//===----------------------------------------------------------------------===//

#include "ARMLatencyMutations.h"
#include "ARMSubtarget.h"
#include "Thumb2InstrInfo.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include <algorithm>
#include <array>
#include <initializer_list>
#include <memory>

namespace llvm {

namespace {

// Precompute information about opcodes to speed up pass

class InstructionInformation {
protected:
  struct IInfo {
    bool HasBRegAddr : 1;      // B-side of addr gen is a register
    bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
    bool IsDivide : 1;         // Some form of integer divide
    bool IsInlineShiftALU : 1; // Inline shift+ALU
    bool IsMultiply : 1;       // Some form of integer multiply
    bool IsMVEIntMAC : 1;      // MVE 8/16/32-bit integer MAC operation
    bool IsNonSubwordLoad : 1; // Load which is a word or larger
    bool IsShift : 1;          // Shift operation
    bool IsRev : 1;            // REV operation
    bool ProducesQP : 1;       // Produces a vector register result
    bool ProducesDP : 1;       // Produces a double-precision register result
    bool ProducesSP : 1;       // Produces a single-precision register result
    bool ConsumesQP : 1;       // Consumes a vector register result
    bool ConsumesDP : 1;       // Consumes a double-precision register result
    bool ConsumesSP : 1;       // Consumes a single-precision register result
    unsigned MVEIntMACMatched; // Matched operand type (for MVE)
    unsigned AddressOpMask;    // Mask indicating which operands go into AGU
    IInfo()
        : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
          IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
          IsNonSubwordLoad(false), IsShift(false), IsRev(false),
          ProducesQP(false), ProducesDP(false), ProducesSP(false),
          ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
          MVEIntMACMatched(0), AddressOpMask(0) {}
  };
  typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
  IInfoArray Info;

public:
  // Always available information
  unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
  bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
  bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
  bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
  bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
  bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
  bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
  bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
  bool isRev(unsigned Op) { return Info[Op].IsRev; }
  bool isShift(unsigned Op) { return Info[Op].IsShift; }

  // Information available only if markDPProducersConsumers has been called.
  bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
  bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
  bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
  bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
  bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
  bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }

  bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
    return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
  }

  InstructionInformation(const ARMBaseInstrInfo *TII);

protected:
  void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
};

InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
  using namespace ARM;

  std::initializer_list<unsigned> hasBRegAddrList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
      tLDRr, tLDRBr, tLDRHr, tSTRr, tSTRBr, tSTRHr,
  };
  for (auto op : hasBRegAddrList) {
    Info[op].HasBRegAddr = true;
  }

  std::initializer_list<unsigned> hasBRegAddrShiftList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
  };
  for (auto op : hasBRegAddrShiftList) {
    Info[op].HasBRegAddrShift = true;
  }

  Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;

  std::initializer_list<unsigned> isInlineShiftALUList = {
      t2ADCrs, t2ADDSrs, t2ADDrs, t2BICrs, t2EORrs,
      t2ORNrs, t2RSBSrs, t2RSBrs, t2SBCrs, t2SUBrs,
      t2SUBSrs, t2CMPrs, t2CMNzrs, t2TEQrs, t2TSTrs,
  };
  for (auto op : isInlineShiftALUList) {
    Info[op].IsInlineShiftALU = true;
  }

  std::initializer_list<unsigned> isMultiplyList = {
      t2MUL, t2MLA, t2MLS, t2SMLABB, t2SMLABT, t2SMLAD, t2SMLADX,
      t2SMLAL, t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
      t2SMLATB, t2SMLATT, t2SMLAWT, t2SMLSD, t2SMLSDX, t2SMLSLD, t2SMLSLDX,
      t2SMMLA, t2SMMLAR, t2SMMLS, t2SMMLSR, t2SMMUL, t2SMMULR, t2SMUAD,
      t2SMUADX, t2SMULBB, t2SMULBT, t2SMULL, t2SMULTB, t2SMULTT, t2SMULWT,
      t2SMUSD, t2SMUSDX, t2UMAAL, t2UMLAL, t2UMULL, tMUL,
  };
  for (auto op : isMultiplyList) {
    Info[op].IsMultiply = true;
  }

  std::initializer_list<unsigned> isMVEIntMACList = {
      MVE_VMLAS_qr_i16, MVE_VMLAS_qr_i32, MVE_VMLAS_qr_i8,
      MVE_VMLA_qr_i16, MVE_VMLA_qr_i32, MVE_VMLA_qr_i8,
      MVE_VQDMLAH_qrs16, MVE_VQDMLAH_qrs32, MVE_VQDMLAH_qrs8,
      MVE_VQDMLASH_qrs16, MVE_VQDMLASH_qrs32, MVE_VQDMLASH_qrs8,
      MVE_VQRDMLAH_qrs16, MVE_VQRDMLAH_qrs32, MVE_VQRDMLAH_qrs8,
      MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
      MVE_VQDMLADHXs16, MVE_VQDMLADHXs32, MVE_VQDMLADHXs8,
      MVE_VQDMLADHs16, MVE_VQDMLADHs32, MVE_VQDMLADHs8,
      MVE_VQDMLSDHXs16, MVE_VQDMLSDHXs32, MVE_VQDMLSDHXs8,
      MVE_VQDMLSDHs16, MVE_VQDMLSDHs32, MVE_VQDMLSDHs8,
      MVE_VQRDMLADHXs16, MVE_VQRDMLADHXs32, MVE_VQRDMLADHXs8,
      MVE_VQRDMLADHs16, MVE_VQRDMLADHs32, MVE_VQRDMLADHs8,
      MVE_VQRDMLSDHXs16, MVE_VQRDMLSDHXs32, MVE_VQRDMLSDHXs8,
      MVE_VQRDMLSDHs16, MVE_VQRDMLSDHs32, MVE_VQRDMLSDHs8,
  };
  for (auto op : isMVEIntMACList) {
    Info[op].IsMVEIntMAC = true;
  }

  std::initializer_list<unsigned> isNonSubwordLoadList = {
      t2LDRi12, t2LDRi8, t2LDR_POST, t2LDR_PRE, t2LDRpci,
      t2LDRs, t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
      tLDRpci, tLDRr, tLDRspi,
  };
  for (auto op : isNonSubwordLoadList) {
    Info[op].IsNonSubwordLoad = true;
  }

  std::initializer_list<unsigned> isRevList = {
      t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
  };
  for (auto op : isRevList) {
    Info[op].IsRev = true;
  }

  std::initializer_list<unsigned> isShiftList = {
      t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
      tASRri, tASRrr, tLSLSri, tLSLri, tLSLrr, tLSRri, tLSRrr, tROR,
  };
  for (auto op : isShiftList) {
    Info[op].IsShift = true;
  }

  std::initializer_list<unsigned> Address1List = {
      t2LDRBi12,
      t2LDRBi8,
      t2LDRBpci,
      t2LDRBs,
      t2LDRHi12,
      t2LDRHi8,
      t2LDRHpci,
      t2LDRHs,
      t2LDRSBi12,
      t2LDRSBi8,
      t2LDRSBpci,
      t2LDRSBs,
      t2LDRSHi12,
      t2LDRSHi8,
      t2LDRSHpci,
      t2LDRSHs,
      t2LDRi12,
      t2LDRi8,
      t2LDRpci,
      t2LDRs,
      tLDRBi,
      tLDRBr,
      tLDRHi,
      tLDRHr,
      tLDRSB,
      tLDRSH,
      tLDRi,
      tLDRpci,
      tLDRr,
      tLDRspi,
      t2STRBi12,
      t2STRBi8,
      t2STRBs,
      t2STRHi12,
      t2STRHi8,
      t2STRHs,
      t2STRi12,
      t2STRi8,
      t2STRs,
      tSTRBi,
      tSTRBr,
      tSTRHi,
      tSTRHr,
      tSTRi,
      tSTRr,
      tSTRspi,
      VLDRD,
      VLDRH,
      VLDRS,
      VSTRD,
      VSTRH,
      VSTRS,
      MVE_VLD20_16,
      MVE_VLD20_32,
      MVE_VLD20_8,
      MVE_VLD21_16,
      MVE_VLD21_32,
      MVE_VLD21_8,
      MVE_VLD40_16,
      MVE_VLD40_32,
      MVE_VLD40_8,
      MVE_VLD41_16,
      MVE_VLD41_32,
      MVE_VLD41_8,
      MVE_VLD42_16,
      MVE_VLD42_32,
      MVE_VLD42_8,
      MVE_VLD43_16,
      MVE_VLD43_32,
      MVE_VLD43_8,
      MVE_VLDRBS16,
      MVE_VLDRBS16_rq,
      MVE_VLDRBS32,
      MVE_VLDRBS32_rq,
      MVE_VLDRBU16,
      MVE_VLDRBU16_rq,
      MVE_VLDRBU32,
      MVE_VLDRBU32_rq,
      MVE_VLDRBU8,
      MVE_VLDRBU8_rq,
      MVE_VLDRDU64_qi,
      MVE_VLDRDU64_rq,
      MVE_VLDRDU64_rq_u,
      MVE_VLDRHS32,
      MVE_VLDRHS32_rq,
      MVE_VLDRHS32_rq_u,
      MVE_VLDRHU16,
      MVE_VLDRHU16_rq,
      MVE_VLDRHU16_rq_u,
      MVE_VLDRHU32,
      MVE_VLDRHU32_rq,
      MVE_VLDRHU32_rq_u,
      MVE_VLDRWU32,
      MVE_VLDRWU32_qi,
      MVE_VLDRWU32_rq,
      MVE_VLDRWU32_rq_u,
      MVE_VST20_16,
      MVE_VST20_32,
      MVE_VST20_8,
      MVE_VST21_16,
      MVE_VST21_32,
      MVE_VST21_8,
      MVE_VST40_16,
      MVE_VST40_32,
      MVE_VST40_8,
      MVE_VST41_16,
      MVE_VST41_32,
      MVE_VST41_8,
      MVE_VST42_16,
      MVE_VST42_32,
      MVE_VST42_8,
      MVE_VST43_16,
      MVE_VST43_32,
      MVE_VST43_8,
      MVE_VSTRB16,
      MVE_VSTRB16_rq,
      MVE_VSTRB32,
      MVE_VSTRB32_rq,
      MVE_VSTRBU8,
      MVE_VSTRB8_rq,
      MVE_VSTRD64_qi,
      MVE_VSTRD64_rq,
      MVE_VSTRD64_rq_u,
      MVE_VSTRH32,
      MVE_VSTRH32_rq,
      MVE_VSTRH32_rq_u,
      MVE_VSTRHU16,
      MVE_VSTRH16_rq,
      MVE_VSTRH16_rq_u,
      MVE_VSTRWU32,
      MVE_VSTRW32_qi,
      MVE_VSTRW32_rq,
      MVE_VSTRW32_rq_u,
  };
  std::initializer_list<unsigned> Address2List = {
      t2LDRB_POST,
      t2LDRB_PRE,
      t2LDRDi8,
      t2LDRH_POST,
      t2LDRH_PRE,
      t2LDRSB_POST,
      t2LDRSB_PRE,
      t2LDRSH_POST,
      t2LDRSH_PRE,
      t2LDR_POST,
      t2LDR_PRE,
      t2STRB_POST,
      t2STRB_PRE,
      t2STRDi8,
      t2STRH_POST,
      t2STRH_PRE,
      t2STR_POST,
      t2STR_PRE,
      MVE_VLD20_16_wb,
      MVE_VLD20_32_wb,
      MVE_VLD20_8_wb,
      MVE_VLD21_16_wb,
      MVE_VLD21_32_wb,
      MVE_VLD21_8_wb,
      MVE_VLD40_16_wb,
      MVE_VLD40_32_wb,
      MVE_VLD40_8_wb,
      MVE_VLD41_16_wb,
      MVE_VLD41_32_wb,
      MVE_VLD41_8_wb,
      MVE_VLD42_16_wb,
      MVE_VLD42_32_wb,
      MVE_VLD42_8_wb,
      MVE_VLD43_16_wb,
      MVE_VLD43_32_wb,
      MVE_VLD43_8_wb,
      MVE_VLDRBS16_post,
      MVE_VLDRBS16_pre,
      MVE_VLDRBS32_post,
      MVE_VLDRBS32_pre,
      MVE_VLDRBU16_post,
      MVE_VLDRBU16_pre,
      MVE_VLDRBU32_post,
      MVE_VLDRBU32_pre,
      MVE_VLDRBU8_post,
      MVE_VLDRBU8_pre,
      MVE_VLDRDU64_qi_pre,
      MVE_VLDRHS32_post,
      MVE_VLDRHS32_pre,
      MVE_VLDRHU16_post,
      MVE_VLDRHU16_pre,
      MVE_VLDRHU32_post,
      MVE_VLDRHU32_pre,
      MVE_VLDRWU32_post,
      MVE_VLDRWU32_pre,
      MVE_VLDRWU32_qi_pre,
      MVE_VST20_16_wb,
      MVE_VST20_32_wb,
      MVE_VST20_8_wb,
      MVE_VST21_16_wb,
      MVE_VST21_32_wb,
      MVE_VST21_8_wb,
      MVE_VST40_16_wb,
      MVE_VST40_32_wb,
      MVE_VST40_8_wb,
      MVE_VST41_16_wb,
      MVE_VST41_32_wb,
      MVE_VST41_8_wb,
      MVE_VST42_16_wb,
      MVE_VST42_32_wb,
      MVE_VST42_8_wb,
      MVE_VST43_16_wb,
      MVE_VST43_32_wb,
      MVE_VST43_8_wb,
      MVE_VSTRB16_post,
      MVE_VSTRB16_pre,
      MVE_VSTRB32_post,
      MVE_VSTRB32_pre,
      MVE_VSTRBU8_post,
      MVE_VSTRBU8_pre,
      MVE_VSTRD64_qi_pre,
      MVE_VSTRH32_post,
      MVE_VSTRH32_pre,
      MVE_VSTRHU16_post,
      MVE_VSTRHU16_pre,
      MVE_VSTRWU32_post,
      MVE_VSTRWU32_pre,
      MVE_VSTRW32_qi_pre,
  };
  std::initializer_list<unsigned> Address3List = {
      t2LDRD_POST,
      t2LDRD_PRE,
      t2STRD_POST,
      t2STRD_PRE,
  };
  // Compute a mask of which operands are involved in address computation
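  // Bit N of AddressOpMask marks machine operand N as feeding address
  // generation: 0x6 covers operands 1-2, 0xc operands 2-3 and 0x18 operands
  // 3-4, with bit 3 also set for the shifted B-register forms.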
  for (auto &op : Address1List) {
    Info[op].AddressOpMask = 0x6;
  }
  for (auto &op : Address2List) {
    Info[op].AddressOpMask = 0xc;
  }
  for (auto &op : Address3List) {
    Info[op].AddressOpMask = 0x18;
  }
  for (auto &op : hasBRegAddrShiftList) {
    Info[op].AddressOpMask |= 0x8;
  }
}

void InstructionInformation::markDPProducersConsumers(
    const ARMBaseInstrInfo *TII) {
  // Learn about all instructions which have FP source/dest registers
  for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
    const MCInstrDesc &MID = TII->get(MI);
    auto Operands = MID.operands();
    for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
      bool MarkQP = false, MarkDP = false, MarkSP = false;
      switch (Operands[OI].RegClass) {
      case ARM::MQPRRegClassID:
      case ARM::DPRRegClassID:
      case ARM::DPR_8RegClassID:
      case ARM::DPR_VFP2RegClassID:
      case ARM::DPairRegClassID:
      case ARM::DPairSpcRegClassID:
      case ARM::DQuadRegClassID:
      case ARM::DQuadSpcRegClassID:
      case ARM::DTripleRegClassID:
      case ARM::DTripleSpcRegClassID:
        MarkDP = true;
        break;
      case ARM::QPRRegClassID:
      case ARM::QPR_8RegClassID:
      case ARM::QPR_VFP2RegClassID:
      case ARM::QQPRRegClassID:
      case ARM::QQQQPRRegClassID:
        MarkQP = true;
        break;
      case ARM::SPRRegClassID:
      case ARM::SPR_8RegClassID:
      case ARM::FPWithVPRRegClassID:
        MarkSP = true;
        break;
      default:
        break;
      }
      if (MarkQP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesQP = true;
        else
          Info[MI].ConsumesQP = true;
      }
      if (MarkDP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesDP = true;
        else
          Info[MI].ConsumesDP = true;
      }
      if (MarkSP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesSP = true;
        else
          Info[MI].ConsumesSP = true;
      }
    }
  }
}

} // anonymous namespace

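// Returns true if MI implicitly reads CPSR. "True" flag consumers read CPSR
// through an implicit use; predicated instructions read it through an explicit
// predicate operand instead (see the comments in modifyBypasses below).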
static bool hasImplicitCPSRUse(const MachineInstr *MI) {
  return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
}

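// Update the latency of a dependence edge in both directions: the DAG keeps a
// copy of each edge on the producer's Succs list and on the consumer's Preds
// list, so both copies are changed and the cached depth/height invalidated.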
void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
                                            unsigned latency) {
  SDep Reverse = SrcDep;
  Reverse.setSUnit(&SrcSU);
  for (SDep &PDep : SrcDep.getSUnit()->Preds) {
    if (PDep == Reverse) {
      PDep.setLatency(latency);
      SrcDep.getSUnit()->setDepthDirty();
      break;
    }
  }
  SrcDep.setLatency(latency);
  SrcSU.setHeightDirty();
}

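// Two predicates "match" if they are the same condition or inverses of each
// other. The ARMCC encodings place a condition and its inverse in adjacent
// values differing only in bit 0, so masking with 0xe compares the pair
// rather than the exact condition.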
static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
  return (a & 0xe) != (b & 0xe);
}

// Set output dependences to zero latency for processors which can
// simultaneously issue to the same register. Returns true if a change
// was made.
bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
  if (Dep.getKind() == SDep::Output) {
    setBidirLatencies(ISU, Dep, 0);
    return true;
  }
  return false;
}

// The graph doesn't look inside of bundles to determine their
// scheduling boundaries and reports zero latency into and out of them
// (except for CPSR into the bundle, which has latency 1).
// Make some better scheduling assumptions:
// 1) CPSR uses have zero latency; other uses have incoming latency 1
// 2) CPSR defs retain a latency of zero; others have a latency of 1.
//
// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise
unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {

  SUnit &DepSU = *Dep.getSUnit();
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  const MachineInstr *DstMI = DepSU.getInstr();
  unsigned DstOpcode = DstMI->getOpcode();

  if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
    setBidirLatencies(
        ISU, Dep,
        (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
    return 1;
  }
  if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
      Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
    setBidirLatencies(ISU, Dep, 1);
    return 2;
  }
  return 0;
}

// Determine whether there is a memory RAW hazard here and set up latency
// accordingly
bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
                                          unsigned latency) {
  if (!Dep.isNormalMemory())
    return false;
  auto &SrcInst = *ISU.getInstr();
  auto &DstInst = *Dep.getSUnit()->getInstr();
  if (!SrcInst.mayStore() || !DstInst.mayLoad())
    return false;

  auto SrcMO = *SrcInst.memoperands().begin();
  auto DstMO = *DstInst.memoperands().begin();
  auto SrcVal = SrcMO->getValue();
  auto DstVal = DstMO->getValue();
  auto SrcPseudoVal = SrcMO->getPseudoValue();
  auto DstPseudoVal = DstMO->getPseudoValue();
  if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
      SrcMO->getOffset() == DstMO->getOffset()) {
    setBidirLatencies(ISU, Dep, latency);
    return true;
  } else if (SrcPseudoVal && DstPseudoVal &&
             SrcPseudoVal->kind() == DstPseudoVal->kind() &&
             SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
    // Spills/fills
    auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
    auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
    if (FS0 == FS1) {
      setBidirLatencies(ISU, Dep, latency);
      return true;
    }
  }
  return false;
}

namespace {

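// Shared opcode-information table, built lazily by the first subtarget
// override object that is constructed and reused for the rest of the run.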
std::unique_ptr<InstructionInformation> II;

class CortexM7InstructionInformation : public InstructionInformation {
public:
  CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
      : InstructionInformation(TII) {}
};

class CortexM7Overrides : public ARMOverrideBypasses {
public:
  CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
      : ARMOverrideBypasses(TII, AA) {
    if (!II)
      II.reset(new CortexM7InstructionInformation(TII));
  }

  void modifyBypasses(SUnit &) override;
};

void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as the M7 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 4))
      continue;

    // Ignore dependencies other than data
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads feeding any multiply or divide instruction are treated as
    // unable to bypass their scheduling stage. This isn't done in the .td
    // file because we cannot easily create a ReadAdvance that is 0 from
    // certain writer classes and 1 from all the rest.
    // (The other way around would have been easy.)
    if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // Word loads feeding the B operand of a load/store are treated as unable
    // to bypass their scheduling stage. This cannot be done in the .td file
    // because we would need to decide between -1 and -2 for the ReadAdvance.
    if (isNSWload && II->hasBRegAddr(DstOpcode) &&
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // Multiplies feeding any address generation cannot bypass from EX3. This
    // cannot be done in the .td file because we would need to decide between
    // -1 and -2 for the ReadAdvance.
    if (II->isMultiply(SrcOpcode)) {
      unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;
      for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {
        if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&
            DstMI->getOperand(i).getReg() == Dep.getReg()) {
          setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
          break;
        }
      }
    }

    // Mismatched conditional producers take longer on M7; they end up looking
    // like they were produced at EX3 and read at IS.
    if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&
        (SrcOpcode == ARM::BUNDLE ||
         mismatchedPred(TII->getPredicate(*SrcMI),
                        TII->getPredicate(*DstMI)))) {
      unsigned Lat = 1;
      // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
      if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&
          DstMI->getOperand(1).getReg() == Dep.getReg())
        Lat = 2;
      Lat = std::min(3u, Dep.getLatency() + Lat);
      setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));
    }

    // A CC setter feeding a conditional producer shouldn't have a latency of
    // more than 1 unless it's due to an implicit read. (All the "true"
    // readers of the condition code use an implicit read, and predicates use
    // an explicit one.)
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 1);

    // REV instructions cannot bypass directly into the EX1 shifter. The
    // code is slightly inexact as it doesn't attempt to ensure that the
    // bypass is to the shifter operands.
    if (II->isRev(SrcOpcode)) {
      if (II->isInlineShiftALU(DstOpcode))
        setBidirLatencies(ISU, Dep, 2);
      else if (II->isShift(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}

class M85InstructionInformation : public InstructionInformation {
public:
  M85InstructionInformation(const ARMBaseInstrInfo *t)
      : InstructionInformation(t) {
    markDPProducersConsumers(t);
  }
};

class M85Overrides : public ARMOverrideBypasses {
public:
  M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
      : ARMOverrideBypasses(t, a) {
    if (!II)
      II.reset(new M85InstructionInformation(t));
  }

  void modifyBypasses(SUnit &) override;

private:
  unsigned computeBypassStage(const MCSchedClassDesc *SCD);
  signed modifyMixedWidthFP(const MachineInstr *SrcMI,
                            const MachineInstr *DstMI, unsigned RegID,
                            const MCSchedClassDesc *SCD);
};

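// Map a scheduling class's write latency onto the pipeline stage that
// presumably produces the result: latencies of 3 or less map to themselves,
// a latency of 4 maps to stage 2, and anything longer maps to stage 3.
// Defaults to 2 when no write latency entry is available.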
unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
  auto SM = DAG->getSchedModel();
  unsigned DefIdx = 0; // just look for the first output's timing
  if (DefIdx < SCDesc->NumWriteLatencyEntries) {
    // Lookup the definition's write latency in SubtargetInfo.
    const MCWriteLatencyEntry *WLEntry =
        SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
    unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
    if (Latency == 4)
      return 2;
    else if (Latency == 5)
      return 3;
    else if (Latency > 3)
      return 3;
    else
      return Latency;
  }
  return 2;
}

// Latency changes for bypassing between FP registers of different sizes:
//
// Note that mixed DP/SP are unlikely because of the semantics
// of C. Mixed MVE/SP are quite common when MVE intrinsics are used.
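//
// The returned value is a signed adjustment that is added to the edge latency
// at the call site (and clamped at zero there); positive values lengthen the
// bypass and negative values shorten it, depending on which subregister of
// the wider register the narrower value occupies.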
signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
                                        const MachineInstr *DstMI,
                                        unsigned RegID,
                                        const MCSchedClassDesc *SCD) {

  if (!II->producesSP(SrcMI->getOpcode()) &&
      !II->producesDP(SrcMI->getOpcode()) &&
      !II->producesQP(SrcMI->getOpcode()))
    return 0;

  if (Register::isVirtualRegister(RegID)) {
    if (II->producesSP(SrcMI->getOpcode()) &&
        II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 5 - computeBypassStage(SCD);
    } else if (II->producesSP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
          return 5 - computeBypassStage(SCD) -
                 ((OP.getSubReg() == ARM::ssub_2 ||
                   OP.getSubReg() == ARM::ssub_3)
                      ? 1
                      : 0);
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return -1;
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 5 - computeBypassStage(SCD);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
          return 5 - computeBypassStage(SCD) +
                 ((OP.getSubReg() == ARM::ssub_2 ||
                   OP.getSubReg() == ARM::ssub_3)
                      ? 1
                      : 0);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 1;
    }
  } else if (Register::isPhysicalRegister(RegID)) {
    // Note that when the producer is narrower, not all of the producers
    // may be present in the scheduling graph; somewhere earlier in the
    // compiler, an implicit def/use of the aliased full register gets
    // added to the producer, and so only that producer is seen as *the*
    // single producer. This behavior also has the unfortunate effect of
    // serializing the producers in the compiler's view of things.
    if (II->producesSP(SrcMI->getOpcode()) &&
        II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
          return 5 - computeBypassStage(SCD);
    } else if (II->producesSP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
          return 5 - computeBypassStage(SCD) -
                 (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
            OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
          return -1;
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
        return 5 - computeBypassStage(SCD);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
        return 5 - computeBypassStage(SCD) +
               (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesDP(DstMI->getOpcode())) {
      if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
        return 1;
    }
  }
  return 0;
}

void M85Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as the Cortex-M85 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 3))
      continue;

    // Ignore dependencies other than data.
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads feeding the shifted B operand of a load/store cannot bypass
    // their scheduling stage. This cannot be done in the .td file because we
    // would need to decide between -1 and -2 for the ReadAdvance.

    if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
        DstMI->getOperand(3).getImm() != 0 && // shift operand
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    if (isNSWload && isMVEVectorInstruction(DstMI)) {
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
    }

    if (II->isMVEIntMAC(DstOpcode) &&
        II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
        DstMI->getOperand(0).isReg() &&
        DstMI->getOperand(0).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() - 1);

    // A CC setter feeding a conditional producer shouldn't have a latency of
    // more than 0 unless it's due to an implicit read.
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 0);

    if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
                                         DAG->getSchedClass(&ISU)))
      setBidirLatencies(ISU, Dep, std::max(0, signed(Dep.getLatency()) + ALat));

    if (II->isRev(SrcOpcode)) {
      if (II->isInlineShiftALU(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
      else if (II->isShift(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}

// Add M55 specific overrides for latencies between instructions. Currently it:
// - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
class CortexM55Overrides : public ARMOverrideBypasses {
public:
  CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
      : ARMOverrideBypasses(TII, AA) {}

  void modifyBypasses(SUnit &SU) override {
    MachineInstr *SrcMI = SU.getInstr();
    if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
      return;

    for (SDep &Dep : SU.Succs) {
      if (Dep.getKind() != SDep::Data)
        continue;
      SUnit &DepSU = *Dep.getSUnit();
      if (DepSU.isBoundaryNode())
        continue;
      MachineInstr *DstMI = DepSU.getInstr();

      if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
        setBidirLatencies(SU, Dep, 3);
    }
  }
};

} // end anonymous namespace

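// Entry point for the mutation: walk every node in the scheduling DAG
// (including the ExitSU, if it wraps a real instruction) and let the
// subtarget-specific override adjust the latencies of its successor edges.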
void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
  DAG = DAGInstrs;
  for (SUnit &ISU : DAGInstrs->SUnits) {
    if (ISU.isBoundaryNode())
      continue;
    modifyBypasses(ISU);
  }
  if (DAGInstrs->ExitSU.getInstr())
    modifyBypasses(DAGInstrs->ExitSU);
}

std::unique_ptr<ScheduleDAGMutation>
createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
  if (ST.isCortexM85())
    return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
  else if (ST.isCortexM7())
    return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
  else if (ST.isCortexM55())
    return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);

  return nullptr;
}

} // end namespace llvm