| 1 | //===- AMDGPURegBankLegalizeRules --------------------------------*- C++ -*-==// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKLEGALIZERULES_H |
| 10 | #define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKLEGALIZERULES_H |
| 11 | |
| 12 | #include "llvm/ADT/DenseMap.h" |
| 13 | #include "llvm/ADT/SmallVector.h" |
| 14 | #include <functional> |
| 15 | |
| 16 | namespace llvm { |
| 17 | |
// Forward declarations: this header only names these types in declarations,
// so their full definitions are not needed here.
class GCNSubtarget;
class LLT;
class MachineFunction;
class MachineInstr;
class MachineRegisterInfo;

// Generic SSA-context / uniformity-info templates and the aliases that
// instantiate them for MachineFunction.
template <typename ContextT> class GenericSSAContext;
template <typename ContextT> class GenericUniformityInfo;
using MachineSSAContext = GenericSSAContext<MachineFunction>;
using MachineUniformityInfo = GenericUniformityInfo<MachineSSAContext>;
| 27 | |
| 28 | namespace AMDGPU { |
| 29 | |
| 30 | /// \returns true if \p Ty is a pointer type with size \p Width. |
| 31 | bool isAnyPtr(LLT Ty, unsigned Width); |
| 32 | |
// Operand predicate IDs from which a RegBankLegalizeRule predicate is built.
// A predicate holds one or more IDs; each ID stands for a check on a register
// operand: either 'uniform or divergent' + LLT (Uni*/Div* IDs) or LLT alone
// (unprefixed IDs).
// Checking a single operand is usually enough to pick the RegBankLLTMapping
// to apply (see Fast Rules); these IDs are useful when two or more operands
// have to be checked.
enum UniformityLLTOpPredicateID {
  // Stands for operands that are not registers, and for physical registers.
  _,

  // Scalars: LLT only.
  S1,
  S16,
  S32,
  S64,
  S128,

  // Scalars: uniform + LLT.
  UniS1,
  UniS16,
  UniS32,
  UniS64,
  UniS128,

  // Scalars: divergent + LLT.
  DivS1,
  DivS16,
  DivS32,
  DivS64,
  DivS128,

  // Divergent-check only predicate, accepts any LLT.
  DivAnyTy,

  // Pointers: LLT only.
  P0,
  P1,
  P2,
  P3,
  P4,
  P5,
  P8,
  Ptr32,
  Ptr64,
  Ptr128,

  // Pointers: uniform + LLT.
  UniP0,
  UniP1,
  UniP2,
  UniP3,
  UniP4,
  UniP5,
  UniP6,
  UniP8,
  UniPtr32,
  UniPtr64,
  UniPtr128,

  // Pointers: divergent + LLT.
  DivP0,
  DivP1,
  DivP2,
  DivP3,
  DivP4,
  DivP5,
  DivPtr32,
  DivPtr64,
  DivPtr128,

  // Vectors: LLT only.
  V2S16,
  V2S32,
  V2S64,
  V3S32,
  V4S32,
  V32S32,

  // Vectors: uniform + LLT.
  UniV2S16,
  UniV2S32,
  UniV4S32,
  UniV2S64,
  UniV3S32,
  UniV6S32,
  UniV8S16,
  UniV8S32,
  UniV16S16,
  UniV16S32,
  UniV32S16,
  UniV32S32,

  // Vectors: divergent + LLT.
  DivV2S16,
  DivV2S32,
  DivV4S32,
  DivV2S64,
  DivV3S32,
  DivV4S16,
  DivV8S16,
  DivV8S32,
  DivV16S16,
  DivV16S32,
  DivV6S32,
  DivV32S16,
  DivV32S32,

  // B types: LLT only.
  // NOTE(review): presumably "any LLT of that bit width"; confirm against the
  // matching code in the .cpp.
  B32,
  B64,
  B96,
  B128,
  B160,
  B256,
  B512,
  BRC,

  // B types: uniform.
  UniB32,
  UniB64,
  UniB96,
  UniB128,
  UniB160,
  UniB256,
  UniB512,
  UniBRC,

  // B types: divergent.
  DivB32,
  DivB64,
  DivB96,
  DivB128,
  DivB160,
  DivB256,
  DivB512,
  DivBRC
};
| 161 | |
// Describes how a register bank is applied to a register operand.
// Most IDs simply act as an assert on the operand's LLT and register bank.
// Some may rewrite the operand and insert copies, extends, truncs and
// read-any-lanes. Anything beyond that needs a LoweringMethod.
enum RegBankLLTMappingApplyID {
  InvalidMapping,
  None,
  IntrId,
  Imm,
  Vcc,

  // Bank-only apply IDs, accept any LLT.
  VgprAnyTy,
  AgprAnyTy,
  VgprOrAgprAnyTy,

  // Sgpr: scalars, pointers, vectors and B-types.
  Sgpr16,
  Sgpr32,
  Sgpr64,
  Sgpr128,
  SgprP0,
  SgprP1,
  SgprP2,
  SgprP3,
  SgprP4,
  SgprP5,
  SgprP6,
  SgprP8,
  SgprPtr32,
  SgprPtr64,
  SgprPtr128,
  SgprV2S16,
  SgprV4S32,
  SgprV2S32,
  SgprB32,
  SgprB64,
  SgprB96,
  SgprB128,
  SgprB256,
  SgprB512,
  SgprBRC,

  // Vgpr: scalars, pointers, vectors and B-types.
  Vgpr16,
  Vgpr32,
  Vgpr64,
  Vgpr128,
  VgprP0,
  VgprP1,
  VgprP2,
  VgprP3,
  VgprP4,
  VgprP5,
  VgprPtr32,
  VgprPtr64,
  VgprPtr128,
  VgprV2S16,
  VgprV2S32,
  VgprV3S32,
  VgprB32,
  VgprB64,
  VgprB96,
  VgprB128,
  VgprB160,
  VgprB256,
  VgprB512,
  VgprBRC,
  VgprV4S16,
  VgprV8S16,
  VgprV16S16,
  VgprV4S32,
  VgprV8S32,
  VgprV2S64,

  // Dst-only modifiers: read-any-lane and truncs.
  UniInVcc,
  UniInVgprS16,
  UniInVgprS32,
  UniInVgprS64,
  UniInVgprV2S16,
  UniInVgprV2S32,
  UniInVgprV3S32,
  UniInVgprV4S32,
  UniInVgprV2S64,
  UniInVgprV6S32,
  UniInVgprV8S16,
  UniInVgprV8S32,
  UniInVgprV16S16,
  UniInVgprV16S32,
  UniInVgprV32S16,
  UniInVgprV32S32,
  UniInVgprB32,
  UniInVgprB64,
  UniInVgprB96,
  UniInVgprB128,
  UniInVgprB160,
  UniInVgprB256,
  UniInVgprB512,

  Sgpr32Trunc,

  // Dst-only modifiers: RegBankSelect assigned VGPR to dst, but the
  // instruction has to produce its result in SGPR. Dst is replaced with an
  // SGPR and the result is then copied back to the original VGPR.
  Sgpr32ToVgprDst,
  Sgpr64ToVgprDst,

  // Src-only modifiers: if divergent, execute in a waterfall loop.
  Sgpr32_WF,
  SgprV4S32_WF,

  // Src-only modifiers: waterfall loop execution for calls.
  SgprP0Call_WF,
  SgprP4Call_WF,

  // Src-only modifiers: operand has to end up in M0. A divergent operand is
  // moved to SGPR with readfirstlane; ISel can then copy the result to M0.
  SgprB32_M0,

  // Src-only modifiers: operand has to be SGPR; when it is in VGPR a
  // readfirstlane is inserted to move it to SGPR.
  SgprB32_ReadFirstLane,
  SgprB64_ReadFirstLane,
  SgprV4S32_ReadFirstLane,
  SgprV8S32_ReadFirstLane,

  // Src-only modifiers: extends.
  Sgpr32AExt,
  Sgpr32AExtBoolInReg,
  Sgpr32SExt,
  Sgpr32ZExt,
  Vgpr32AExt,
  Vgpr32SExt,
  Vgpr32ZExt,

  // Further vgpr vector IDs. They are kept at the end of the enum so the
  // numeric values of the IDs above stay as they are.
  VgprV6S32,
  VgprV16S32,
  VgprV32S16,
  VgprV32S32,
};
| 303 | |
// Selects how an instruction is replaced with a sequence of instructions.
// The legalizer did not lower these because the instruction is available on
// either sgpr or vgpr. For example, S64 AND is available on sgpr, so it is
// legal for the Legalizer, which only checks LLT. But S64 AND is not available
// on vgpr, so there it is lowered to two S32 vgpr ANDs.
enum LoweringMethodID {
  DoNotLower,
  VccExtToSel,
  UniExtToSel,
  UnpackBitShift,
  UnpackMinMax,
  S_BFE,
  V_BFE,
  VgprToVccCopy,
  UniMAD64,
  UniMul64,
  DivSMulToMAD,
  SplitTo32,
  SplitTo32Mul,
  ScalarizeToS16,
  SplitTo32Select,
  SplitTo32SExtInReg,
  S_BUF_to_BUF,
  Ext32To64,
  UniCstExt,
  CtPop64To32,
  SplitLoad,
  WidenLoad,
  WidenMMOToS32,
  UnpackAExt,
  VerifyAllSgpr,
  ApplyAllVgpr,
  UnmergeToShiftTrunc,
  AextToS32InIncomingBlockGPHI,
  VerifyAllSgprGPHI,
  VerifyAllSgprOrVgprGPHI,
  ApplyINTRIN_IMAGE,
  ApplyBVH_INTERSECT_RAY,
  SplitBitCount64To32,
  ExtrVecEltToSel,
  ExtrVecEltTo32,
  InsVecEltToSel,
  InsVecEltTo32,
  AbsToNegMax,
  AbsToS32,
  DynStackAlloc,
  DeletePrefetch,
  LowerSetRounding,
  LowerGetRounding
};
| 354 | |
// Selects which group of types, if any, is served by the "Fast Rules" of a
// SetOfRulesForOpcode. The trailing comments list the types in each group;
// the listing order is not necessarily the slot order (see
// getFastPredicateSlot).
enum FastRulesTypes {
  NoFastRules,
  Standard,  // S16, S32, S64, V2S16
  StandardB, // B32, B64, B96, B128
  Vector,    // S32, V2S32, V3S32, V4S32
};
| 361 | |
| 362 | struct RegBankLLTMapping { |
| 363 | SmallVector<RegBankLLTMappingApplyID, 2> DstOpMapping; |
| 364 | SmallVector<RegBankLLTMappingApplyID, 4> SrcOpMapping; |
| 365 | LoweringMethodID LoweringMethod; |
| 366 | RegBankLLTMapping( |
| 367 | std::initializer_list<RegBankLLTMappingApplyID> DstOpMappingList, |
| 368 | std::initializer_list<RegBankLLTMappingApplyID> SrcOpMappingList, |
| 369 | LoweringMethodID LoweringMethod = DoNotLower); |
| 370 | }; |
| 371 | |
| 372 | struct PredicateMapping { |
| 373 | SmallVector<UniformityLLTOpPredicateID, 4> OpUniformityAndTypes; |
| 374 | std::function<bool(const MachineInstr &)> TestFunc; |
| 375 | PredicateMapping( |
| 376 | std::initializer_list<UniformityLLTOpPredicateID> OpList, |
| 377 | std::function<bool(const MachineInstr &)> TestFunc = nullptr); |
| 378 | |
| 379 | bool match(const MachineInstr &MI, const MachineUniformityInfo &MUI, |
| 380 | const MachineRegisterInfo &MRI) const; |
| 381 | }; |
| 382 | |
| 383 | struct RegBankLegalizeRule { |
| 384 | PredicateMapping Predicate; |
| 385 | RegBankLLTMapping OperandMapping; |
| 386 | }; |
| 387 | |
| 388 | class SetOfRulesForOpcode { |
| 389 | // "Slow Rules". More complex 'Rules[i].Predicate', check them one by one. |
| 390 | SmallVector<RegBankLegalizeRule, 4> Rules; |
| 391 | |
| 392 | // "Fast Rules" |
| 393 | // Instead of testing each 'Rules[i].Predicate' we do direct access to |
| 394 | // RegBankLLTMapping using getFastPredicateSlot. For example if: |
| 395 | // - FastTypes == Standard Uni[0] holds Mapping in case Op 0 is uniform S32 |
| 396 | // - FastTypes == Vector Div[3] holds Mapping in case Op 0 is divergent V4S32 |
| 397 | FastRulesTypes FastTypes = NoFastRules; |
| 398 | #define InvMapping RegBankLLTMapping({InvalidMapping}, {InvalidMapping}) |
| 399 | RegBankLLTMapping Uni[4] = {InvMapping, InvMapping, InvMapping, InvMapping}; |
| 400 | RegBankLLTMapping Div[4] = {InvMapping, InvMapping, InvMapping, InvMapping}; |
| 401 | |
| 402 | public: |
| 403 | SetOfRulesForOpcode(); |
| 404 | SetOfRulesForOpcode(FastRulesTypes FastTypes); |
| 405 | |
| 406 | const RegBankLLTMapping * |
| 407 | findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, |
| 408 | const MachineUniformityInfo &MUI) const; |
| 409 | |
| 410 | void addRule(RegBankLegalizeRule Rule); |
| 411 | |
| 412 | void addFastRuleDivergent(UniformityLLTOpPredicateID Ty, |
| 413 | RegBankLLTMapping RuleApplyIDs); |
| 414 | void addFastRuleUniform(UniformityLLTOpPredicateID Ty, |
| 415 | RegBankLLTMapping RuleApplyIDs); |
| 416 | |
| 417 | private: |
| 418 | int getFastPredicateSlot(UniformityLLTOpPredicateID Ty) const; |
| 419 | }; |
| 420 | |
| 421 | // Essentially 'map<Opcode(or intrinsic_opcode), SetOfRulesForOpcode>' but a |
| 422 | // little more efficient. |
| 423 | class RegBankLegalizeRules { |
| 424 | const GCNSubtarget *ST; |
| 425 | MachineRegisterInfo *MRI; |
| 426 | // Separate maps for G-opcodes and intrinsics since they are in different |
| 427 | // enums. Multiple opcodes can share same set of rules. |
| 428 | // RulesAlias = map<Opcode, KeyOpcode> |
| 429 | // Rules = map<KeyOpcode, SetOfRulesForOpcode> |
| 430 | SmallDenseMap<unsigned, unsigned, 256> GRulesAlias; |
| 431 | SmallDenseMap<unsigned, SetOfRulesForOpcode, 128> GRules; |
| 432 | SmallDenseMap<unsigned, unsigned, 128> IRulesAlias; |
| 433 | SmallDenseMap<unsigned, SetOfRulesForOpcode, 64> IRules; |
| 434 | class RuleSetInitializer { |
| 435 | SetOfRulesForOpcode *RuleSet; |
| 436 | |
| 437 | public: |
| 438 | // Used for clang-format line breaks and to force writing all rules for |
| 439 | // opcode in same place. |
| 440 | template <class AliasMap, class RulesMap> |
| 441 | RuleSetInitializer(std::initializer_list<unsigned> OpcList, |
| 442 | AliasMap &RulesAlias, RulesMap &Rules, |
| 443 | FastRulesTypes FastTypes = NoFastRules) { |
| 444 | unsigned KeyOpcode = *OpcList.begin(); |
| 445 | for (unsigned Opc : OpcList) { |
| 446 | [[maybe_unused]] auto [_, NewInput] = |
| 447 | RulesAlias.try_emplace(Opc, KeyOpcode); |
| 448 | assert(NewInput && "Can't redefine existing Rules" ); |
| 449 | } |
| 450 | |
| 451 | auto [DenseMapIter, NewInput] = Rules.try_emplace(KeyOpcode, FastTypes); |
| 452 | assert(NewInput && "Can't redefine existing Rules" ); |
| 453 | |
| 454 | RuleSet = &DenseMapIter->second; |
| 455 | } |
| 456 | |
| 457 | RuleSetInitializer(const RuleSetInitializer &) = delete; |
| 458 | RuleSetInitializer &operator=(const RuleSetInitializer &) = delete; |
| 459 | RuleSetInitializer(RuleSetInitializer &&) = delete; |
| 460 | RuleSetInitializer &operator=(RuleSetInitializer &&) = delete; |
| 461 | ~RuleSetInitializer() = default; |
| 462 | |
| 463 | RuleSetInitializer &Div(UniformityLLTOpPredicateID Ty, |
| 464 | RegBankLLTMapping RuleApplyIDs, |
| 465 | bool STPred = true) { |
| 466 | if (STPred) |
| 467 | RuleSet->addFastRuleDivergent(Ty, RuleApplyIDs); |
| 468 | return *this; |
| 469 | } |
| 470 | |
| 471 | RuleSetInitializer &Uni(UniformityLLTOpPredicateID Ty, |
| 472 | RegBankLLTMapping RuleApplyIDs, |
| 473 | bool STPred = true) { |
| 474 | if (STPred) |
| 475 | RuleSet->addFastRuleUniform(Ty, RuleApplyIDs); |
| 476 | return *this; |
| 477 | } |
| 478 | |
| 479 | RuleSetInitializer &Any(RegBankLegalizeRule Init, bool STPred = true) { |
| 480 | if (STPred) |
| 481 | RuleSet->addRule(Rule: Init); |
| 482 | return *this; |
| 483 | } |
| 484 | }; |
| 485 | |
| 486 | RuleSetInitializer addRulesForGOpcs(std::initializer_list<unsigned> OpcList, |
| 487 | FastRulesTypes FastTypes = NoFastRules); |
| 488 | |
| 489 | RuleSetInitializer addRulesForIOpcs(std::initializer_list<unsigned> OpcList, |
| 490 | FastRulesTypes FastTypes = NoFastRules); |
| 491 | |
| 492 | public: |
| 493 | // Initialize rules for all opcodes. |
| 494 | RegBankLegalizeRules(const GCNSubtarget &ST, MachineRegisterInfo &MRI); |
| 495 | |
| 496 | // In case we don't want to regenerate same rules, we can use already |
| 497 | // generated rules but need to refresh references to objects that are |
| 498 | // created for this run. |
| 499 | void refreshRefs(const GCNSubtarget &_ST, MachineRegisterInfo &_MRI) { |
| 500 | ST = &_ST; |
| 501 | MRI = &_MRI; |
| 502 | }; |
| 503 | |
| 504 | const SetOfRulesForOpcode *getRulesForOpc(MachineInstr &MI) const; |
| 505 | }; |
| 506 | |
| 507 | } // end namespace AMDGPU |
| 508 | } // end namespace llvm |
| 509 | |
| 510 | #endif |
| 511 | |