1//===- AMDGPURegBankLegalizeRules --------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKLEGALIZERULES_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKLEGALIZERULES_H
11
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>
#include <functional>
#include <initializer_list>
#include <utility>
15
16namespace llvm {
17
18class LLT;
19class MachineRegisterInfo;
20class MachineInstr;
21class GCNSubtarget;
22class MachineFunction;
23template <typename T> class GenericUniformityInfo;
24template <typename T> class GenericSSAContext;
25using MachineSSAContext = GenericSSAContext<MachineFunction>;
26using MachineUniformityInfo = GenericUniformityInfo<MachineSSAContext>;
27
28namespace AMDGPU {
29
/// \returns true if \p Ty is a pointer type with size \p Width.
/// NOTE(review): \p Width is presumably a size in bits; the definition is not
/// visible in this header, so confirm against the implementation.
bool isAnyPtr(LLT Ty, unsigned Width);
32
// IDs used to build predicate for RegBankLegalizeRule. Predicate can have one
// or more IDs and each represents a check for 'uniform or divergent' + LLT or
// just LLT on register operand.
// Most often checking one operand is enough to decide which RegBankLLTMapping
// to apply (see Fast Rules), IDs are useful when two or more operands need to
// be checked.
//
// Naming convention used below: a bare type name (S32, P1, V2S16, B64, ...) is
// an LLT-only check, a 'Uni' prefix adds a uniform check and a 'Div' prefix
// adds a divergent check.
//
// The enumerators have no explicit values, so their numeric values follow
// declaration order; inserting or reordering entries renumbers everything
// after the change.
enum UniformityLLTOpPredicateID {
  // Represents non-register and physical register operands.
  _,
  // scalars
  // LLT-only
  S1,
  S16,
  S32,
  S64,
  S128,

  // uniform + LLT
  UniS1,
  UniS16,
  UniS32,
  UniS64,
  UniS128,

  // divergent + LLT
  DivS1,
  DivS16,
  DivS32,
  DivS64,
  DivS128,

  // any LLT, divergent-check only predicate
  DivAnyTy,

  // pointers
  // LLT-only
  P0,
  P1,
  P2,
  P3,
  P4,
  P5,
  P8,
  Ptr32,
  Ptr64,
  Ptr128,

  // uniform + LLT
  // NOTE(review): UniP6 has no P6 or DivP6 counterpart in this enum.
  UniP0,
  UniP1,
  UniP2,
  UniP3,
  UniP4,
  UniP5,
  UniP6,
  UniP8,
  UniPtr32,
  UniPtr64,
  UniPtr128,

  // divergent + LLT
  // NOTE(review): there is no DivP8 although P8 and UniP8 exist.
  DivP0,
  DivP1,
  DivP2,
  DivP3,
  DivP4,
  DivP5,
  DivPtr32,
  DivPtr64,
  DivPtr128,

  // vectors
  // LLT-only
  V2S16,
  V2S32,
  V2S64,
  V3S32,
  V4S32,
  V32S32,

  // uniform + LLT
  UniV2S16,
  UniV2S32,
  UniV4S32,
  UniV2S64,
  UniV3S32,
  UniV6S32,
  UniV8S16,
  UniV8S32,
  UniV16S16,
  UniV16S32,
  UniV32S16,
  UniV32S32,

  // divergent + LLT
  // NOTE(review): DivV4S16 has no V4S16 or UniV4S16 counterpart in this enum.
  DivV2S16,
  DivV2S32,
  DivV4S32,
  DivV2S64,
  DivV3S32,
  DivV4S16,
  DivV8S16,
  DivV8S32,
  DivV16S16,
  DivV16S32,
  DivV6S32,
  DivV32S16,
  DivV32S32,

  // B types
  // LLT-only
  B32,
  B64,
  B96,
  B128,
  B160,
  B256,
  B512,
  BRC,

  // uniform + LLT
  UniB32,
  UniB64,
  UniB96,
  UniB128,
  UniB160,
  UniB256,
  UniB512,
  UniBRC,

  // divergent + LLT
  DivB32,
  DivB64,
  DivB96,
  DivB128,
  DivB160,
  DivB256,
  DivB512,
  DivBRC
};
161
// How to apply register bank on register operand.
// In most cases, this serves as a LLT and register bank assert.
// Can change operands and insert copies, extends, truncs, and read-any-lanes.
// Anything more complicated requires LoweringMethod.
//
// The enumerators have no explicit values, so their numeric values follow
// declaration order.
enum RegBankLLTMappingApplyID {
  // Placeholder used to initialize fast-rule slots that have no rule yet (see
  // the InvMapping macro in SetOfRulesForOpcode).
  InvalidMapping,
  None,
  IntrId,
  Imm,
  Vcc,

  // any LLT, bank-only apply IDs
  VgprAnyTy,
  AgprAnyTy,
  VgprOrAgprAnyTy,

  // sgpr scalars, pointers, vectors and B-types
  Sgpr16,
  Sgpr32,
  Sgpr64,
  Sgpr128,
  SgprP0,
  SgprP1,
  SgprP2,
  SgprP3,
  SgprP4,
  SgprP5,
  SgprP6,
  SgprP8,
  SgprPtr32,
  SgprPtr64,
  SgprPtr128,
  SgprV2S16,
  SgprV4S32,
  SgprV2S32,
  SgprB32,
  SgprB64,
  SgprB96,
  SgprB128,
  SgprB256,
  SgprB512,
  SgprBRC,

  // vgpr scalars, pointers, vectors and B-types
  Vgpr16,
  Vgpr32,
  Vgpr64,
  Vgpr128,
  VgprP0,
  VgprP1,
  VgprP2,
  VgprP3,
  VgprP4,
  VgprP5,
  VgprPtr32,
  VgprPtr64,
  VgprPtr128,
  VgprV2S16,
  VgprV2S32,
  VgprV3S32,
  VgprB32,
  VgprB64,
  VgprB96,
  VgprB128,
  VgprB160,
  VgprB256,
  VgprB512,
  VgprBRC,
  VgprV4S16,
  VgprV8S16,
  VgprV16S16,
  VgprV4S32,
  VgprV8S32,
  VgprV2S64,

  // Dst only modifiers: read-any-lane and truncs
  UniInVcc,
  UniInVgprS16,
  UniInVgprS32,
  UniInVgprS64,
  UniInVgprV2S16,
  UniInVgprV2S32,
  UniInVgprV3S32,
  UniInVgprV4S32,
  UniInVgprV2S64,
  UniInVgprV6S32,
  UniInVgprV8S16,
  UniInVgprV8S32,
  UniInVgprV16S16,
  UniInVgprV16S32,
  UniInVgprV32S16,
  UniInVgprV32S32,
  UniInVgprB32,
  UniInVgprB64,
  UniInVgprB96,
  UniInVgprB128,
  UniInVgprB160,
  UniInVgprB256,
  UniInVgprB512,

  Sgpr32Trunc,

  // Dst only modifiers: dst was assigned VGPR by RegBankSelect but the
  // instruction result must be in SGPR. Replace dst with SGPR, then copy the
  // result back to the original VGPR.
  Sgpr32ToVgprDst,
  Sgpr64ToVgprDst,

  // Src only modifiers: execute in waterfall loop if divergent
  Sgpr32_WF,
  SgprV4S32_WF,

  // Src only modifiers: execute in waterfall loop for calls
  SgprP0Call_WF,
  SgprP4Call_WF,

  // Src only modifiers: for operands that must end up in M0. If divergent,
  // readfirstlane to SGPR. The result can then be copied to M0 in ISel.
  SgprB32_M0,

  // Src only modifiers: operand must be SGPR, if in VGPR, insert readfirstlane
  // to move to SGPR.
  SgprB32_ReadFirstLane,
  SgprB64_ReadFirstLane,
  SgprV4S32_ReadFirstLane,
  SgprV8S32_ReadFirstLane,

  // Src only modifiers: extends
  Sgpr32AExt,
  Sgpr32AExtBoolInReg,
  Sgpr32SExt,
  Sgpr32ZExt,
  Vgpr32AExt,
  Vgpr32SExt,
  Vgpr32ZExt,

  // Additional vgpr vector IDs. These are not extend modifiers.
  // NOTE(review): they sit at the end of the enum rather than next to the
  // other vgpr vector IDs above; presumably appended later.
  VgprV6S32,
  VgprV16S32,
  VgprV32S16,
  VgprV32S32,
};
303
// The instruction needs to be replaced with a sequence of instructions.
// Lowering was not done by the legalizer since the instruction is available in
// either sgpr or vgpr.
// For example S64 AND is available on sgpr, for that reason S64 AND is legal in
// context of Legalizer that only checks LLT. But S64 AND is not available on
// vgpr. Lower it to two S32 vgpr ANDs.
//
// DoNotLower is the first enumerator and is the default LoweringMethod of
// RegBankLLTMapping. Enumerators have no explicit values, so numeric values
// follow declaration order.
enum LoweringMethodID {
  DoNotLower,
  VccExtToSel,
  UniExtToSel,
  UnpackBitShift,
  UnpackMinMax,
  S_BFE,
  V_BFE,
  VgprToVccCopy,
  UniMAD64,
  UniMul64,
  DivSMulToMAD,
  SplitTo32,
  SplitTo32Mul,
  ScalarizeToS16,
  SplitTo32Select,
  SplitTo32SExtInReg,
  S_BUF_to_BUF,
  Ext32To64,
  UniCstExt,
  CtPop64To32,
  SplitLoad,
  WidenLoad,
  WidenMMOToS32,
  UnpackAExt,
  VerifyAllSgpr,
  ApplyAllVgpr,
  UnmergeToShiftTrunc,
  AextToS32InIncomingBlockGPHI,
  VerifyAllSgprGPHI,
  VerifyAllSgprOrVgprGPHI,
  ApplyINTRIN_IMAGE,
  ApplyBVH_INTERSECT_RAY,
  SplitBitCount64To32,
  ExtrVecEltToSel,
  ExtrVecEltTo32,
  InsVecEltToSel,
  InsVecEltTo32,
  AbsToNegMax,
  AbsToS32,
  DynStackAlloc,
  DeletePrefetch,
  LowerSetRounding,
  LowerGetRounding
};
354
// Selects which set of four types the fast-rule tables of SetOfRulesForOpcode
// (its 4-entry Uni and Div arrays) are indexed by. The trailing comment on each
// enumerator lists the four types covered.
// NOTE(review): the order in which the types are listed is not necessarily the
// slot order; the SetOfRulesForOpcode comment places S32 in slot 0 for
// Standard. The actual mapping is in getFastPredicateSlot.
enum FastRulesTypes {
  NoFastRules,
  Standard,  // S16, S32, S64, V2S16
  StandardB, // B32, B64, B96, B128
  Vector,    // S32, V2S32, V3S32, V4S32
};
361
// Describes what to do with an instruction: a list of apply IDs for its
// destination operands, a list for its source operands, and the lowering
// method to use.
// NOTE(review): the lists presumably hold one entry per operand in operand
// order; confirm against the code that consumes them.
struct RegBankLLTMapping {
  // Apply IDs for destination operands.
  SmallVector<RegBankLLTMappingApplyID, 2> DstOpMapping;
  // Apply IDs for source operands.
  SmallVector<RegBankLLTMappingApplyID, 4> SrcOpMapping;
  // Lowering to perform; the constructor defaults this to DoNotLower.
  LoweringMethodID LoweringMethod;
  // Builds a mapping from brace-enclosed lists, e.g.
  // RegBankLLTMapping({InvalidMapping}, {InvalidMapping}).
  RegBankLLTMapping(
      std::initializer_list<RegBankLLTMappingApplyID> DstOpMappingList,
      std::initializer_list<RegBankLLTMappingApplyID> SrcOpMappingList,
      LoweringMethodID LoweringMethod = DoNotLower);
};
371
// Predicate of a RegBankLegalizeRule: a list of UniformityLLTOpPredicateIDs
// plus an optional extra test on the instruction itself.
struct PredicateMapping {
  // Uniformity/LLT checks for the instruction's operands.
  SmallVector<UniformityLLTOpPredicateID, 4> OpUniformityAndTypes;
  // Optional additional check; empty (nullptr) unless one is passed to the
  // constructor.
  std::function<bool(const MachineInstr &)> TestFunc;
  PredicateMapping(
      std::initializer_list<UniformityLLTOpPredicateID> OpList,
      std::function<bool(const MachineInstr &)> TestFunc = nullptr);

  // Tests this predicate against \p MI. Defined out of line.
  // NOTE(review): presumably returns true only when every operand check and
  // TestFunc (if set) accept MI; confirm against the implementation.
  bool match(const MachineInstr &MI, const MachineUniformityInfo &MUI,
             const MachineRegisterInfo &MRI) const;
};
382
// A single rule: a predicate paired with an operand mapping.
// NOTE(review): presumably OperandMapping is the mapping selected when
// Predicate matches an instruction; confirm against findMappingForMI.
struct RegBankLegalizeRule {
  PredicateMapping Predicate;
  RegBankLLTMapping OperandMapping;
};
387
// All rules registered for one opcode: a list of "slow" rules and two 4-entry
// "fast" tables, one for uniform and one for divergent.
class SetOfRulesForOpcode {
  // "Slow Rules". More complex 'Rules[i].Predicate', check them one by one.
  SmallVector<RegBankLegalizeRule, 4> Rules;

  // "Fast Rules"
  // Instead of testing each 'Rules[i].Predicate' we do direct access to
  // RegBankLLTMapping using getFastPredicateSlot. For example if:
  // - FastTypes == Standard Uni[0] holds Mapping in case Op 0 is uniform S32
  // - FastTypes == Vector Div[3] holds Mapping in case Op 0 is divergent V4S32
  FastRulesTypes FastTypes = NoFastRules;
  // Every fast slot starts out as an InvalidMapping placeholder.
  // NOTE(review): InvMapping is a macro defined in a header and never
  // #undef'd in this file, so it stays visible in every translation unit that
  // includes this header.
#define InvMapping RegBankLLTMapping({InvalidMapping}, {InvalidMapping})
  RegBankLLTMapping Uni[4] = {InvMapping, InvMapping, InvMapping, InvMapping};
  RegBankLLTMapping Div[4] = {InvMapping, InvMapping, InvMapping, InvMapping};

public:
  SetOfRulesForOpcode();
  SetOfRulesForOpcode(FastRulesTypes FastTypes);

  // Looks up the mapping to use for \p MI. Defined out of line.
  // NOTE(review): presumably returns nullptr when no rule applies; confirm
  // against the implementation.
  const RegBankLLTMapping *
  findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI,
                   const MachineUniformityInfo &MUI) const;

  // Adds a "slow" rule.
  void addRule(RegBankLegalizeRule Rule);

  // Add a "fast" rule for type \p Ty to the divergent / uniform table.
  void addFastRuleDivergent(UniformityLLTOpPredicateID Ty,
                            RegBankLLTMapping RuleApplyIDs);
  void addFastRuleUniform(UniformityLLTOpPredicateID Ty,
                          RegBankLLTMapping RuleApplyIDs);

private:
  // Maps \p Ty to an index into Uni/Div for the current FastTypes.
  // NOTE(review): the int return type suggests a negative sentinel for
  // unsupported types; confirm against the implementation.
  int getFastPredicateSlot(UniformityLLTOpPredicateID Ty) const;
};
420
421// Essentially 'map<Opcode(or intrinsic_opcode), SetOfRulesForOpcode>' but a
422// little more efficient.
423class RegBankLegalizeRules {
424 const GCNSubtarget *ST;
425 MachineRegisterInfo *MRI;
426 // Separate maps for G-opcodes and intrinsics since they are in different
427 // enums. Multiple opcodes can share same set of rules.
428 // RulesAlias = map<Opcode, KeyOpcode>
429 // Rules = map<KeyOpcode, SetOfRulesForOpcode>
430 SmallDenseMap<unsigned, unsigned, 256> GRulesAlias;
431 SmallDenseMap<unsigned, SetOfRulesForOpcode, 128> GRules;
432 SmallDenseMap<unsigned, unsigned, 128> IRulesAlias;
433 SmallDenseMap<unsigned, SetOfRulesForOpcode, 64> IRules;
434 class RuleSetInitializer {
435 SetOfRulesForOpcode *RuleSet;
436
437 public:
438 // Used for clang-format line breaks and to force writing all rules for
439 // opcode in same place.
440 template <class AliasMap, class RulesMap>
441 RuleSetInitializer(std::initializer_list<unsigned> OpcList,
442 AliasMap &RulesAlias, RulesMap &Rules,
443 FastRulesTypes FastTypes = NoFastRules) {
444 unsigned KeyOpcode = *OpcList.begin();
445 for (unsigned Opc : OpcList) {
446 [[maybe_unused]] auto [_, NewInput] =
447 RulesAlias.try_emplace(Opc, KeyOpcode);
448 assert(NewInput && "Can't redefine existing Rules");
449 }
450
451 auto [DenseMapIter, NewInput] = Rules.try_emplace(KeyOpcode, FastTypes);
452 assert(NewInput && "Can't redefine existing Rules");
453
454 RuleSet = &DenseMapIter->second;
455 }
456
457 RuleSetInitializer(const RuleSetInitializer &) = delete;
458 RuleSetInitializer &operator=(const RuleSetInitializer &) = delete;
459 RuleSetInitializer(RuleSetInitializer &&) = delete;
460 RuleSetInitializer &operator=(RuleSetInitializer &&) = delete;
461 ~RuleSetInitializer() = default;
462
463 RuleSetInitializer &Div(UniformityLLTOpPredicateID Ty,
464 RegBankLLTMapping RuleApplyIDs,
465 bool STPred = true) {
466 if (STPred)
467 RuleSet->addFastRuleDivergent(Ty, RuleApplyIDs);
468 return *this;
469 }
470
471 RuleSetInitializer &Uni(UniformityLLTOpPredicateID Ty,
472 RegBankLLTMapping RuleApplyIDs,
473 bool STPred = true) {
474 if (STPred)
475 RuleSet->addFastRuleUniform(Ty, RuleApplyIDs);
476 return *this;
477 }
478
479 RuleSetInitializer &Any(RegBankLegalizeRule Init, bool STPred = true) {
480 if (STPred)
481 RuleSet->addRule(Rule: Init);
482 return *this;
483 }
484 };
485
486 RuleSetInitializer addRulesForGOpcs(std::initializer_list<unsigned> OpcList,
487 FastRulesTypes FastTypes = NoFastRules);
488
489 RuleSetInitializer addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
490 FastRulesTypes FastTypes = NoFastRules);
491
492public:
493 // Initialize rules for all opcodes.
494 RegBankLegalizeRules(const GCNSubtarget &ST, MachineRegisterInfo &MRI);
495
496 // In case we don't want to regenerate same rules, we can use already
497 // generated rules but need to refresh references to objects that are
498 // created for this run.
499 void refreshRefs(const GCNSubtarget &_ST, MachineRegisterInfo &_MRI) {
500 ST = &_ST;
501 MRI = &_MRI;
502 };
503
504 const SetOfRulesForOpcode *getRulesForOpc(MachineInstr &MI) const;
505};
506
507} // end namespace AMDGPU
508} // end namespace llvm
509
510#endif
511