1//===- AMDGPURegBankLegalizeRules --------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKLEGALIZERULES_H
10#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKLEGALIZERULES_H
11
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>
#include <functional>
#include <initializer_list>
#include <utility>
15
16namespace llvm {
17
// Forward declarations. This header only uses these types in function
// declarations and through pointers/references, so the declarations below are
// sufficient here.
class LLT;
class MachineRegisterInfo;
class MachineInstr;
class GCNSubtarget;
class MachineFunction;
template <typename T> class GenericUniformityInfo;
template <typename T> class GenericSSAContext;
using MachineSSAContext = GenericSSAContext<MachineFunction>;
using MachineUniformityInfo = GenericUniformityInfo<MachineSSAContext>;
27
28namespace AMDGPU {
29
30/// \returns true if \p Ty is a pointer type with size \p Width.
31bool isAnyPtr(LLT Ty, unsigned Width);
32
// IDs used to build predicate for RegBankLegalizeRule. Predicate can have one
// or more IDs and each represents a check for 'uniform or divergent' + LLT or
// just LLT on register operand.
// Most often checking one operand is enough to decide which RegBankLLTMapping
// to apply (see Fast Rules), IDs are useful when two or more operands need to
// be checked.
//
// Naming: IDs without a prefix check only the LLT, the 'Uni' and 'Div' prefixed
// IDs additionally check that the operand is uniform or divergent.
// Enumerators are unscoped and implicitly numbered, so their order is
// significant; append rather than reorder.
enum UniformityLLTOpPredicateID {
  // Represents non-register and physical register operands.
  _,
  // scalars
  S1,
  S16,
  S32,
  S64,
  S128,

  UniS1,
  UniS16,
  UniS32,
  UniS64,
  UniS128,

  DivS1,
  DivS16,
  DivS32,
  DivS64,
  DivS128,

  // pointers
  P0,
  P1,
  P2,
  P3,
  P4,
  P5,
  P8,
  Ptr32,
  Ptr64,
  Ptr128,

  UniP0,
  UniP1,
  UniP2,
  UniP3,
  UniP4,
  UniP5,
  UniP8,
  UniPtr32,
  UniPtr64,
  UniPtr128,

  DivP0,
  DivP1,
  DivP2,
  DivP3,
  DivP4,
  DivP5,
  DivPtr32,
  DivPtr64,
  DivPtr128,

  // vectors
  V2S16,
  V2S32,
  V2S64,
  V3S32,
  V4S32,

  UniV2S16,
  UniV2S32,
  UniV2S64,

  DivV2S16,
  DivV2S32,
  DivV2S64,
  DivV3S32,
  DivV4S16,

  // B types
  B32,
  B64,
  B96,
  B128,
  B160,
  B256,
  B512,

  UniB32,
  UniB64,
  UniB96,
  UniB128,
  UniB160,
  UniB256,
  UniB512,
  UniBRC,

  DivB32,
  DivB64,
  DivB96,
  DivB128,
  DivB160,
  DivB256,
  DivB512,
  DivBRC
};
138
// How to apply register bank on register operand.
// In most cases, this serves as a LLT and register bank assert.
// Can change operands and insert copies, extends, truncs, and read-any-lanes.
// Anything more complicated requires LoweringMethod.
//
// Enumerators are unscoped and implicitly numbered, so their order is
// significant; append rather than reorder.
enum RegBankLLTMappingApplyID {
  InvalidMapping,
  None,
  IntrId,
  Imm,
  Vcc,

  // sgpr scalars, pointers, vectors and B-types
  Sgpr16,
  Sgpr32,
  Sgpr64,
  Sgpr128,
  SgprP0,
  SgprP1,
  SgprP2,
  SgprP3,
  SgprP4,
  SgprP5,
  SgprP8,
  SgprPtr32,
  SgprPtr64,
  SgprPtr128,
  SgprV2S16,
  SgprV4S32,
  SgprV2S32,
  SgprB32,
  SgprB64,
  SgprB96,
  SgprB128,
  SgprB256,
  SgprB512,
  SgprBRC,

  // vgpr scalars, pointers, vectors and B-types
  Vgpr16,
  Vgpr32,
  Vgpr64,
  Vgpr128,
  VgprP0,
  VgprP1,
  VgprP2,
  VgprP3,
  VgprP4,
  VgprP5,
  VgprPtr32,
  VgprPtr64,
  VgprPtr128,
  VgprV2S16,
  VgprV2S32,
  VgprV3S32,
  VgprB32,
  VgprB64,
  VgprB96,
  VgprB128,
  VgprB160,
  VgprB256,
  VgprB512,
  VgprBRC,
  VgprV4S16,
  VgprV4S32,
  VgprV8S32,
  VgprV2S64,

  // Dst only modifiers: read-any-lane and truncs
  UniInVcc,
  UniInVgprS16,
  UniInVgprS32,
  UniInVgprS64,
  UniInVgprV2S16,
  UniInVgprV2S32,
  UniInVgprV4S32,
  UniInVgprV2S64,
  UniInVgprB32,
  UniInVgprB64,
  UniInVgprB96,
  UniInVgprB128,
  UniInVgprB160,
  UniInVgprB256,
  UniInVgprB512,

  Sgpr32Trunc,

  // Dst only modifiers: dst was assigned VGPR by RegBankSelect but the
  // instruction result must be in SGPR. Replace dst with SGPR, then copy the
  // result back to the original VGPR.
  Sgpr32ToVgprDst,
  Sgpr64ToVgprDst,

  // Src only modifiers: execute in waterfall loop if divergent
  Sgpr32_WF,
  SgprV4S32_WF,

  // Src only modifiers: execute in waterfall loop for calls
  SgprP0Call_WF,
  SgprP4Call_WF,

  // Src only modifiers: for operands that must end up in M0. If divergent,
  // readfirstlane to SGPR. The result can then be copied to M0 in ISel.
  SgprB32_M0,

  // Src only modifiers: operand must be SGPR, if in VGPR, insert readfirstlane
  // to move to SGPR.
  SgprB32_ReadFirstLane,
  SgprB64_ReadFirstLane,

  // Src only modifiers: extends
  Sgpr32AExt,
  Sgpr32AExtBoolInReg,
  Sgpr32SExt,
  Sgpr32ZExt,
  Vgpr32AExt,
  Vgpr32SExt,
  Vgpr32ZExt,
};
257
// Instruction needs to be replaced with a sequence of instructions. Lowering
// was not done by the legalizer since the instruction is available in either
// sgpr or vgpr. For example S64 AND is available on sgpr, for that reason S64
// AND is legal in context of Legalizer that only checks LLT. But S64 AND is
// not available on vgpr. Lower it to two S32 vgpr ANDs.
//
// Enumerators are unscoped and implicitly numbered, so their order is
// significant; append rather than reorder.
enum LoweringMethodID {
  DoNotLower,
  VccExtToSel,
  UniExtToSel,
  UnpackBitShift,
  UnpackMinMax,
  S_BFE,
  V_BFE,
  VgprToVccCopy,
  UniMAD64,
  UniMul64,
  DivSMulToMAD,
  SplitTo32,
  SplitTo32Mul,
  ScalarizeToS16,
  SplitTo32Select,
  SplitTo32SExtInReg,
  Ext32To64,
  UniCstExt,
  SplitLoad,
  WidenLoad,
  WidenMMOToS32,
  UnpackAExt,
  VerifyAllSgpr,
  ApplyAllVgpr,
  UnmergeToShiftTrunc,
  AextToS32InIncomingBlockGPHI,
  VerifyAllSgprGPHI,
  VerifyAllSgprOrVgprGPHI,
  ApplyINTRIN_IMAGE,
  SplitBitCount64To32
};
295
// Selects which group of types, if any, an opcode's "Fast Rules" cover (see
// SetOfRulesForOpcode). The trailing comments list the types of each group.
enum FastRulesTypes {
  NoFastRules,
  Standard,  // S16, S32, S64, V2S16
  StandardB, // B32, B64, B96, B128
  Vector,    // S32, V2S32, V3S32, V4S32
};
302
303struct RegBankLLTMapping {
304 SmallVector<RegBankLLTMappingApplyID, 2> DstOpMapping;
305 SmallVector<RegBankLLTMappingApplyID, 4> SrcOpMapping;
306 LoweringMethodID LoweringMethod;
307 RegBankLLTMapping(
308 std::initializer_list<RegBankLLTMappingApplyID> DstOpMappingList,
309 std::initializer_list<RegBankLLTMappingApplyID> SrcOpMappingList,
310 LoweringMethodID LoweringMethod = DoNotLower);
311};
312
313struct PredicateMapping {
314 SmallVector<UniformityLLTOpPredicateID, 4> OpUniformityAndTypes;
315 std::function<bool(const MachineInstr &)> TestFunc;
316 PredicateMapping(
317 std::initializer_list<UniformityLLTOpPredicateID> OpList,
318 std::function<bool(const MachineInstr &)> TestFunc = nullptr);
319
320 bool match(const MachineInstr &MI, const MachineUniformityInfo &MUI,
321 const MachineRegisterInfo &MRI) const;
322};
323
324struct RegBankLegalizeRule {
325 PredicateMapping Predicate;
326 RegBankLLTMapping OperandMapping;
327};
328
329class SetOfRulesForOpcode {
330 // "Slow Rules". More complex 'Rules[i].Predicate', check them one by one.
331 SmallVector<RegBankLegalizeRule, 4> Rules;
332
333 // "Fast Rules"
334 // Instead of testing each 'Rules[i].Predicate' we do direct access to
335 // RegBankLLTMapping using getFastPredicateSlot. For example if:
336 // - FastTypes == Standard Uni[0] holds Mapping in case Op 0 is uniform S32
337 // - FastTypes == Vector Div[3] holds Mapping in case Op 0 is divergent V4S32
338 FastRulesTypes FastTypes = NoFastRules;
339#define InvMapping RegBankLLTMapping({InvalidMapping}, {InvalidMapping})
340 RegBankLLTMapping Uni[4] = {InvMapping, InvMapping, InvMapping, InvMapping};
341 RegBankLLTMapping Div[4] = {InvMapping, InvMapping, InvMapping, InvMapping};
342
343public:
344 SetOfRulesForOpcode();
345 SetOfRulesForOpcode(FastRulesTypes FastTypes);
346
347 const RegBankLLTMapping *
348 findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI,
349 const MachineUniformityInfo &MUI) const;
350
351 void addRule(RegBankLegalizeRule Rule);
352
353 void addFastRuleDivergent(UniformityLLTOpPredicateID Ty,
354 RegBankLLTMapping RuleApplyIDs);
355 void addFastRuleUniform(UniformityLLTOpPredicateID Ty,
356 RegBankLLTMapping RuleApplyIDs);
357
358private:
359 int getFastPredicateSlot(UniformityLLTOpPredicateID Ty) const;
360};
361
362// Essentially 'map<Opcode(or intrinsic_opcode), SetOfRulesForOpcode>' but a
363// little more efficient.
364class RegBankLegalizeRules {
365 const GCNSubtarget *ST;
366 MachineRegisterInfo *MRI;
367 // Separate maps for G-opcodes and intrinsics since they are in different
368 // enums. Multiple opcodes can share same set of rules.
369 // RulesAlias = map<Opcode, KeyOpcode>
370 // Rules = map<KeyOpcode, SetOfRulesForOpcode>
371 SmallDenseMap<unsigned, unsigned, 256> GRulesAlias;
372 SmallDenseMap<unsigned, SetOfRulesForOpcode, 128> GRules;
373 SmallDenseMap<unsigned, unsigned, 128> IRulesAlias;
374 SmallDenseMap<unsigned, SetOfRulesForOpcode, 64> IRules;
375 class RuleSetInitializer {
376 SetOfRulesForOpcode *RuleSet;
377
378 public:
379 // Used for clang-format line breaks and to force writing all rules for
380 // opcode in same place.
381 template <class AliasMap, class RulesMap>
382 RuleSetInitializer(std::initializer_list<unsigned> OpcList,
383 AliasMap &RulesAlias, RulesMap &Rules,
384 FastRulesTypes FastTypes = NoFastRules) {
385 unsigned KeyOpcode = *OpcList.begin();
386 for (unsigned Opc : OpcList) {
387 [[maybe_unused]] auto [_, NewInput] =
388 RulesAlias.try_emplace(Opc, KeyOpcode);
389 assert(NewInput && "Can't redefine existing Rules");
390 }
391
392 auto [DenseMapIter, NewInput] = Rules.try_emplace(KeyOpcode, FastTypes);
393 assert(NewInput && "Can't redefine existing Rules");
394
395 RuleSet = &DenseMapIter->second;
396 }
397
398 RuleSetInitializer(const RuleSetInitializer &) = delete;
399 RuleSetInitializer &operator=(const RuleSetInitializer &) = delete;
400 RuleSetInitializer(RuleSetInitializer &&) = delete;
401 RuleSetInitializer &operator=(RuleSetInitializer &&) = delete;
402 ~RuleSetInitializer() = default;
403
404 RuleSetInitializer &Div(UniformityLLTOpPredicateID Ty,
405 RegBankLLTMapping RuleApplyIDs,
406 bool STPred = true) {
407 if (STPred)
408 RuleSet->addFastRuleDivergent(Ty, RuleApplyIDs);
409 return *this;
410 }
411
412 RuleSetInitializer &Uni(UniformityLLTOpPredicateID Ty,
413 RegBankLLTMapping RuleApplyIDs,
414 bool STPred = true) {
415 if (STPred)
416 RuleSet->addFastRuleUniform(Ty, RuleApplyIDs);
417 return *this;
418 }
419
420 RuleSetInitializer &Any(RegBankLegalizeRule Init, bool STPred = true) {
421 if (STPred)
422 RuleSet->addRule(Rule: Init);
423 return *this;
424 }
425 };
426
427 RuleSetInitializer addRulesForGOpcs(std::initializer_list<unsigned> OpcList,
428 FastRulesTypes FastTypes = NoFastRules);
429
430 RuleSetInitializer addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
431 FastRulesTypes FastTypes = NoFastRules);
432
433public:
434 // Initialize rules for all opcodes.
435 RegBankLegalizeRules(const GCNSubtarget &ST, MachineRegisterInfo &MRI);
436
437 // In case we don't want to regenerate same rules, we can use already
438 // generated rules but need to refresh references to objects that are
439 // created for this run.
440 void refreshRefs(const GCNSubtarget &_ST, MachineRegisterInfo &_MRI) {
441 ST = &_ST;
442 MRI = &_MRI;
443 };
444
445 const SetOfRulesForOpcode *getRulesForOpc(MachineInstr &MI) const;
446};
447
448} // end namespace AMDGPU
449} // end namespace llvm
450
451#endif
452