1//===-------------- GCNRewritePartialRegUses.cpp --------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// RenameIndependentSubregs pass leaves large partially used super registers,
10/// for example:
11/// undef %0.sub4:VReg_1024 = ...
12/// %0.sub5:VReg_1024 = ...
13/// %0.sub6:VReg_1024 = ...
14/// %0.sub7:VReg_1024 = ...
15/// use %0.sub4_sub5_sub6_sub7
16/// use %0.sub6_sub7
17///
18/// GCNRewritePartialRegUses goes right after RenameIndependentSubregs and
19/// rewrites such partially used super registers with registers of minimal size:
20/// undef %0.sub0:VReg_128 = ...
21/// %0.sub1:VReg_128 = ...
22/// %0.sub2:VReg_128 = ...
23/// %0.sub3:VReg_128 = ...
24/// use %0.sub0_sub1_sub2_sub3
25/// use %0.sub2_sub3
26///
/// This allows us to avoid tracking subreg lanemasks during register pressure
/// calculation and creates more opportunities for code that is unaware of
/// lanemasks.
29//===----------------------------------------------------------------------===//
30
31#include "AMDGPU.h"
32#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
33#include "SIRegisterInfo.h"
34#include "llvm/CodeGen/LiveInterval.h"
35#include "llvm/CodeGen/LiveIntervals.h"
36#include "llvm/CodeGen/MachineFunctionPass.h"
37#include "llvm/CodeGen/MachineInstrBuilder.h"
38#include "llvm/CodeGen/MachineRegisterInfo.h"
39#include "llvm/CodeGen/TargetInstrInfo.h"
40#include "llvm/InitializePasses.h"
41#include "llvm/Pass.h"
42
43using namespace llvm;
44
45#define DEBUG_TYPE "rewrite-partial-reg-uses"
46
47namespace {
48
class GCNRewritePartialRegUses : public MachineFunctionPass {
public:
  static char ID;
  GCNRewritePartialRegUses() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "Rewrite Partial Register Uses";
  }

  // The pass only rewrites virtual registers and patches up live intervals;
  // the CFG is untouched and LIS/SlotIndexes are kept up to date if present.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addPreserved<LiveIntervalsWrapperPass>();
    AU.addPreserved<SlotIndexesWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

private:
  // Per-function state, initialized at the start of runOnMachineFunction.
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const TargetInstrInfo *TII;
  LiveIntervals *LIS; // May be null if live intervals aren't available.

  /// Rewrite partially used register Reg by shifting all its subregisters to
  /// the right and replacing the original register with a register of minimal
  /// size. Return true if the change has been made.
  bool rewriteReg(Register Reg) const;

  /// Value type for SubRegMap below.
  struct SubRegInfo {
    /// Register class required to hold the value stored in the SubReg.
    const TargetRegisterClass *RC;

    /// Index for the right-shifted subregister. If 0 this is the "covering"
    /// subreg i.e. subreg that covers all others. Covering subreg becomes the
    /// whole register after the replacement.
    unsigned SubReg = AMDGPU::NoSubRegister;
    SubRegInfo(const TargetRegisterClass *RC_ = nullptr) : RC(RC_) {}
  };

  /// Map OldSubReg -> { RC, NewSubReg }. Used as in/out container.
  using SubRegMap = SmallDenseMap<unsigned, SubRegInfo>;

  /// Given register class RC and the set of used subregs as keys in the SubRegs
  /// map return new register class and indexes of right-shifted subregs as
  /// values in SubRegs map such that the resulting regclass would contain
  /// registers of minimal size.
  const TargetRegisterClass *getMinSizeReg(const TargetRegisterClass *RC,
                                           SubRegMap &SubRegs) const;

  /// Given regclass RC and pairs of [OldSubReg, SubRegRC] in SubRegs try to
  /// find new regclass such that:
  ///   1. It has subregs obtained by shifting each OldSubReg by RShift number
  ///      of bits to the right. Every "shifted" subreg should have the same
  ///      SubRegRC. If CoverSubregIdx is not zero it's a subreg that "covers"
  ///      all other subregs in pairs. Basically such subreg becomes a whole
  ///      register.
  ///   2. Resulting register class contains registers of minimal size but not
  ///      less than RegNumBits.
  ///
  /// SubRegs is map of OldSubReg -> [SubRegRC, NewSubReg] and is used as in/out
  /// parameter:
  ///   OldSubReg - input parameter,
  ///   SubRegRC  - input parameter (cannot be null),
  ///   NewSubReg - output, contains shifted subregs on return.
  const TargetRegisterClass *
  getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift,
                                unsigned RegNumBits, unsigned CoverSubregIdx,
                                SubRegMap &SubRegs) const;

  /// Update live intervals after rewriting OldReg to NewReg with SubRegs map
  /// describing OldSubReg -> NewSubReg mapping.
  void updateLiveIntervals(Register OldReg, Register NewReg,
                           SubRegMap &SubRegs) const;

  /// Helper methods.

  /// Return reg class expected by a MO's parent instruction for a given MO.
  const TargetRegisterClass *getOperandRegClass(MachineOperand &MO) const;

  /// Find right-shifted by RShift amount version of the SubReg if it exists,
  /// return 0 otherwise.
  unsigned shiftSubReg(unsigned SubReg, unsigned RShift) const;

  /// Find subreg index with a given Offset and Size, return 0 if there is no
  /// such subregister index. The result is cached in SubRegs data-member.
  unsigned getSubReg(unsigned Offset, unsigned Size) const;

  /// Cache for getSubReg method: {Offset, Size} -> SubReg index.
  mutable SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> SubRegs;

  /// Return bit mask that contains all register classes that are projected into
  /// RC by SubRegIdx. The result is cached in SuperRegMasks data-member.
  const uint32_t *getSuperRegClassMask(const TargetRegisterClass *RC,
                                       unsigned SubRegIdx) const;

  /// Cache for getSuperRegClassMask method: { RC, SubRegIdx } -> Class bitmask.
  mutable SmallDenseMap<std::pair<const TargetRegisterClass *, unsigned>,
                        const uint32_t *>
      SuperRegMasks;

  /// Return bitmask containing all allocatable register classes with registers
  /// aligned at AlignNumBits. The result is cached in
  /// AllocatableAndAlignedRegClassMasks data-member.
  const BitVector &
  getAllocatableAndAlignedRegClassMask(unsigned AlignNumBits) const;

  /// Cache for getAllocatableAndAlignedRegClassMask method:
  /// AlignNumBits -> Class bitmask.
  mutable SmallDenseMap<unsigned, BitVector> AllocatableAndAlignedRegClassMasks;
};
161
162} // end anonymous namespace
163
164// TODO: move this to the tablegen and use binary search by Offset.
165unsigned GCNRewritePartialRegUses::getSubReg(unsigned Offset,
166 unsigned Size) const {
167 const auto [I, Inserted] = SubRegs.try_emplace(Key: {Offset, Size}, Args: 0);
168 if (Inserted) {
169 for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) {
170 if (TRI->getSubRegIdxOffset(Idx) == Offset &&
171 TRI->getSubRegIdxSize(Idx) == Size) {
172 I->second = Idx;
173 break;
174 }
175 }
176 }
177 return I->second;
178}
179
180unsigned GCNRewritePartialRegUses::shiftSubReg(unsigned SubReg,
181 unsigned RShift) const {
182 unsigned Offset = TRI->getSubRegIdxOffset(Idx: SubReg) - RShift;
183 return getSubReg(Offset, Size: TRI->getSubRegIdxSize(Idx: SubReg));
184}
185
186const uint32_t *
187GCNRewritePartialRegUses::getSuperRegClassMask(const TargetRegisterClass *RC,
188 unsigned SubRegIdx) const {
189 const auto [I, Inserted] =
190 SuperRegMasks.try_emplace(Key: {RC, SubRegIdx}, Args: nullptr);
191 if (Inserted) {
192 for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) {
193 if (RCI.getSubReg() == SubRegIdx) {
194 I->second = RCI.getMask();
195 break;
196 }
197 }
198 }
199 return I->second;
200}
201
202const BitVector &GCNRewritePartialRegUses::getAllocatableAndAlignedRegClassMask(
203 unsigned AlignNumBits) const {
204 const auto [I, Inserted] =
205 AllocatableAndAlignedRegClassMasks.try_emplace(Key: AlignNumBits);
206 if (Inserted) {
207 BitVector &BV = I->second;
208 BV.resize(N: TRI->getNumRegClasses());
209 for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) {
210 auto *RC = TRI->getRegClass(RCID: ClassID);
211 if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits))
212 BV.set(ClassID);
213 }
214 }
215 return I->second;
216}
217
218const TargetRegisterClass *
219GCNRewritePartialRegUses::getRegClassWithShiftedSubregs(
220 const TargetRegisterClass *RC, unsigned RShift, unsigned RegNumBits,
221 unsigned CoverSubregIdx, SubRegMap &SubRegs) const {
222
223 unsigned RCAlign = TRI->getRegClassAlignmentNumBits(RC);
224 LLVM_DEBUG(dbgs() << " Shift " << RShift << ", reg align " << RCAlign
225 << '\n');
226
227 BitVector ClassMask(getAllocatableAndAlignedRegClassMask(AlignNumBits: RCAlign));
228 for (auto &[OldSubReg, SRI] : SubRegs) {
229 auto &[SubRegRC, NewSubReg] = SRI;
230 assert(SubRegRC);
231
232 LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(OldSubReg) << ':'
233 << TRI->getRegClassName(SubRegRC)
234 << (SubRegRC->isAllocatable() ? "" : " not alloc")
235 << " -> ");
236
237 if (OldSubReg == CoverSubregIdx) {
238 // Covering subreg will become a full register, RC should be allocatable.
239 assert(SubRegRC->isAllocatable());
240 NewSubReg = AMDGPU::NoSubRegister;
241 LLVM_DEBUG(dbgs() << "whole reg");
242 } else {
243 NewSubReg = shiftSubReg(SubReg: OldSubReg, RShift);
244 if (!NewSubReg) {
245 LLVM_DEBUG(dbgs() << "none\n");
246 return nullptr;
247 }
248 LLVM_DEBUG(dbgs() << TRI->getSubRegIndexName(NewSubReg));
249 }
250
251 const uint32_t *Mask = NewSubReg ? getSuperRegClassMask(RC: SubRegRC, SubRegIdx: NewSubReg)
252 : SubRegRC->getSubClassMask();
253 if (!Mask)
254 llvm_unreachable("no register class mask?");
255
256 ClassMask.clearBitsNotInMask(Mask);
257 // Don't try to early exit because checking if ClassMask has set bits isn't
258 // that cheap and we expect it to pass in most cases.
259 LLVM_DEBUG(dbgs() << ", num regclasses " << ClassMask.count() << '\n');
260 }
261
262 // ClassMask is the set of all register classes such that each class is
263 // allocatable, aligned, has all shifted subregs and each subreg has required
264 // register class (see SubRegRC above). Now select first (that is largest)
265 // register class with registers of minimal but not less than RegNumBits size.
266 // We have to check register size because we may encounter classes of smaller
267 // registers like VReg_1 in some situations.
268 const TargetRegisterClass *MinRC = nullptr;
269 unsigned MinNumBits = std::numeric_limits<unsigned>::max();
270 for (unsigned ClassID : ClassMask.set_bits()) {
271 auto *RC = TRI->getRegClass(RCID: ClassID);
272 unsigned NumBits = TRI->getRegSizeInBits(RC: *RC);
273 if (NumBits < MinNumBits && NumBits >= RegNumBits) {
274 MinNumBits = NumBits;
275 MinRC = RC;
276 }
277 if (MinNumBits == RegNumBits)
278 break;
279 }
280#ifndef NDEBUG
281 if (MinRC) {
282 assert(MinRC->isAllocatable() && TRI->isRegClassAligned(MinRC, RCAlign));
283 for (auto [SubReg, SRI] : SubRegs)
284 // Check that all registers in MinRC support SRI.SubReg subregister.
285 assert(MinRC == TRI->getSubClassWithSubReg(MinRC, SRI.SubReg));
286 }
287#endif
288 // There might be zero RShift - in this case we just trying to find smaller
289 // register.
290 return (MinRC != RC || RShift != 0) ? MinRC : nullptr;
291}
292
293const TargetRegisterClass *
294GCNRewritePartialRegUses::getMinSizeReg(const TargetRegisterClass *RC,
295 SubRegMap &SubRegs) const {
296 unsigned CoverSubreg = AMDGPU::NoSubRegister;
297 unsigned Offset = std::numeric_limits<unsigned>::max();
298 unsigned End = 0;
299 for (auto [SubReg, SRI] : SubRegs) {
300 unsigned SubRegOffset = TRI->getSubRegIdxOffset(Idx: SubReg);
301 unsigned SubRegEnd = SubRegOffset + TRI->getSubRegIdxSize(Idx: SubReg);
302 if (SubRegOffset < Offset) {
303 Offset = SubRegOffset;
304 CoverSubreg = AMDGPU::NoSubRegister;
305 }
306 if (SubRegEnd > End) {
307 End = SubRegEnd;
308 CoverSubreg = AMDGPU::NoSubRegister;
309 }
310 if (SubRegOffset == Offset && SubRegEnd == End)
311 CoverSubreg = SubReg;
312 }
313 // If covering subreg is found shift everything so the covering subreg would
314 // be in the rightmost position.
315 if (CoverSubreg != AMDGPU::NoSubRegister)
316 return getRegClassWithShiftedSubregs(RC, RShift: Offset, RegNumBits: End - Offset, CoverSubregIdx: CoverSubreg,
317 SubRegs);
318
319 // Otherwise find subreg with maximum required alignment and shift it and all
320 // other subregs to the rightmost possible position with respect to the
321 // alignment.
322 unsigned MaxAlign = 0;
323 for (auto [SubReg, SRI] : SubRegs)
324 MaxAlign = std::max(a: MaxAlign, b: TRI->getSubRegAlignmentNumBits(RC, SubReg));
325
326 unsigned FirstMaxAlignedSubRegOffset = std::numeric_limits<unsigned>::max();
327 for (auto [SubReg, SRI] : SubRegs) {
328 if (TRI->getSubRegAlignmentNumBits(RC, SubReg) != MaxAlign)
329 continue;
330 FirstMaxAlignedSubRegOffset =
331 std::min(a: FirstMaxAlignedSubRegOffset, b: TRI->getSubRegIdxOffset(Idx: SubReg));
332 if (FirstMaxAlignedSubRegOffset == Offset)
333 break;
334 }
335
336 unsigned NewOffsetOfMaxAlignedSubReg =
337 alignTo(Value: FirstMaxAlignedSubRegOffset - Offset, Align: MaxAlign);
338
339 if (NewOffsetOfMaxAlignedSubReg > FirstMaxAlignedSubRegOffset)
340 llvm_unreachable("misaligned subreg");
341
342 unsigned RShift = FirstMaxAlignedSubRegOffset - NewOffsetOfMaxAlignedSubReg;
343 return getRegClassWithShiftedSubregs(RC, RShift, RegNumBits: End - RShift, CoverSubregIdx: 0, SubRegs);
344}
345
346// Only the subrange's lanemasks of the original interval need to be modified.
347// Subrange for a covering subreg becomes the main range.
348void GCNRewritePartialRegUses::updateLiveIntervals(Register OldReg,
349 Register NewReg,
350 SubRegMap &SubRegs) const {
351 if (!LIS->hasInterval(Reg: OldReg))
352 return;
353
354 auto &OldLI = LIS->getInterval(Reg: OldReg);
355 auto &NewLI = LIS->createEmptyInterval(Reg: NewReg);
356
357 auto &Allocator = LIS->getVNInfoAllocator();
358 NewLI.setWeight(OldLI.weight());
359
360 for (auto &SR : OldLI.subranges()) {
361 auto I = find_if(Range&: SubRegs, P: [&](auto &P) {
362 return SR.LaneMask == TRI->getSubRegIndexLaneMask(SubIdx: P.first);
363 });
364
365 if (I == SubRegs.end()) {
366 // There might be a situation when subranges don't exactly match used
367 // subregs, for example:
368 // %120 [160r,1392r:0) 0@160r
369 // L000000000000C000 [160r,1392r:0) 0@160r
370 // L0000000000003000 [160r,1392r:0) 0@160r
371 // L0000000000000C00 [160r,1392r:0) 0@160r
372 // L0000000000000300 [160r,1392r:0) 0@160r
373 // L0000000000000003 [160r,1104r:0) 0@160r
374 // L000000000000000C [160r,1104r:0) 0@160r
375 // L0000000000000030 [160r,1104r:0) 0@160r
376 // L00000000000000C0 [160r,1104r:0) 0@160r
377 // but used subregs are:
378 // sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, L000000000000FFFF
379 // sub0_sub1_sub2_sub3, L00000000000000FF
380 // sub4_sub5_sub6_sub7, L000000000000FF00
381 // In this example subregs sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7
382 // have several subranges with the same lifetime. For such cases just
383 // recreate the interval.
384 LIS->removeInterval(Reg: OldReg);
385 LIS->removeInterval(Reg: NewReg);
386 LIS->createAndComputeVirtRegInterval(Reg: NewReg);
387 return;
388 }
389
390 if (unsigned NewSubReg = I->second.SubReg)
391 NewLI.createSubRangeFrom(Allocator,
392 LaneMask: TRI->getSubRegIndexLaneMask(SubIdx: NewSubReg), CopyFrom: SR);
393 else // This is the covering subreg (0 index) - set it as main range.
394 NewLI.assign(Other: SR, Allocator);
395
396 SubRegs.erase(I);
397 }
398 if (NewLI.empty())
399 NewLI.assign(Other: OldLI, Allocator);
400 NewLI.verify(MRI);
401 LIS->removeInterval(Reg: OldReg);
402}
403
404const TargetRegisterClass *
405GCNRewritePartialRegUses::getOperandRegClass(MachineOperand &MO) const {
406 MachineInstr *MI = MO.getParent();
407 return TII->getRegClass(MCID: TII->get(Opcode: MI->getOpcode()), OpNum: MI->getOperandNo(I: &MO), TRI,
408 MF: *MI->getParent()->getParent());
409}
410
411bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const {
412 auto Range = MRI->reg_nodbg_operands(Reg);
413 if (Range.empty() || any_of(Range, P: [](MachineOperand &MO) {
414 return MO.getSubReg() == AMDGPU::NoSubRegister; // Whole reg used. [1]
415 }))
416 return false;
417
418 auto *RC = MRI->getRegClass(Reg);
419 LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI)
420 << ':' << TRI->getRegClassName(RC) << '\n');
421
422 // Collect used subregs and their reg classes infered from instruction
423 // operands.
424 SubRegMap SubRegs;
425 for (MachineOperand &MO : Range) {
426 const unsigned SubReg = MO.getSubReg();
427 assert(SubReg != AMDGPU::NoSubRegister); // Due to [1].
428 LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(SubReg) << ':');
429
430 const auto [I, Inserted] = SubRegs.try_emplace(Key: SubReg);
431 const TargetRegisterClass *&SubRegRC = I->second.RC;
432
433 if (Inserted)
434 SubRegRC = TRI->getSubRegisterClass(RC, SubReg);
435
436 if (SubRegRC) {
437 if (const TargetRegisterClass *OpDescRC = getOperandRegClass(MO)) {
438 LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << " & "
439 << TRI->getRegClassName(OpDescRC) << " = ");
440 SubRegRC = TRI->getCommonSubClass(A: SubRegRC, B: OpDescRC);
441 }
442 }
443
444 if (!SubRegRC) {
445 LLVM_DEBUG(dbgs() << "couldn't find target regclass\n");
446 return false;
447 }
448 LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << '\n');
449 }
450
451 auto *NewRC = getMinSizeReg(RC, SubRegs);
452 if (!NewRC) {
453 LLVM_DEBUG(dbgs() << " No improvement achieved\n");
454 return false;
455 }
456
457 Register NewReg = MRI->createVirtualRegister(RegClass: NewRC);
458 LLVM_DEBUG(dbgs() << " Success " << printReg(Reg, TRI) << ':'
459 << TRI->getRegClassName(RC) << " -> "
460 << printReg(NewReg, TRI) << ':'
461 << TRI->getRegClassName(NewRC) << '\n');
462
463 for (auto &MO : make_early_inc_range(Range: MRI->reg_operands(Reg))) {
464 MO.setReg(NewReg);
465 // Debug info can refer to the whole reg, just leave it as it is for now.
466 // TODO: create some DI shift expression?
467 if (MO.isDebug() && MO.getSubReg() == 0)
468 continue;
469 unsigned SubReg = SubRegs[MO.getSubReg()].SubReg;
470 MO.setSubReg(SubReg);
471 if (SubReg == AMDGPU::NoSubRegister && MO.isDef())
472 MO.setIsUndef(false);
473 }
474
475 if (LIS)
476 updateLiveIntervals(OldReg: Reg, NewReg, SubRegs);
477
478 return true;
479}
480
481bool GCNRewritePartialRegUses::runOnMachineFunction(MachineFunction &MF) {
482 MRI = &MF.getRegInfo();
483 TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo());
484 TII = MF.getSubtarget().getInstrInfo();
485 auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
486 LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
487 bool Changed = false;
488 for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
489 Changed |= rewriteReg(Reg: Register::index2VirtReg(Index: I));
490 }
491 return Changed;
492}
493
// Pass identification: the address of ID serves as the unique pass token.
char GCNRewritePartialRegUses::ID;

char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUses::ID;

// Register the pass with the legacy pass manager.
INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUses, DEBUG_TYPE,
                      "Rewrite Partial Register Uses", false, false)
INITIALIZE_PASS_END(GCNRewritePartialRegUses, DEBUG_TYPE,
                    "Rewrite Partial Register Uses", false, false)
502