//===-------------- GCNRewritePartialRegUses.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// RenameIndependentSubregs pass leaves large partially used super registers,
/// for example:
///    undef %0.sub4:VReg_1024 = ...
///    %0.sub5:VReg_1024 = ...
///    %0.sub6:VReg_1024 = ...
///    %0.sub7:VReg_1024 = ...
///    use %0.sub4_sub5_sub6_sub7
///    use %0.sub6_sub7
///
/// GCNRewritePartialRegUses goes right after RenameIndependentSubregs and
/// rewrites such partially used super registers with registers of minimal size:
///    undef %0.sub0:VReg_128 = ...
///    %0.sub1:VReg_128 = ...
///    %0.sub2:VReg_128 = ...
///    %0.sub3:VReg_128 = ...
///    use %0.sub0_sub1_sub2_sub3
///    use %0.sub2_sub3
///
/// This avoids the need to track subreg lanemasks during register pressure
/// calculation and creates more opportunities for code that is unaware of
/// lanemasks.
//===----------------------------------------------------------------------===//

#include "GCNRewritePartialRegUses.h"
#include "AMDGPU.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Pass.h"

using namespace llvm;

#define DEBUG_TYPE "rewrite-partial-reg-uses"

namespace {

class GCNRewritePartialRegUsesImpl {
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const TargetInstrInfo *TII;
  LiveIntervals *LIS;

  /// Rewrite partially used register Reg by shifting all its subregisters to
  /// the right and replacing the original register with a register of minimal
  /// size. Return true if the change has been made.
  bool rewriteReg(Register Reg) const;

  /// Map OldSubReg -> NewSubReg. Used as in/out container.
  using SubRegMap = SmallDenseMap<unsigned, unsigned>;

  /// Given register class RC and the set of used subregs as keys in the
  /// SubRegs map, return a new register class and the indexes of the
  /// right-shifted subregs as values in the SubRegs map, such that the
  /// resulting regclass contains registers of minimal size.
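  ///
  /// For example (cf. the file header, illustrative only): for a VReg_1024
  /// whose used subregs are {sub4, sub5, sub6, sub7, sub4_sub5_sub6_sub7,
  /// sub6_sub7}, this would return VReg_128 and rewrite the map to
  /// {sub4 -> sub0, sub5 -> sub1, sub6 -> sub2, sub7 -> sub3,
  /// sub4_sub5_sub6_sub7 -> NoSubRegister (whole reg),
  /// sub6_sub7 -> sub2_sub3}.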
  const TargetRegisterClass *getMinSizeReg(const TargetRegisterClass *RC,
                                           SubRegMap &SubRegs) const;

  /// Given regclass RC and pairs of [OldSubReg, NewSubReg] in SubRegs, try to
  /// find a new regclass such that:
  ///   1. It has subregs obtained by shifting each OldSubReg by RShift number
  ///      of bits to the right. Every "shifted" subreg should have the same
  ///      SubRegRC. If CoverSubregIdx is not zero it's a subreg that "covers"
  ///      all other subregs in pairs. Basically such a subreg becomes a whole
  ///      register.
  ///   2. The resulting register class contains registers of minimal size.
  ///
  /// SubRegs is a map of OldSubReg -> NewSubReg and is used as an in/out
  /// parameter:
  ///   OldSubReg - input parameter,
  ///   NewSubReg - output, contains shifted subregs on return.
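  ///
  /// E.g., with RShift = 128 the pair [sub4_sub5, NewSubReg] can only be
  /// satisfied by classes that have a sub0_sub1 subregister of the same class
  /// as sub4_sub5, since on AMDGPU sub4_sub5 starts at bit offset 128 and
  /// shifting it right by 128 bits yields sub0_sub1. (Illustrative sketch,
  /// not tied to a particular subtarget.)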
  const TargetRegisterClass *
  getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift,
                                unsigned CoverSubregIdx,
                                SubRegMap &SubRegs) const;

  /// Update live intervals after rewriting OldReg to NewReg with SubRegs map
  /// describing OldSubReg -> NewSubReg mapping.
  void updateLiveIntervals(Register OldReg, Register NewReg,
                           SubRegMap &SubRegs) const;

  /// Helper methods.

  /// Find the version of SubReg shifted right by RShift bits, if such an index
  /// exists; return 0 otherwise.
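  /// E.g., shiftSubReg(sub4, 128) would return sub0 on AMDGPU, since sub4
  /// starts at bit offset 128 and sub0 is the same-sized index at offset 0.
  /// (Illustrative; assumes the shifted index exists in the target
  /// description.)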
  unsigned shiftSubReg(unsigned SubReg, unsigned RShift) const;

  /// Find subreg index with a given Offset and Size, return 0 if there is no
  /// such subregister index. The result is cached in SubRegs data-member.
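  /// E.g., getSubReg(/*Offset=*/0, /*Size=*/64) would return sub0_sub1 on
  /// AMDGPU. (Illustrative; the exact indexes depend on the target's subreg
  /// index table.)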
  unsigned getSubReg(unsigned Offset, unsigned Size) const;

  /// Cache for getSubReg method: {Offset, Size} -> SubReg index.
  mutable SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> SubRegs;

  /// Return the bit mask that contains all register classes that are projected
  /// into RC by SubRegIdx. The result is cached in SuperRegMasks data-member.
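  /// E.g., for RC = VGPR_32 and SubRegIdx = sub1 the mask would contain every
  /// class (VReg_64, VReg_96, ...) whose sub1 subregister is a VGPR_32.
  /// (Illustrative; the actual set comes from SuperRegClassIterator.)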
  const uint32_t *getSuperRegClassMask(const TargetRegisterClass *RC,
                                       unsigned SubRegIdx) const;

  /// Cache for getSuperRegClassMask method: { RC, SubRegIdx } -> Class bitmask.
  mutable SmallDenseMap<std::pair<const TargetRegisterClass *, unsigned>,
                        const uint32_t *>
      SuperRegMasks;

  /// Return bitmask containing all allocatable register classes with registers
  /// aligned at AlignNumBits. The result is cached in
  /// AllocatableAndAlignedRegClassMasks data-member.
  const BitVector &
  getAllocatableAndAlignedRegClassMask(unsigned AlignNumBits) const;

  /// Cache for getAllocatableAndAlignedRegClassMask method:
  /// AlignNumBits -> Class bitmask.
  mutable SmallDenseMap<unsigned, BitVector> AllocatableAndAlignedRegClassMasks;

public:
  GCNRewritePartialRegUsesImpl(LiveIntervals *LS) : LIS(LS) {}
  bool run(MachineFunction &MF);
};

class GCNRewritePartialRegUsesLegacy : public MachineFunctionPass {
public:
  static char ID;
  GCNRewritePartialRegUsesLegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "Rewrite Partial Register Uses";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addPreserved<LiveIntervalsWrapperPass>();
    AU.addPreserved<SlotIndexesWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end anonymous namespace

// TODO: move this to the tablegen and use binary search by Offset.
unsigned GCNRewritePartialRegUsesImpl::getSubReg(unsigned Offset,
                                                 unsigned Size) const {
  const auto [I, Inserted] = SubRegs.try_emplace({Offset, Size}, 0);
  if (Inserted) {
    for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) {
      if (TRI->getSubRegIdxOffset(Idx) == Offset &&
          TRI->getSubRegIdxSize(Idx) == Size) {
        I->second = Idx;
        break;
      }
    }
  }
  return I->second;
}

unsigned GCNRewritePartialRegUsesImpl::shiftSubReg(unsigned SubReg,
                                                   unsigned RShift) const {
  unsigned Offset = TRI->getSubRegIdxOffset(SubReg) - RShift;
  return getSubReg(Offset, TRI->getSubRegIdxSize(SubReg));
}

const uint32_t *GCNRewritePartialRegUsesImpl::getSuperRegClassMask(
    const TargetRegisterClass *RC, unsigned SubRegIdx) const {
  const auto [I, Inserted] =
      SuperRegMasks.try_emplace({RC, SubRegIdx}, nullptr);
  if (Inserted) {
    for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) {
      if (RCI.getSubReg() == SubRegIdx) {
        I->second = RCI.getMask();
        break;
      }
    }
  }
  return I->second;
}

const BitVector &
GCNRewritePartialRegUsesImpl::getAllocatableAndAlignedRegClassMask(
    unsigned AlignNumBits) const {
  const auto [I, Inserted] =
      AllocatableAndAlignedRegClassMasks.try_emplace(AlignNumBits);
  if (Inserted) {
    BitVector &BV = I->second;
    BV.resize(TRI->getNumRegClasses());
    for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) {
      auto *RC = TRI->getRegClass(ClassID);
      if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits))
        BV.set(ClassID);
    }
  }
  return I->second;
}

const TargetRegisterClass *
GCNRewritePartialRegUsesImpl::getRegClassWithShiftedSubregs(
    const TargetRegisterClass *RC, unsigned RShift, unsigned CoverSubregIdx,
    SubRegMap &SubRegs) const {

  unsigned RCAlign = TRI->getRegClassAlignmentNumBits(RC);
  LLVM_DEBUG(dbgs() << "  Shift " << RShift << ", reg align " << RCAlign
                    << '\n');

  BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign));
  for (auto &[OldSubReg, NewSubReg] : SubRegs) {
    LLVM_DEBUG(dbgs() << "  " << TRI->getSubRegIndexName(OldSubReg) << ':');

    auto *SubRegRC = TRI->getSubRegisterClass(RC, OldSubReg);
    if (!SubRegRC) {
      LLVM_DEBUG(dbgs() << "couldn't find target regclass\n");
      return nullptr;
    }
    LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC)
                      << (SubRegRC->isAllocatable() ? "" : " not alloc")
                      << " -> ");

    if (OldSubReg == CoverSubregIdx) {
      // Covering subreg will become a full register, RC should be allocatable.
      assert(SubRegRC->isAllocatable());
      NewSubReg = AMDGPU::NoSubRegister;
      LLVM_DEBUG(dbgs() << "whole reg");
    } else {
      NewSubReg = shiftSubReg(OldSubReg, RShift);
      if (!NewSubReg) {
        LLVM_DEBUG(dbgs() << "none\n");
        return nullptr;
      }
      LLVM_DEBUG(dbgs() << TRI->getSubRegIndexName(NewSubReg));
    }

    const uint32_t *Mask = NewSubReg ? getSuperRegClassMask(SubRegRC, NewSubReg)
                                     : SubRegRC->getSubClassMask();
    if (!Mask)
      llvm_unreachable("no register class mask?");

    ClassMask.clearBitsNotInMask(Mask);
    // Don't try to early exit because checking if ClassMask has set bits isn't
    // that cheap and we expect it to pass in most cases.
    LLVM_DEBUG(dbgs() << ", num regclasses " << ClassMask.count() << '\n');
  }

  // ClassMask is the set of all register classes such that each class is
  // allocatable, aligned, has all shifted subregs and each subreg has required
  // register class (see SubRegRC above). Now select the first (that is, the
  // largest) register class with registers of minimal size.
  const TargetRegisterClass *MinRC = nullptr;
  unsigned MinNumBits = std::numeric_limits<unsigned>::max();
  for (unsigned ClassID : ClassMask.set_bits()) {
    auto *RC = TRI->getRegClass(ClassID);
    unsigned NumBits = TRI->getRegSizeInBits(*RC);
    if (NumBits < MinNumBits) {
      MinNumBits = NumBits;
      MinRC = RC;
    }
  }
#ifndef NDEBUG
  if (MinRC) {
    assert(MinRC->isAllocatable() && TRI->isRegClassAligned(MinRC, RCAlign));
    for (auto [OldSubReg, NewSubReg] : SubRegs)
      // Check that all registers in MinRC support NewSubReg subregister.
      assert(MinRC == TRI->getSubClassWithSubReg(MinRC, NewSubReg));
  }
#endif
  // RShift may be zero: in that case we are just trying to find a smaller
  // register.
  return (MinRC != RC || RShift != 0) ? MinRC : nullptr;
}

const TargetRegisterClass *
GCNRewritePartialRegUsesImpl::getMinSizeReg(const TargetRegisterClass *RC,
                                            SubRegMap &SubRegs) const {
  unsigned CoverSubreg = AMDGPU::NoSubRegister;
  unsigned Offset = std::numeric_limits<unsigned>::max();
  unsigned End = 0;
  for (auto [SubReg, SRI] : SubRegs) {
    unsigned SubRegOffset = TRI->getSubRegIdxOffset(SubReg);
    unsigned SubRegEnd = SubRegOffset + TRI->getSubRegIdxSize(SubReg);
    if (SubRegOffset < Offset) {
      Offset = SubRegOffset;
      CoverSubreg = AMDGPU::NoSubRegister;
    }
    if (SubRegEnd > End) {
      End = SubRegEnd;
      CoverSubreg = AMDGPU::NoSubRegister;
    }
    if (SubRegOffset == Offset && SubRegEnd == End)
      CoverSubreg = SubReg;
  }
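  // E.g., for the used subregs {sub4, sub5, sub6, sub7, sub4_sub5_sub6_sub7,
  // sub6_sub7} from the file header, Offset = 128, End = 256 and
  // CoverSubreg = sub4_sub5_sub6_sub7, since it starts at Offset and ends at
  // End. (Illustrative; bit offsets assume 32-bit subregs.)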
  // If a covering subreg is found, shift everything so that the covering
  // subreg ends up in the rightmost position, i.e. at offset zero.
  if (CoverSubreg != AMDGPU::NoSubRegister)
    return getRegClassWithShiftedSubregs(RC, Offset, CoverSubreg, SubRegs);

  // Otherwise find the subreg with the maximum required alignment and shift it
  // and all other subregs to the rightmost possible position with respect to
  // that alignment.
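  // E.g., suppose the used subregs are sub5 (offset 160, 32-bit aligned) and
  // sub6_sub7 (offset 192, 64-bit aligned) on a subtarget requiring 64-bit
  // tuple alignment. Then MaxAlign = 64, FirstMaxAlignedSubRegOffset = 192,
  // NewOffsetOfMaxAlignedSubReg = alignTo(192 - 160, 64) = 64, and
  // RShift = 128: sub5 -> sub1 and sub6_sub7 -> sub2_sub3, keeping the 64-bit
  // tuple aligned. (Hypothetical illustration; actual alignment requirements
  // are subtarget-dependent.)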
  unsigned MaxAlign = 0;
  for (auto [SubReg, SRI] : SubRegs)
    MaxAlign = std::max(MaxAlign, TRI->getSubRegAlignmentNumBits(RC, SubReg));

  unsigned FirstMaxAlignedSubRegOffset = std::numeric_limits<unsigned>::max();
  for (auto [SubReg, SRI] : SubRegs) {
    if (TRI->getSubRegAlignmentNumBits(RC, SubReg) != MaxAlign)
      continue;
    FirstMaxAlignedSubRegOffset =
        std::min(FirstMaxAlignedSubRegOffset, TRI->getSubRegIdxOffset(SubReg));
    if (FirstMaxAlignedSubRegOffset == Offset)
      break;
  }

  unsigned NewOffsetOfMaxAlignedSubReg =
      alignTo(FirstMaxAlignedSubRegOffset - Offset, MaxAlign);

  if (NewOffsetOfMaxAlignedSubReg > FirstMaxAlignedSubRegOffset)
    llvm_unreachable("misaligned subreg");

  unsigned RShift = FirstMaxAlignedSubRegOffset - NewOffsetOfMaxAlignedSubReg;
  return getRegClassWithShiftedSubregs(RC, RShift, 0, SubRegs);
}

// Only the subrange's lanemasks of the original interval need to be modified.
// Subrange for a covering subreg becomes the main range.
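// E.g., after rewriting the example from the file header, the subrange that
// corresponds to the covering subreg sub4_sub5_sub6_sub7 becomes the main
// range of the new VReg_128 interval, while the sub6_sub7 subrange is carried
// over with the sub2_sub3 lanemask. (Illustrative, based on the header
// example.)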
void GCNRewritePartialRegUsesImpl::updateLiveIntervals(
    Register OldReg, Register NewReg, SubRegMap &SubRegs) const {
  if (!LIS->hasInterval(OldReg))
    return;

  auto &OldLI = LIS->getInterval(OldReg);
  auto &NewLI = LIS->createEmptyInterval(NewReg);

  auto &Allocator = LIS->getVNInfoAllocator();
  NewLI.setWeight(OldLI.weight());

  for (auto &SR : OldLI.subranges()) {
    auto I = find_if(SubRegs, [&](auto &P) {
      return SR.LaneMask == TRI->getSubRegIndexLaneMask(P.first);
    });

    if (I == SubRegs.end()) {
      // There might be a situation when subranges don't exactly match used
      // subregs, for example:
      // %120 [160r,1392r:0) 0@160r
      //   L000000000000C000 [160r,1392r:0) 0@160r
      //   L0000000000003000 [160r,1392r:0) 0@160r
      //   L0000000000000C00 [160r,1392r:0) 0@160r
      //   L0000000000000300 [160r,1392r:0) 0@160r
      //   L0000000000000003 [160r,1104r:0) 0@160r
      //   L000000000000000C [160r,1104r:0) 0@160r
      //   L0000000000000030 [160r,1104r:0) 0@160r
      //   L00000000000000C0 [160r,1104r:0) 0@160r
      // but used subregs are:
      //   sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, L000000000000FFFF
      //   sub0_sub1_sub2_sub3, L00000000000000FF
      //   sub4_sub5_sub6_sub7, L000000000000FF00
      // In this example subregs sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7
      // have several subranges with the same lifetime. For such cases just
      // recreate the interval.
      LIS->removeInterval(OldReg);
      LIS->removeInterval(NewReg);
      LIS->createAndComputeVirtRegInterval(NewReg);
      return;
    }

    if (unsigned NewSubReg = I->second)
      NewLI.createSubRangeFrom(Allocator,
                               TRI->getSubRegIndexLaneMask(NewSubReg), SR);
    else // This is the covering subreg (0 index) - set it as main range.
      NewLI.assign(SR, Allocator);

    SubRegs.erase(I);
  }
  if (NewLI.empty())
    NewLI.assign(OldLI, Allocator);
  assert(NewLI.verify(MRI));
  LIS->removeInterval(OldReg);
}

bool GCNRewritePartialRegUsesImpl::rewriteReg(Register Reg) const {

  // Collect used subregs.
  SubRegMap SubRegs;
  for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
    if (MO.getSubReg() == AMDGPU::NoSubRegister)
      return false; // Whole reg used.
    SubRegs.try_emplace(MO.getSubReg());
  }

  if (SubRegs.empty())
    return false;

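  // At this point SubRegs holds every used subreg index mapped to a
  // placeholder value; getMinSizeReg fills in the shifted indexes below. E.g.,
  // for the file header example it would contain {sub4, sub5, sub6, sub7,
  // sub4_sub5_sub6_sub7, sub6_sub7} (illustrative).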
  auto *RC = MRI->getRegClass(Reg);
  LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI)
                    << ':' << TRI->getRegClassName(RC) << '\n');

  auto *NewRC = getMinSizeReg(RC, SubRegs);
  if (!NewRC) {
    LLVM_DEBUG(dbgs() << "  No improvement achieved\n");
    return false;
  }

  Register NewReg = MRI->createVirtualRegister(NewRC);
  LLVM_DEBUG(dbgs() << "  Success " << printReg(Reg, TRI) << ':'
                    << TRI->getRegClassName(RC) << " -> "
                    << printReg(NewReg, TRI) << ':'
                    << TRI->getRegClassName(NewRC) << '\n');

  for (auto &MO : make_early_inc_range(MRI->reg_operands(Reg))) {
    MO.setReg(NewReg);
    // Debug info can refer to the whole reg, just leave it as it is for now.
    // TODO: create some DI shift expression?
    if (MO.isDebug() && MO.getSubReg() == 0)
      continue;
    unsigned NewSubReg = SubRegs[MO.getSubReg()];
    MO.setSubReg(NewSubReg);
    if (NewSubReg == AMDGPU::NoSubRegister && MO.isDef())
      MO.setIsUndef(false);
  }

  if (LIS)
    updateLiveIntervals(Reg, NewReg, SubRegs);

  return true;
}

bool GCNRewritePartialRegUsesImpl::run(MachineFunction &MF) {
  MRI = &MF.getRegInfo();
  TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo());
  TII = MF.getSubtarget().getInstrInfo();
  bool Changed = false;
  for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
    Changed |= rewriteReg(Register::index2VirtReg(I));
  }
  return Changed;
}

bool GCNRewritePartialRegUsesLegacy::runOnMachineFunction(
    MachineFunction &MF) {
  LiveIntervalsWrapperPass *LISWrapper =
      getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
  LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
  GCNRewritePartialRegUsesImpl Impl(LIS);
  return Impl.run(MF);
}

PreservedAnalyses
GCNRewritePartialRegUsesPass::run(MachineFunction &MF,
                                  MachineFunctionAnalysisManager &MFAM) {
  auto *LIS = MFAM.getCachedResult<LiveIntervalsAnalysis>(MF);
  if (!GCNRewritePartialRegUsesImpl(LIS).run(MF))
    return PreservedAnalyses::all();

  auto PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  PA.preserve<LiveIntervalsAnalysis>();
  PA.preserve<SlotIndexesAnalysis>();
  return PA;
}

char GCNRewritePartialRegUsesLegacy::ID;

char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUsesLegacy::ID;

INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUsesLegacy, DEBUG_TYPE,
                      "Rewrite Partial Register Uses", false, false)
INITIALIZE_PASS_END(GCNRewritePartialRegUsesLegacy, DEBUG_TYPE,
                    "Rewrite Partial Register Uses", false, false)