1//===-- SIFormMemoryClauses.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass extends the live ranges of registers used as pointers in
10/// sequences of adjacent SMEM and VMEM instructions if XNACK is enabled. A
11/// load that would overwrite a pointer would require breaking the soft clause.
12/// Artificially extend the live ranges of the pointer operands by adding
13/// implicit-def early-clobber operands throughout the soft clause.
14///
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPU.h"
18#include "GCNRegPressure.h"
19#include "SIMachineFunctionInfo.h"
20#include "llvm/InitializePasses.h"
21
22using namespace llvm;
23
24#define DEBUG_TYPE "si-form-memory-clauses"
25
26// Clauses longer then 15 instructions would overflow one of the counters
27// and stall. They can stall even earlier if there are outstanding counters.
28static cl::opt<unsigned>
29MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(Val: 15),
30 cl::desc("Maximum length of a memory clause, instructions"));
31
32namespace {
33
34class SIFormMemoryClauses : public MachineFunctionPass {
35 using RegUse = DenseMap<unsigned, std::pair<unsigned, LaneBitmask>>;
36
37public:
38 static char ID;
39
40public:
41 SIFormMemoryClauses() : MachineFunctionPass(ID) {
42 initializeSIFormMemoryClausesPass(*PassRegistry::getPassRegistry());
43 }
44
45 bool runOnMachineFunction(MachineFunction &MF) override;
46
47 StringRef getPassName() const override {
48 return "SI Form memory clauses";
49 }
50
51 void getAnalysisUsage(AnalysisUsage &AU) const override {
52 AU.addRequired<LiveIntervalsWrapperPass>();
53 AU.setPreservesAll();
54 MachineFunctionPass::getAnalysisUsage(AU);
55 }
56
57 MachineFunctionProperties getClearedProperties() const override {
58 return MachineFunctionProperties().set(
59 MachineFunctionProperties::Property::IsSSA);
60 }
61
62private:
63 bool canBundle(const MachineInstr &MI, const RegUse &Defs,
64 const RegUse &Uses) const;
65 bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT);
66 void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const;
67 bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses,
68 GCNDownwardRPTracker &RPT);
69
70 const GCNSubtarget *ST;
71 const SIRegisterInfo *TRI;
72 const MachineRegisterInfo *MRI;
73 SIMachineFunctionInfo *MFI;
74
75 unsigned LastRecordedOccupancy;
76 unsigned MaxVGPRs;
77 unsigned MaxSGPRs;
78};
79
80} // End anonymous namespace.
81
82INITIALIZE_PASS_BEGIN(SIFormMemoryClauses, DEBUG_TYPE,
83 "SI Form memory clauses", false, false)
84INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
85INITIALIZE_PASS_END(SIFormMemoryClauses, DEBUG_TYPE,
86 "SI Form memory clauses", false, false)
87
88
89char SIFormMemoryClauses::ID = 0;
90
91char &llvm::SIFormMemoryClausesID = SIFormMemoryClauses::ID;
92
93FunctionPass *llvm::createSIFormMemoryClausesPass() {
94 return new SIFormMemoryClauses();
95}
96
97static bool isVMEMClauseInst(const MachineInstr &MI) {
98 return SIInstrInfo::isFLAT(MI) || SIInstrInfo::isVMEM(MI);
99}
100
101static bool isSMEMClauseInst(const MachineInstr &MI) {
102 return SIInstrInfo::isSMRD(MI);
103}
104
105// There no sense to create store clauses, they do not define anything,
106// thus there is nothing to set early-clobber.
107static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
108 assert(!MI.isDebugInstr() && "debug instructions should not reach here");
109 if (MI.isBundled())
110 return false;
111 if (!MI.mayLoad() || MI.mayStore())
112 return false;
113 if (SIInstrInfo::isAtomic(MI))
114 return false;
115 if (IsVMEMClause && !isVMEMClauseInst(MI))
116 return false;
117 if (!IsVMEMClause && !isSMEMClauseInst(MI))
118 return false;
119 // If this is a load instruction where the result has been coalesced with an operand, then we cannot clause it.
120 for (const MachineOperand &ResMO : MI.defs()) {
121 Register ResReg = ResMO.getReg();
122 for (const MachineOperand &MO : MI.all_uses()) {
123 if (MO.getReg() == ResReg)
124 return false;
125 }
126 break; // Only check the first def.
127 }
128 return true;
129}
130
131static unsigned getMopState(const MachineOperand &MO) {
132 unsigned S = 0;
133 if (MO.isImplicit())
134 S |= RegState::Implicit;
135 if (MO.isDead())
136 S |= RegState::Dead;
137 if (MO.isUndef())
138 S |= RegState::Undef;
139 if (MO.isKill())
140 S |= RegState::Kill;
141 if (MO.isEarlyClobber())
142 S |= RegState::EarlyClobber;
143 if (MO.getReg().isPhysical() && MO.isRenamable())
144 S |= RegState::Renamable;
145 return S;
146}
147
148// Returns false if there is a use of a def already in the map.
149// In this case we must break the clause.
150bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, const RegUse &Defs,
151 const RegUse &Uses) const {
152 // Check interference with defs.
153 for (const MachineOperand &MO : MI.operands()) {
154 // TODO: Prologue/Epilogue Insertion pass does not process bundled
155 // instructions.
156 if (MO.isFI())
157 return false;
158
159 if (!MO.isReg())
160 continue;
161
162 Register Reg = MO.getReg();
163
164 // If it is tied we will need to write same register as we read.
165 if (MO.isTied())
166 return false;
167
168 const RegUse &Map = MO.isDef() ? Uses : Defs;
169 auto Conflict = Map.find(Val: Reg);
170 if (Conflict == Map.end())
171 continue;
172
173 if (Reg.isPhysical())
174 return false;
175
176 LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubIdx: MO.getSubReg());
177 if ((Conflict->second.second & Mask).any())
178 return false;
179 }
180
181 return true;
182}
183
184// Since all defs in the clause are early clobber we can run out of registers.
185// Function returns false if pressure would hit the limit if instruction is
186// bundled into a memory clause.
187bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI,
188 GCNDownwardRPTracker &RPT) {
189 // NB: skip advanceBeforeNext() call. Since all defs will be marked
190 // early-clobber they will all stay alive at least to the end of the
191 // clause. Therefor we should not decrease pressure even if load
192 // pointer becomes dead and could otherwise be reused for destination.
193 RPT.advanceToNext();
194 GCNRegPressure MaxPressure = RPT.moveMaxPressure();
195 unsigned Occupancy = MaxPressure.getOccupancy(ST: *ST);
196
197 // Don't push over half the register budget. We don't want to introduce
198 // spilling just to form a soft clause.
199 //
200 // FIXME: This pressure check is fundamentally broken. First, this is checking
201 // the global pressure, not the pressure at this specific point in the
202 // program. Second, it's not accounting for the increased liveness of the use
203 // operands due to the early clobber we will introduce. Third, the pressure
204 // tracking does not account for the alignment requirements for SGPRs, or the
205 // fragmentation of registers the allocator will need to satisfy.
206 if (Occupancy >= MFI->getMinAllowedOccupancy() &&
207 MaxPressure.getVGPRNum(UnifiedVGPRFile: ST->hasGFX90AInsts()) <= MaxVGPRs / 2 &&
208 MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
209 LastRecordedOccupancy = Occupancy;
210 return true;
211 }
212 return false;
213}
214
215// Collect register defs and uses along with their lane masks and states.
216void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
217 RegUse &Defs, RegUse &Uses) const {
218 for (const MachineOperand &MO : MI.operands()) {
219 if (!MO.isReg())
220 continue;
221 Register Reg = MO.getReg();
222 if (!Reg)
223 continue;
224
225 LaneBitmask Mask = Reg.isVirtual()
226 ? TRI->getSubRegIndexLaneMask(SubIdx: MO.getSubReg())
227 : LaneBitmask::getAll();
228 RegUse &Map = MO.isDef() ? Defs : Uses;
229
230 auto Loc = Map.find(Val: Reg);
231 unsigned State = getMopState(MO);
232 if (Loc == Map.end()) {
233 Map[Reg] = std::pair(State, Mask);
234 } else {
235 Loc->second.first |= State;
236 Loc->second.second |= Mask;
237 }
238 }
239}
240
241// Check register def/use conflicts, occupancy limits and collect def/use maps.
242// Return true if instruction can be bundled with previous. If it cannot
243// def/use maps are not updated.
244bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI,
245 RegUse &Defs, RegUse &Uses,
246 GCNDownwardRPTracker &RPT) {
247 if (!canBundle(MI, Defs, Uses))
248 return false;
249
250 if (!checkPressure(MI, RPT))
251 return false;
252
253 collectRegUses(MI, Defs, Uses);
254 return true;
255}
256
257bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
258 if (skipFunction(F: MF.getFunction()))
259 return false;
260
261 ST = &MF.getSubtarget<GCNSubtarget>();
262 if (!ST->isXNACKEnabled())
263 return false;
264
265 const SIInstrInfo *TII = ST->getInstrInfo();
266 TRI = ST->getRegisterInfo();
267 MRI = &MF.getRegInfo();
268 MFI = MF.getInfo<SIMachineFunctionInfo>();
269 LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
270 SlotIndexes *Ind = LIS->getSlotIndexes();
271 bool Changed = false;
272
273 MaxVGPRs = TRI->getAllocatableSet(MF, RC: &AMDGPU::VGPR_32RegClass).count();
274 MaxSGPRs = TRI->getAllocatableSet(MF, RC: &AMDGPU::SGPR_32RegClass).count();
275 unsigned FuncMaxClause = MF.getFunction().getFnAttributeAsParsedInteger(
276 Kind: "amdgpu-max-memory-clause", Default: MaxClause);
277
278 for (MachineBasicBlock &MBB : MF) {
279 GCNDownwardRPTracker RPT(*LIS);
280 MachineBasicBlock::instr_iterator Next;
281 for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; I = Next) {
282 MachineInstr &MI = *I;
283 Next = std::next(x: I);
284
285 if (MI.isMetaInstruction())
286 continue;
287
288 bool IsVMEM = isVMEMClauseInst(MI);
289
290 if (!isValidClauseInst(MI, IsVMEMClause: IsVMEM))
291 continue;
292
293 if (!RPT.getNext().isValid())
294 RPT.reset(MI);
295 else { // Advance the state to the current MI.
296 RPT.advance(End: MachineBasicBlock::const_iterator(MI));
297 RPT.advanceBeforeNext();
298 }
299
300 const GCNRPTracker::LiveRegSet LiveRegsCopy(RPT.getLiveRegs());
301 RegUse Defs, Uses;
302 if (!processRegUses(MI, Defs, Uses, RPT)) {
303 RPT.reset(MI, LiveRegs: &LiveRegsCopy);
304 continue;
305 }
306
307 MachineBasicBlock::iterator LastClauseInst = Next;
308 unsigned Length = 1;
309 for ( ; Next != E && Length < FuncMaxClause; ++Next) {
310 // Debug instructions should not change the kill insertion.
311 if (Next->isMetaInstruction())
312 continue;
313
314 if (!isValidClauseInst(MI: *Next, IsVMEMClause: IsVMEM))
315 break;
316
317 // A load from pointer which was loaded inside the same bundle is an
318 // impossible clause because we will need to write and read the same
319 // register inside. In this case processRegUses will return false.
320 if (!processRegUses(MI: *Next, Defs, Uses, RPT))
321 break;
322
323 LastClauseInst = Next;
324 ++Length;
325 }
326 if (Length < 2) {
327 RPT.reset(MI, LiveRegs: &LiveRegsCopy);
328 continue;
329 }
330
331 Changed = true;
332 MFI->limitOccupancy(Limit: LastRecordedOccupancy);
333
334 assert(!LastClauseInst->isMetaInstruction());
335
336 SlotIndex ClauseLiveInIdx = LIS->getInstructionIndex(Instr: MI);
337 SlotIndex ClauseLiveOutIdx =
338 LIS->getInstructionIndex(Instr: *LastClauseInst).getNextIndex();
339
340 // Track the last inserted kill.
341 MachineInstrBuilder Kill;
342
343 // Insert one kill per register, with operands covering all necessary
344 // subregisters.
345 for (auto &&R : Uses) {
346 Register Reg = R.first;
347 if (Reg.isPhysical())
348 continue;
349
350 // Collect the register operands we should extend the live ranges of.
351 SmallVector<std::tuple<unsigned, unsigned>> KillOps;
352 const LiveInterval &LI = LIS->getInterval(Reg: R.first);
353
354 if (!LI.hasSubRanges()) {
355 if (!LI.liveAt(index: ClauseLiveOutIdx)) {
356 KillOps.emplace_back(Args: R.second.first | RegState::Kill,
357 Args: AMDGPU::NoSubRegister);
358 }
359 } else {
360 LaneBitmask KilledMask;
361 for (const LiveInterval::SubRange &SR : LI.subranges()) {
362 if (SR.liveAt(index: ClauseLiveInIdx) && !SR.liveAt(index: ClauseLiveOutIdx))
363 KilledMask |= SR.LaneMask;
364 }
365
366 if (KilledMask.none())
367 continue;
368
369 SmallVector<unsigned> KilledIndexes;
370 bool Success = TRI->getCoveringSubRegIndexes(
371 MRI: *MRI, RC: MRI->getRegClass(Reg), LaneMask: KilledMask, Indexes&: KilledIndexes);
372 (void)Success;
373 assert(Success && "Failed to find subregister mask to cover lanes");
374 for (unsigned SubReg : KilledIndexes) {
375 KillOps.emplace_back(Args: R.second.first | RegState::Kill, Args&: SubReg);
376 }
377 }
378
379 if (KillOps.empty())
380 continue;
381
382 // We only want to extend the live ranges of used registers. If they
383 // already have existing uses beyond the bundle, we don't need the kill.
384 //
385 // It's possible all of the use registers were already live past the
386 // bundle.
387 Kill = BuildMI(BB&: *MI.getParent(), I: std::next(x: LastClauseInst),
388 MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::KILL));
389 for (auto &Op : KillOps)
390 Kill.addUse(RegNo: Reg, Flags: std::get<0>(t&: Op), SubReg: std::get<1>(t&: Op));
391 Ind->insertMachineInstrInMaps(MI&: *Kill);
392 }
393
394 // Restore the state after processing the end of the bundle.
395 RPT.reset(MI, LiveRegs: &LiveRegsCopy);
396
397 if (!Kill)
398 continue;
399
400 for (auto &&R : Defs) {
401 Register Reg = R.first;
402 Uses.erase(Val: Reg);
403 if (Reg.isPhysical())
404 continue;
405 LIS->removeInterval(Reg);
406 LIS->createAndComputeVirtRegInterval(Reg);
407 }
408
409 for (auto &&R : Uses) {
410 Register Reg = R.first;
411 if (Reg.isPhysical())
412 continue;
413 LIS->removeInterval(Reg);
414 LIS->createAndComputeVirtRegInterval(Reg);
415 }
416 }
417 }
418
419 return Changed;
420}
421