1 | //===-- SIModeRegister.cpp - Mode Register --------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// This pass inserts changes to the Mode register settings as required. |
10 | /// Note that currently it only deals with the Double Precision Floating Point |
11 | /// rounding mode setting, but is intended to be generic enough to be easily |
12 | /// expanded. |
13 | /// |
14 | //===----------------------------------------------------------------------===// |
15 | // |
16 | #include "AMDGPU.h" |
17 | #include "GCNSubtarget.h" |
18 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
19 | #include "llvm/ADT/Statistic.h" |
20 | #include "llvm/CodeGen/MachineFunctionPass.h" |
21 | #include <queue> |
22 | |
23 | #define DEBUG_TYPE "si-mode-register" |
24 | |
25 | STATISTIC(NumSetregInserted, "Number of setreg of mode register inserted." ); |
26 | |
27 | using namespace llvm; |
28 | |
// Partial knowledge of the Mode register: Mask selects the bits whose value is
// known and Mode holds the value of those bits. All query and combine
// operations are const and return a new Status.
struct Status {
  // Mask is a bitmask where a '1' indicates the corresponding Mode bit has a
  // known value
  unsigned Mask = 0;
  // Value of the known bits. The two-argument constructor clears every bit
  // that is not covered by Mask.
  unsigned Mode = 0;

  Status() = default;

  Status(unsigned NewMask, unsigned NewMode)
      : Mask(NewMask), Mode(NewMode & NewMask) {}

  // merge two status values such that only values that don't conflict are
  // preserved; where both know a bit, the value from S wins
  Status merge(const Status &S) const {
    return Status((Mask | S.Mask), ((Mode & ~S.Mask) | (S.Mode & S.Mask)));
  }

  // merge an unknown value by using the unknown value's mask to remove bits
  // from the result
  Status mergeUnknown(unsigned newMask) const {
    return Status(Mask & ~newMask, Mode & ~newMask);
  }

  // intersect two Status values to produce a mode and mask that is a subset
  // of both values: a bit survives only if both know it and agree on it
  Status intersect(const Status &S) const {
    unsigned NewMask = (Mask & S.Mask) & (Mode ^ ~S.Mode);
    unsigned NewMode = (Mode & NewMask);
    return Status(NewMask, NewMode);
  }

  // produce the delta required to change the Mode to the required Mode: the
  // bits S needs that are either different here or not known here
  Status delta(const Status &S) const {
    return Status((S.Mask & (Mode ^ S.Mode)) | (~Mask & S.Mask), S.Mode);
  }

  bool operator==(const Status &S) const {
    return (Mask == S.Mask) && (Mode == S.Mode);
  }

  bool operator!=(const Status &S) const { return !(*this == S); }

  // true if every bit S knows is also known here with the same value
  bool isCompatible(const Status &S) const {
    return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode);
  }

  // true if S touches no bit known here, or is compatible with what is known
  bool isCombinable(const Status &S) const {
    return !(Mask & S.Mask) || isCompatible(S);
  }
};
78 | |
79 | class BlockData { |
80 | public: |
81 | // The Status that represents the mode register settings required by the |
82 | // FirstInsertionPoint (if any) in this block. Calculated in Phase 1. |
83 | Status Require; |
84 | |
85 | // The Status that represents the net changes to the Mode register made by |
86 | // this block, Calculated in Phase 1. |
87 | Status Change; |
88 | |
89 | // The Status that represents the mode register settings on exit from this |
90 | // block. Calculated in Phase 2. |
91 | Status Exit; |
92 | |
93 | // The Status that represents the intersection of exit Mode register settings |
94 | // from all predecessor blocks. Calculated in Phase 2, and used by Phase 3. |
95 | Status Pred; |
96 | |
97 | // In Phase 1 we record the first instruction that has a mode requirement, |
98 | // which is used in Phase 3 if we need to insert a mode change. |
99 | MachineInstr *FirstInsertionPoint = nullptr; |
100 | |
101 | // A flag to indicate whether an Exit value has been set (we can't tell by |
102 | // examining the Exit value itself as all values may be valid results). |
103 | bool ExitSet = false; |
104 | |
105 | BlockData() = default; |
106 | }; |
107 | |
108 | namespace { |
109 | |
110 | class SIModeRegister : public MachineFunctionPass { |
111 | public: |
112 | static char ID; |
113 | |
114 | std::vector<std::unique_ptr<BlockData>> BlockInfo; |
115 | std::queue<MachineBasicBlock *> Phase2List; |
116 | |
117 | // The default mode register setting currently only caters for the floating |
118 | // point double precision rounding mode. |
119 | // We currently assume the default rounding mode is Round to Nearest |
120 | // NOTE: this should come from a per function rounding mode setting once such |
121 | // a setting exists. |
122 | unsigned DefaultMode = FP_ROUND_ROUND_TO_NEAREST; |
123 | Status DefaultStatus = |
124 | Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode)); |
125 | |
126 | bool Changed = false; |
127 | |
128 | public: |
129 | SIModeRegister() : MachineFunctionPass(ID) {} |
130 | |
131 | bool runOnMachineFunction(MachineFunction &MF) override; |
132 | |
133 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
134 | AU.setPreservesCFG(); |
135 | MachineFunctionPass::getAnalysisUsage(AU); |
136 | } |
137 | |
138 | void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII); |
139 | |
140 | void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII); |
141 | |
142 | void processBlockPhase3(MachineBasicBlock &MBB, const SIInstrInfo *TII); |
143 | |
144 | Status getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII); |
145 | |
146 | void insertSetreg(MachineBasicBlock &MBB, MachineInstr *I, |
147 | const SIInstrInfo *TII, Status InstrMode); |
148 | }; |
149 | } // End anonymous namespace. |
150 | |
151 | INITIALIZE_PASS(SIModeRegister, DEBUG_TYPE, |
152 | "Insert required mode register values" , false, false) |
153 | |
154 | char SIModeRegister::ID = 0; |
155 | |
156 | char &llvm::SIModeRegisterID = SIModeRegister::ID; |
157 | |
158 | FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); } |
159 | |
160 | // Determine the Mode register setting required for this instruction. |
161 | // Instructions which don't use the Mode register return a null Status. |
162 | // Note this currently only deals with instructions that use the floating point |
163 | // double precision setting. |
164 | Status SIModeRegister::getInstructionMode(MachineInstr &MI, |
165 | const SIInstrInfo *TII) { |
166 | if (TII->usesFPDPRounding(MI) || |
167 | MI.getOpcode() == AMDGPU::FPTRUNC_UPWARD_PSEUDO || |
168 | MI.getOpcode() == AMDGPU::FPTRUNC_DOWNWARD_PSEUDO) { |
169 | switch (MI.getOpcode()) { |
170 | case AMDGPU::V_INTERP_P1LL_F16: |
171 | case AMDGPU::V_INTERP_P1LV_F16: |
172 | case AMDGPU::V_INTERP_P2_F16: |
173 | // f16 interpolation instructions need double precision round to zero |
174 | return Status(FP_ROUND_MODE_DP(3), |
175 | FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO)); |
176 | case AMDGPU::FPTRUNC_UPWARD_PSEUDO: { |
177 | // Replacing the pseudo by a real instruction in place |
178 | if (TII->getSubtarget().hasTrue16BitInsts()) { |
179 | MachineBasicBlock &MBB = *MI.getParent(); |
180 | MachineInstrBuilder B(*MBB.getParent(), MI); |
181 | MI.setDesc(TII->get(Opcode: AMDGPU::V_CVT_F16_F32_t16_e64)); |
182 | MachineOperand Src0 = MI.getOperand(i: 1); |
183 | MI.removeOperand(OpNo: 1); |
184 | B.addImm(Val: 0); // src0_modifiers |
185 | B.add(MO: Src0); // re-add src0 operand |
186 | B.addImm(Val: 0); // clamp |
187 | B.addImm(Val: 0); // omod |
188 | } else |
189 | MI.setDesc(TII->get(Opcode: AMDGPU::V_CVT_F16_F32_e32)); |
190 | return Status(FP_ROUND_MODE_DP(3), |
191 | FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_INF)); |
192 | } |
193 | case AMDGPU::FPTRUNC_DOWNWARD_PSEUDO: { |
194 | // Replacing the pseudo by a real instruction in place |
195 | if (TII->getSubtarget().hasTrue16BitInsts()) { |
196 | MachineBasicBlock &MBB = *MI.getParent(); |
197 | MachineInstrBuilder B(*MBB.getParent(), MI); |
198 | MI.setDesc(TII->get(Opcode: AMDGPU::V_CVT_F16_F32_t16_e64)); |
199 | MachineOperand Src0 = MI.getOperand(i: 1); |
200 | MI.removeOperand(OpNo: 1); |
201 | B.addImm(Val: 0); // src0_modifiers |
202 | B.add(MO: Src0); // re-add src0 operand |
203 | B.addImm(Val: 0); // clamp |
204 | B.addImm(Val: 0); // omod |
205 | } else |
206 | MI.setDesc(TII->get(Opcode: AMDGPU::V_CVT_F16_F32_e32)); |
207 | return Status(FP_ROUND_MODE_DP(3), |
208 | FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEGINF)); |
209 | } |
210 | default: |
211 | return DefaultStatus; |
212 | } |
213 | } |
214 | return Status(); |
215 | } |
216 | |
217 | // Insert a setreg instruction to update the Mode register. |
218 | // It is possible (though unlikely) for an instruction to require a change to |
219 | // the value of disjoint parts of the Mode register when we don't know the |
220 | // value of the intervening bits. In that case we need to use more than one |
221 | // setreg instruction. |
222 | void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, |
223 | const SIInstrInfo *TII, Status InstrMode) { |
224 | while (InstrMode.Mask) { |
225 | unsigned Offset = llvm::countr_zero<unsigned>(Val: InstrMode.Mask); |
226 | unsigned Width = llvm::countr_one<unsigned>(Value: InstrMode.Mask >> Offset); |
227 | unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1); |
228 | using namespace AMDGPU::Hwreg; |
229 | BuildMI(BB&: MBB, I: MI, MIMD: nullptr, MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32)) |
230 | .addImm(Val: Value) |
231 | .addImm(Val: HwregEncoding::encode(Values: ID_MODE, Values: Offset, Values: Width)); |
232 | ++NumSetregInserted; |
233 | Changed = true; |
234 | InstrMode.Mask &= ~(((1 << Width) - 1) << Offset); |
235 | } |
236 | } |
237 | |
238 | // In Phase 1 we iterate through the instructions of the block and for each |
239 | // instruction we get its mode usage. If the instruction uses the Mode register |
240 | // we: |
241 | // - update the Change status, which tracks the changes to the Mode register |
242 | // made by this block |
243 | // - if this instruction's requirements are compatible with the current setting |
244 | // of the Mode register we merge the modes |
245 | // - if it isn't compatible and an InsertionPoint isn't set, then we set the |
246 | // InsertionPoint to the current instruction, and we remember the current |
247 | // mode |
248 | // - if it isn't compatible and InsertionPoint is set we insert a seteg before |
249 | // that instruction (unless this instruction forms part of the block's |
250 | // entry requirements in which case the insertion is deferred until Phase 3 |
251 | // when predecessor exit values are known), and move the insertion point to |
252 | // this instruction |
253 | // - if this is a setreg instruction we treat it as an incompatible instruction. |
254 | // This is sub-optimal but avoids some nasty corner cases, and is expected to |
255 | // occur very rarely. |
256 | // - on exit we have set the Require, Change, and initial Exit modes. |
257 | void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, |
258 | const SIInstrInfo *TII) { |
259 | auto NewInfo = std::make_unique<BlockData>(); |
260 | MachineInstr *InsertionPoint = nullptr; |
261 | // RequirePending is used to indicate whether we are collecting the initial |
262 | // requirements for the block, and need to defer the first InsertionPoint to |
263 | // Phase 3. It is set to false once we have set FirstInsertionPoint, or when |
264 | // we discover an explicit setreg that means this block doesn't have any |
265 | // initial requirements. |
266 | bool RequirePending = true; |
267 | Status IPChange; |
268 | for (MachineInstr &MI : MBB) { |
269 | Status InstrMode = getInstructionMode(MI, TII); |
270 | if (MI.getOpcode() == AMDGPU::S_SETREG_B32 || |
271 | MI.getOpcode() == AMDGPU::S_SETREG_B32_mode || |
272 | MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || |
273 | MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32_mode) { |
274 | // We preserve any explicit mode register setreg instruction we encounter, |
275 | // as we assume it has been inserted by a higher authority (this is |
276 | // likely to be a very rare occurrence). |
277 | unsigned Dst = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::simm16)->getImm(); |
278 | using namespace AMDGPU::Hwreg; |
279 | auto [Id, Offset, Width] = HwregEncoding::decode(Encoded: Dst); |
280 | if (Id != ID_MODE) |
281 | continue; |
282 | |
283 | unsigned Mask = maskTrailingOnes<unsigned>(N: Width) << Offset; |
284 | |
285 | // If an InsertionPoint is set we will insert a setreg there. |
286 | if (InsertionPoint) { |
287 | insertSetreg(MBB, MI: InsertionPoint, TII, InstrMode: IPChange.delta(S: NewInfo->Change)); |
288 | InsertionPoint = nullptr; |
289 | } |
290 | // If this is an immediate then we know the value being set, but if it is |
291 | // not an immediate then we treat the modified bits of the mode register |
292 | // as unknown. |
293 | if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || |
294 | MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32_mode) { |
295 | unsigned Val = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::imm)->getImm(); |
296 | unsigned Mode = (Val << Offset) & Mask; |
297 | Status Setreg = Status(Mask, Mode); |
298 | // If we haven't already set the initial requirements for the block we |
299 | // don't need to as the requirements start from this explicit setreg. |
300 | RequirePending = false; |
301 | NewInfo->Change = NewInfo->Change.merge(S: Setreg); |
302 | } else { |
303 | NewInfo->Change = NewInfo->Change.mergeUnknown(newMask: Mask); |
304 | } |
305 | } else if (!NewInfo->Change.isCompatible(S&: InstrMode)) { |
306 | // This instruction uses the Mode register and its requirements aren't |
307 | // compatible with the current mode. |
308 | if (InsertionPoint) { |
309 | // If the required mode change cannot be included in the current |
310 | // InsertionPoint changes, we need a setreg and start a new |
311 | // InsertionPoint. |
312 | if (!IPChange.delta(S: NewInfo->Change).isCombinable(S&: InstrMode)) { |
313 | if (RequirePending) { |
314 | // This is the first insertionPoint in the block so we will defer |
315 | // the insertion of the setreg to Phase 3 where we know whether or |
316 | // not it is actually needed. |
317 | NewInfo->FirstInsertionPoint = InsertionPoint; |
318 | NewInfo->Require = NewInfo->Change; |
319 | RequirePending = false; |
320 | } else { |
321 | insertSetreg(MBB, MI: InsertionPoint, TII, |
322 | InstrMode: IPChange.delta(S: NewInfo->Change)); |
323 | IPChange = NewInfo->Change; |
324 | } |
325 | // Set the new InsertionPoint |
326 | InsertionPoint = &MI; |
327 | } |
328 | NewInfo->Change = NewInfo->Change.merge(S: InstrMode); |
329 | } else { |
330 | // No InsertionPoint is currently set - this is either the first in |
331 | // the block or we have previously seen an explicit setreg. |
332 | InsertionPoint = &MI; |
333 | IPChange = NewInfo->Change; |
334 | NewInfo->Change = NewInfo->Change.merge(S: InstrMode); |
335 | } |
336 | } |
337 | } |
338 | if (RequirePending) { |
339 | // If we haven't yet set the initial requirements for the block we set them |
340 | // now. |
341 | NewInfo->FirstInsertionPoint = InsertionPoint; |
342 | NewInfo->Require = NewInfo->Change; |
343 | } else if (InsertionPoint) { |
344 | // We need to insert a setreg at the InsertionPoint |
345 | insertSetreg(MBB, MI: InsertionPoint, TII, InstrMode: IPChange.delta(S: NewInfo->Change)); |
346 | } |
347 | NewInfo->Exit = NewInfo->Change; |
348 | BlockInfo[MBB.getNumber()] = std::move(NewInfo); |
349 | } |
350 | |
351 | // In Phase 2 we revisit each block and calculate the common Mode register |
352 | // value provided by all predecessor blocks. If the Exit value for the block |
353 | // is changed, then we add the successor blocks to the worklist so that the |
354 | // exit value is propagated. |
355 | void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB, |
356 | const SIInstrInfo *TII) { |
357 | bool RevisitRequired = false; |
358 | bool ExitSet = false; |
359 | unsigned ThisBlock = MBB.getNumber(); |
360 | if (MBB.pred_empty()) { |
361 | // There are no predecessors, so use the default starting status. |
362 | BlockInfo[ThisBlock]->Pred = DefaultStatus; |
363 | ExitSet = true; |
364 | } else { |
365 | // Build a status that is common to all the predecessors by intersecting |
366 | // all the predecessor exit status values. |
367 | // Mask bits (which represent the Mode bits with a known value) can only be |
368 | // added by explicit SETREG instructions or the initial default value - |
369 | // the intersection process may remove Mask bits. |
370 | // If we find a predecessor that has not yet had an exit value determined |
371 | // (this can happen for example if a block is its own predecessor) we defer |
372 | // use of that value as the Mask will be all zero, and we will revisit this |
373 | // block again later (unless the only predecessor without an exit value is |
374 | // this block). |
375 | MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end(); |
376 | MachineBasicBlock &PB = *(*P); |
377 | unsigned PredBlock = PB.getNumber(); |
378 | if ((ThisBlock == PredBlock) && (std::next(x: P) == E)) { |
379 | BlockInfo[ThisBlock]->Pred = DefaultStatus; |
380 | ExitSet = true; |
381 | } else if (BlockInfo[PredBlock]->ExitSet) { |
382 | BlockInfo[ThisBlock]->Pred = BlockInfo[PredBlock]->Exit; |
383 | ExitSet = true; |
384 | } else if (PredBlock != ThisBlock) |
385 | RevisitRequired = true; |
386 | |
387 | for (P = std::next(x: P); P != E; P = std::next(x: P)) { |
388 | MachineBasicBlock *Pred = *P; |
389 | unsigned PredBlock = Pred->getNumber(); |
390 | if (BlockInfo[PredBlock]->ExitSet) { |
391 | if (BlockInfo[ThisBlock]->ExitSet) { |
392 | BlockInfo[ThisBlock]->Pred = |
393 | BlockInfo[ThisBlock]->Pred.intersect(S: BlockInfo[PredBlock]->Exit); |
394 | } else { |
395 | BlockInfo[ThisBlock]->Pred = BlockInfo[PredBlock]->Exit; |
396 | } |
397 | ExitSet = true; |
398 | } else if (PredBlock != ThisBlock) |
399 | RevisitRequired = true; |
400 | } |
401 | } |
402 | Status TmpStatus = |
403 | BlockInfo[ThisBlock]->Pred.merge(S: BlockInfo[ThisBlock]->Change); |
404 | if (BlockInfo[ThisBlock]->Exit != TmpStatus) { |
405 | BlockInfo[ThisBlock]->Exit = TmpStatus; |
406 | // Add the successors to the work list so we can propagate the changed exit |
407 | // status. |
408 | for (MachineBasicBlock *Succ : MBB.successors()) |
409 | Phase2List.push(x: Succ); |
410 | } |
411 | BlockInfo[ThisBlock]->ExitSet = ExitSet; |
412 | if (RevisitRequired) |
413 | Phase2List.push(x: &MBB); |
414 | } |
415 | |
416 | // In Phase 3 we revisit each block and if it has an insertion point defined we |
417 | // check whether the predecessor mode meets the block's entry requirements. If |
418 | // not we insert an appropriate setreg instruction to modify the Mode register. |
419 | void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB, |
420 | const SIInstrInfo *TII) { |
421 | unsigned ThisBlock = MBB.getNumber(); |
422 | if (!BlockInfo[ThisBlock]->Pred.isCompatible(S&: BlockInfo[ThisBlock]->Require)) { |
423 | Status Delta = |
424 | BlockInfo[ThisBlock]->Pred.delta(S: BlockInfo[ThisBlock]->Require); |
425 | if (BlockInfo[ThisBlock]->FirstInsertionPoint) |
426 | insertSetreg(MBB, MI: BlockInfo[ThisBlock]->FirstInsertionPoint, TII, InstrMode: Delta); |
427 | else |
428 | insertSetreg(MBB, MI: &MBB.instr_front(), TII, InstrMode: Delta); |
429 | } |
430 | } |
431 | |
432 | bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) { |
433 | // Constrained FP intrinsics are used to support non-default rounding modes. |
434 | // strictfp attribute is required to mark functions with strict FP semantics |
435 | // having constrained FP intrinsics. This pass fixes up operations that uses |
436 | // a non-default rounding mode for non-strictfp functions. But it should not |
437 | // assume or modify any default rounding modes in case of strictfp functions. |
438 | const Function &F = MF.getFunction(); |
439 | if (F.hasFnAttribute(Kind: llvm::Attribute::StrictFP)) |
440 | return Changed; |
441 | BlockInfo.resize(new_size: MF.getNumBlockIDs()); |
442 | const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
443 | const SIInstrInfo *TII = ST.getInstrInfo(); |
444 | |
445 | // Processing is performed in a number of phases |
446 | |
447 | // Phase 1 - determine the initial mode required by each block, and add setreg |
448 | // instructions for intra block requirements. |
449 | for (MachineBasicBlock &BB : MF) |
450 | processBlockPhase1(MBB&: BB, TII); |
451 | |
452 | // Phase 2 - determine the exit mode from each block. We add all blocks to the |
453 | // list here, but will also add any that need to be revisited during Phase 2 |
454 | // processing. |
455 | for (MachineBasicBlock &BB : MF) |
456 | Phase2List.push(x: &BB); |
457 | while (!Phase2List.empty()) { |
458 | processBlockPhase2(MBB&: *Phase2List.front(), TII); |
459 | Phase2List.pop(); |
460 | } |
461 | |
462 | // Phase 3 - add an initial setreg to each block where the required entry mode |
463 | // is not satisfied by the exit mode of all its predecessors. |
464 | for (MachineBasicBlock &BB : MF) |
465 | processBlockPhase3(MBB&: BB, TII); |
466 | |
467 | BlockInfo.clear(); |
468 | |
469 | return Changed; |
470 | } |
471 | |