1 | //===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // For best-case performance on Cortex-A57, we should try to use a balanced |
9 | // mix of odd and even D-registers when performing a critical sequence of |
10 | // independent, non-quadword FP/ASIMD floating-point multiply or |
11 | // multiply-accumulate operations. |
12 | // |
13 | // This pass attempts to detect situations where the register allocation may |
14 | // adversely affect this load balancing and to change the registers used so as |
15 | // to better utilize the CPU. |
16 | // |
17 | // Ideally we'd just take each multiply or multiply-accumulate in turn and |
18 | // allocate it alternating even or odd registers. However, multiply-accumulates |
19 | // are most efficiently performed in the same functional unit as their |
20 | // accumulation operand. Therefore this pass tries to find maximal sequences |
21 | // ("Chains") of multiply-accumulates linked via their accumulation operand, |
22 | // and assign them all the same "color" (oddness/evenness). |
23 | // |
24 | // This optimization affects S-register and D-register floating point |
25 | // multiplies and FMADD/FMAs, as well as vector (floating point only) muls and |
26 | // FMADD/FMA. Q register instructions (and 128-bit vector instructions) are |
27 | // not affected. |
28 | //===----------------------------------------------------------------------===// |
29 | |
30 | #include "AArch64.h" |
31 | #include "AArch64InstrInfo.h" |
32 | #include "AArch64Subtarget.h" |
33 | #include "llvm/ADT/EquivalenceClasses.h" |
34 | #include "llvm/CodeGen/MachineFunction.h" |
35 | #include "llvm/CodeGen/MachineFunctionPass.h" |
36 | #include "llvm/CodeGen/MachineInstr.h" |
37 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
38 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
39 | #include "llvm/CodeGen/RegisterClassInfo.h" |
40 | #include "llvm/CodeGen/RegisterScavenging.h" |
41 | #include "llvm/Support/CommandLine.h" |
42 | #include "llvm/Support/Debug.h" |
43 | #include "llvm/Support/raw_ostream.h" |
44 | using namespace llvm; |
45 | |
46 | #define DEBUG_TYPE "aarch64-a57-fp-load-balancing" |
47 | |
48 | // Enforce the algorithm to use the scavenged register even when the original |
49 | // destination register is the correct color. Used for testing. |
50 | static cl::opt<bool> |
51 | TransformAll("aarch64-a57-fp-load-balancing-force-all" , |
52 | cl::desc("Always modify dest registers regardless of color" ), |
53 | cl::init(Val: false), cl::Hidden); |
54 | |
55 | // Never use the balance information obtained from chains - return a specific |
56 | // color always. Used for testing. |
57 | static cl::opt<unsigned> |
58 | OverrideBalance("aarch64-a57-fp-load-balancing-override" , |
59 | cl::desc("Ignore balance information, always return " |
60 | "(1: Even, 2: Odd)." ), |
61 | cl::init(Val: 0), cl::Hidden); |
62 | |
63 | //===----------------------------------------------------------------------===// |
64 | // Helper functions |
65 | |
66 | // Is the instruction a type of multiply on 64-bit (or 32-bit) FPRs? |
67 | static bool isMul(MachineInstr *MI) { |
68 | switch (MI->getOpcode()) { |
69 | case AArch64::FMULSrr: |
70 | case AArch64::FNMULSrr: |
71 | case AArch64::FMULDrr: |
72 | case AArch64::FNMULDrr: |
73 | return true; |
74 | default: |
75 | return false; |
76 | } |
77 | } |
78 | |
79 | // Is the instruction a type of FP multiply-accumulate on 64-bit (or 32-bit) FPRs? |
80 | static bool isMla(MachineInstr *MI) { |
81 | switch (MI->getOpcode()) { |
82 | case AArch64::FMSUBSrrr: |
83 | case AArch64::FMADDSrrr: |
84 | case AArch64::FNMSUBSrrr: |
85 | case AArch64::FNMADDSrrr: |
86 | case AArch64::FMSUBDrrr: |
87 | case AArch64::FMADDDrrr: |
88 | case AArch64::FNMSUBDrrr: |
89 | case AArch64::FNMADDDrrr: |
90 | return true; |
91 | default: |
92 | return false; |
93 | } |
94 | } |
95 | |
96 | //===----------------------------------------------------------------------===// |
97 | |
98 | namespace { |
99 | /// A "color", which is either even or odd. Yes, these aren't really colors |
100 | /// but the algorithm is conceptually doing two-color graph coloring. |
101 | enum class Color { Even, Odd }; |
102 | #ifndef NDEBUG |
103 | static const char *ColorNames[2] = { "Even" , "Odd" }; |
104 | #endif |
105 | |
106 | class Chain; |
107 | |
108 | class AArch64A57FPLoadBalancing : public MachineFunctionPass { |
109 | MachineRegisterInfo *MRI; |
110 | const TargetRegisterInfo *TRI; |
111 | RegisterClassInfo RCI; |
112 | |
113 | public: |
114 | static char ID; |
115 | explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {} |
116 | |
117 | bool runOnMachineFunction(MachineFunction &F) override; |
118 | |
119 | MachineFunctionProperties getRequiredProperties() const override { |
120 | return MachineFunctionProperties().setNoVRegs(); |
121 | } |
122 | |
123 | StringRef getPassName() const override { |
124 | return "A57 FP Anti-dependency breaker" ; |
125 | } |
126 | |
127 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
128 | AU.setPreservesCFG(); |
129 | MachineFunctionPass::getAnalysisUsage(AU); |
130 | } |
131 | |
132 | private: |
133 | bool runOnBasicBlock(MachineBasicBlock &MBB); |
134 | bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB, |
135 | int &Balance); |
136 | bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB); |
137 | int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB); |
138 | void scanInstruction(MachineInstr *MI, unsigned Idx, |
139 | std::map<unsigned, Chain*> &Active, |
140 | std::vector<std::unique_ptr<Chain>> &AllChains); |
141 | void maybeKillChain(MachineOperand &MO, unsigned Idx, |
142 | std::map<unsigned, Chain*> &RegChains); |
143 | Color getColor(unsigned Register); |
144 | Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L); |
145 | }; |
146 | } |
147 | |
148 | char AArch64A57FPLoadBalancing::ID = 0; |
149 | |
150 | INITIALIZE_PASS_BEGIN(AArch64A57FPLoadBalancing, DEBUG_TYPE, |
151 | "AArch64 A57 FP Load-Balancing" , false, false) |
152 | INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE, |
153 | "AArch64 A57 FP Load-Balancing" , false, false) |
154 | |
155 | namespace { |
156 | /// A Chain is a sequence of instructions that are linked together by |
157 | /// an accumulation operand. For example: |
158 | /// |
159 | /// fmul def d0, ? |
160 | /// fmla def d1, ?, ?, killed d0 |
161 | /// fmla def d2, ?, ?, killed d1 |
162 | /// |
163 | /// There may be other instructions interleaved in the sequence that |
164 | /// do not belong to the chain. These other instructions must not use |
165 | /// the "chain" register at any point. |
166 | /// |
167 | /// We currently only support chains where the "chain" operand is killed |
168 | /// at each link in the chain for simplicity. |
169 | /// A chain has three important instructions - Start, Last and Kill. |
170 | /// * The start instruction is the first instruction in the chain. |
171 | /// * Last is the final instruction in the chain. |
172 | /// * Kill may or may not be defined. If defined, Kill is the instruction |
173 | /// where the outgoing value of the Last instruction is killed. |
174 | /// This information is important as if we know the outgoing value is |
175 | /// killed with no intervening uses, we can safely change its register. |
176 | /// |
177 | /// Without a kill instruction, we must assume the outgoing value escapes |
178 | /// beyond our model and either must not change its register or must |
179 | /// create a fixup FMOV to keep the old register value consistent. |
180 | /// |
181 | class Chain { |
182 | public: |
183 | /// The important (marker) instructions. |
184 | MachineInstr *StartInst, *LastInst, *KillInst; |
185 | /// The index, from the start of the basic block, that each marker |
186 | /// appears. These are stored so we can do quick interval tests. |
187 | unsigned StartInstIdx, LastInstIdx, KillInstIdx; |
188 | /// All instructions in the chain. |
189 | std::set<MachineInstr*> Insts; |
190 | /// True if KillInst cannot be modified. If this is true, |
191 | /// we cannot change LastInst's outgoing register. |
192 | /// This will be true for tied values and regmasks. |
193 | bool KillIsImmutable; |
194 | /// The "color" of LastInst. This will be the preferred chain color, |
195 | /// as changing intermediate nodes is easy but changing the last |
196 | /// instruction can be more tricky. |
197 | Color LastColor; |
198 | |
199 | Chain(MachineInstr *MI, unsigned Idx, Color C) |
200 | : StartInst(MI), LastInst(MI), KillInst(nullptr), |
201 | StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0), |
202 | LastColor(C) { |
203 | Insts.insert(x: MI); |
204 | } |
205 | |
206 | /// Add a new instruction into the chain. The instruction's dest operand |
207 | /// has the given color. |
208 | void add(MachineInstr *MI, unsigned Idx, Color C) { |
209 | LastInst = MI; |
210 | LastInstIdx = Idx; |
211 | LastColor = C; |
212 | assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) && |
213 | "Chain: broken invariant. A Chain can only be killed after its last " |
214 | "def" ); |
215 | |
216 | Insts.insert(x: MI); |
217 | } |
218 | |
219 | /// Return true if MI is a member of the chain. |
220 | bool contains(MachineInstr &MI) { return Insts.count(x: &MI) > 0; } |
221 | |
222 | /// Return the number of instructions in the chain. |
223 | unsigned size() const { |
224 | return Insts.size(); |
225 | } |
226 | |
227 | /// Inform the chain that its last active register (the dest register of |
228 | /// LastInst) is killed by MI with no intervening uses or defs. |
229 | void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) { |
230 | KillInst = MI; |
231 | KillInstIdx = Idx; |
232 | KillIsImmutable = Immutable; |
233 | assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) && |
234 | "Chain: broken invariant. A Chain can only be killed after its last " |
235 | "def" ); |
236 | } |
237 | |
238 | /// Return the first instruction in the chain. |
239 | MachineInstr *getStart() const { return StartInst; } |
240 | /// Return the last instruction in the chain. |
241 | MachineInstr *getLast() const { return LastInst; } |
242 | /// Return the "kill" instruction (as set with setKill()) or NULL. |
243 | MachineInstr *getKill() const { return KillInst; } |
244 | /// Return an instruction that can be used as an iterator for the end |
245 | /// of the chain. This is the maximum of KillInst (if set) and LastInst. |
246 | MachineBasicBlock::iterator end() const { |
247 | return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst); |
248 | } |
249 | MachineBasicBlock::iterator begin() const { return getStart(); } |
250 | |
251 | /// Can the Kill instruction (assuming one exists) be modified? |
252 | bool isKillImmutable() const { return KillIsImmutable; } |
253 | |
254 | /// Return the preferred color of this chain. |
255 | Color getPreferredColor() { |
256 | if (OverrideBalance != 0) |
257 | return OverrideBalance == 1 ? Color::Even : Color::Odd; |
258 | return LastColor; |
259 | } |
260 | |
261 | /// Return true if this chain (StartInst..KillInst) overlaps with Other. |
262 | bool rangeOverlapsWith(const Chain &Other) const { |
263 | unsigned End = KillInst ? KillInstIdx : LastInstIdx; |
264 | unsigned OtherEnd = Other.KillInst ? |
265 | Other.KillInstIdx : Other.LastInstIdx; |
266 | |
267 | return StartInstIdx <= OtherEnd && Other.StartInstIdx <= End; |
268 | } |
269 | |
270 | /// Return true if this chain starts before Other. |
271 | bool startsBefore(const Chain *Other) const { |
272 | return StartInstIdx < Other->StartInstIdx; |
273 | } |
274 | |
275 | /// Return true if the group will require a fixup MOV at the end. |
276 | bool requiresFixup() const { |
277 | return (getKill() && isKillImmutable()) || !getKill(); |
278 | } |
279 | |
280 | /// Return a simple string representation of the chain. |
281 | std::string str() const { |
282 | std::string S; |
283 | raw_string_ostream OS(S); |
284 | |
285 | OS << "{" ; |
286 | StartInst->print(OS, /* SkipOpers= */IsStandalone: true); |
287 | OS << " -> " ; |
288 | LastInst->print(OS, /* SkipOpers= */IsStandalone: true); |
289 | if (KillInst) { |
290 | OS << " (kill @ " ; |
291 | KillInst->print(OS, /* SkipOpers= */IsStandalone: true); |
292 | OS << ")" ; |
293 | } |
294 | OS << "}" ; |
295 | |
296 | return OS.str(); |
297 | } |
298 | |
299 | }; |
300 | |
301 | } // end anonymous namespace |
302 | |
303 | //===----------------------------------------------------------------------===// |
304 | |
305 | bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) { |
306 | if (skipFunction(F: F.getFunction())) |
307 | return false; |
308 | |
309 | if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps()) |
310 | return false; |
311 | |
312 | bool Changed = false; |
313 | LLVM_DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n" ); |
314 | |
315 | MRI = &F.getRegInfo(); |
316 | TRI = F.getRegInfo().getTargetRegisterInfo(); |
317 | RCI.runOnMachineFunction(MF: F); |
318 | |
319 | for (auto &MBB : F) { |
320 | Changed |= runOnBasicBlock(MBB); |
321 | } |
322 | |
323 | return Changed; |
324 | } |
325 | |
326 | bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) { |
327 | bool Changed = false; |
328 | LLVM_DEBUG(dbgs() << "Running on MBB: " << MBB |
329 | << " - scanning instructions...\n" ); |
330 | |
331 | // First, scan the basic block producing a set of chains. |
332 | |
333 | // The currently "active" chains - chains that can be added to and haven't |
334 | // been killed yet. This is keyed by register - all chains can only have one |
335 | // "link" register between each inst in the chain. |
336 | std::map<unsigned, Chain*> ActiveChains; |
337 | std::vector<std::unique_ptr<Chain>> AllChains; |
338 | unsigned Idx = 0; |
339 | for (auto &MI : MBB) |
340 | scanInstruction(MI: &MI, Idx: Idx++, Active&: ActiveChains, AllChains); |
341 | |
342 | LLVM_DEBUG(dbgs() << "Scan complete, " << AllChains.size() |
343 | << " chains created.\n" ); |
344 | |
345 | // Group the chains into disjoint sets based on their liveness range. This is |
346 | // a poor-man's version of graph coloring. Ideally we'd create an interference |
347 | // graph and perform full-on graph coloring on that, but; |
348 | // (a) That's rather heavyweight for only two colors. |
349 | // (b) We expect multiple disjoint interference regions - in practice the live |
350 | // range of chains is quite small and they are clustered between loads |
351 | // and stores. |
352 | EquivalenceClasses<Chain*> EC; |
353 | for (auto &I : AllChains) |
354 | EC.insert(Data: I.get()); |
355 | |
356 | for (auto &I : AllChains) |
357 | for (auto &J : AllChains) |
358 | if (I != J && I->rangeOverlapsWith(Other: *J)) |
359 | EC.unionSets(V1: I.get(), V2: J.get()); |
360 | LLVM_DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n" ); |
361 | |
362 | // Now we assume that every member of an equivalence class interferes |
363 | // with every other member of that class, and with no members of other classes. |
364 | |
365 | // Convert the EquivalenceClasses to a simpler set of sets. |
366 | std::vector<std::vector<Chain*> > V; |
367 | for (const auto &E : EC) { |
368 | if (!E->isLeader()) |
369 | continue; |
370 | std::vector<Chain *> Cs(EC.member_begin(ECV: *E), EC.member_end()); |
371 | if (Cs.empty()) continue; |
372 | V.push_back(x: std::move(Cs)); |
373 | } |
374 | |
375 | // Now we have a set of sets, order them by start address so |
376 | // we can iterate over them sequentially. |
377 | llvm::sort(C&: V, |
378 | Comp: [](const std::vector<Chain *> &A, const std::vector<Chain *> &B) { |
379 | return A.front()->startsBefore(Other: B.front()); |
380 | }); |
381 | |
382 | // As we only have two colors, we can track the global (BB-level) balance of |
383 | // odds versus evens. We aim to keep this near zero to keep both execution |
384 | // units fed. |
385 | // Positive means we're even-heavy, negative we're odd-heavy. |
386 | // |
387 | // FIXME: If chains have interdependencies, for example: |
388 | // mul r0, r1, r2 |
389 | // mul r3, r0, r1 |
390 | // We do not model this and may color each one differently, assuming we'll |
391 | // get ILP when we obviously can't. This hasn't been seen to be a problem |
392 | // in practice so far, so we simplify the algorithm by ignoring it. |
393 | int Parity = 0; |
394 | |
395 | for (auto &I : V) |
396 | Changed |= colorChainSet(GV: std::move(I), MBB, Balance&: Parity); |
397 | |
398 | return Changed; |
399 | } |
400 | |
401 | Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor, |
402 | std::vector<Chain*> &L) { |
403 | if (L.empty()) |
404 | return nullptr; |
405 | |
406 | // We try and get the best candidate from L to color next, given that our |
407 | // preferred color is "PreferredColor". L is ordered from larger to smaller |
408 | // chains. It is beneficial to color the large chains before the small chains, |
409 | // but if we can't find a chain of the maximum length with the preferred color, |
410 | // we fuzz the size and look for slightly smaller chains before giving up and |
411 | // returning a chain that must be recolored. |
412 | |
413 | // FIXME: Does this need to be configurable? |
414 | const unsigned SizeFuzz = 1; |
415 | unsigned MinSize = L.front()->size() - SizeFuzz; |
416 | for (auto I = L.begin(), E = L.end(); I != E; ++I) { |
417 | if ((*I)->size() <= MinSize) { |
418 | // We've gone past the size limit. Return the previous item. |
419 | Chain *Ch = *--I; |
420 | L.erase(position: I); |
421 | return Ch; |
422 | } |
423 | |
424 | if ((*I)->getPreferredColor() == PreferredColor) { |
425 | Chain *Ch = *I; |
426 | L.erase(position: I); |
427 | return Ch; |
428 | } |
429 | } |
430 | |
431 | // Bailout case - just return the first item. |
432 | Chain *Ch = L.front(); |
433 | L.erase(position: L.begin()); |
434 | return Ch; |
435 | } |
436 | |
437 | bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV, |
438 | MachineBasicBlock &MBB, |
439 | int &Parity) { |
440 | bool Changed = false; |
441 | LLVM_DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n" ); |
442 | |
443 | // Sort by descending size order so that we allocate the most important |
444 | // sets first. |
445 | // Tie-break equivalent sizes by sorting chains requiring fixups before |
446 | // those without fixups. The logic here is that we should look at the |
447 | // chains that we cannot change before we look at those we can, |
448 | // so the parity counter is updated and we know what color we should |
449 | // change them to! |
450 | // Final tie-break with instruction order so pass output is stable (i.e. not |
451 | // dependent on malloc'd pointer values). |
452 | llvm::sort(C&: GV, Comp: [](const Chain *G1, const Chain *G2) { |
453 | if (G1->size() != G2->size()) |
454 | return G1->size() > G2->size(); |
455 | if (G1->requiresFixup() != G2->requiresFixup()) |
456 | return G1->requiresFixup() > G2->requiresFixup(); |
457 | // Make sure startsBefore() produces a stable final order. |
458 | assert((G1 == G2 || (G1->startsBefore(G2) ^ G2->startsBefore(G1))) && |
459 | "Starts before not total order!" ); |
460 | return G1->startsBefore(Other: G2); |
461 | }); |
462 | |
463 | Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd; |
464 | while (Chain *G = getAndEraseNext(PreferredColor, L&: GV)) { |
465 | // Start off by assuming we'll color to our own preferred color. |
466 | Color C = PreferredColor; |
467 | if (Parity == 0) |
468 | // But if we really don't care, use the chain's preferred color. |
469 | C = G->getPreferredColor(); |
470 | |
471 | LLVM_DEBUG(dbgs() << " - Parity=" << Parity |
472 | << ", Color=" << ColorNames[(int)C] << "\n" ); |
473 | |
474 | // If we'll need a fixup FMOV, don't bother. Testing has shown that this |
475 | // happens infrequently and when it does it has at least a 50% chance of |
476 | // slowing code down instead of speeding it up. |
477 | if (G->requiresFixup() && C != G->getPreferredColor()) { |
478 | C = G->getPreferredColor(); |
479 | LLVM_DEBUG(dbgs() << " - " << G->str() |
480 | << " - not worthwhile changing; " |
481 | "color remains " |
482 | << ColorNames[(int)C] << "\n" ); |
483 | } |
484 | |
485 | Changed |= colorChain(G, C, MBB); |
486 | |
487 | Parity += (C == Color::Even) ? G->size() : -G->size(); |
488 | PreferredColor = Parity < 0 ? Color::Even : Color::Odd; |
489 | } |
490 | |
491 | return Changed; |
492 | } |
493 | |
494 | int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, |
495 | MachineBasicBlock &MBB) { |
496 | // Can we find an appropriate register that is available throughout the life |
497 | // of the chain? Simulate liveness backwards until the end of the chain. |
498 | LiveRegUnits Units(*TRI); |
499 | Units.addLiveOuts(MBB); |
500 | MachineBasicBlock::iterator I = MBB.end(); |
501 | MachineBasicBlock::iterator ChainEnd = G->end(); |
502 | while (I != ChainEnd) { |
503 | --I; |
504 | Units.stepBackward(MI: *I); |
505 | } |
506 | |
507 | // Check which register units are alive throughout the chain. |
508 | MachineBasicBlock::iterator ChainBegin = G->begin(); |
509 | assert(ChainBegin != ChainEnd && "Chain should contain instructions" ); |
510 | do { |
511 | --I; |
512 | Units.accumulate(MI: *I); |
513 | } while (I != ChainBegin); |
514 | |
515 | // Make sure we allocate in-order, to get the cheapest registers first. |
516 | unsigned RegClassID = ChainBegin->getDesc().operands()[0].RegClass; |
517 | auto Ord = RCI.getOrder(RC: TRI->getRegClass(i: RegClassID)); |
518 | for (auto Reg : Ord) { |
519 | if (!Units.available(Reg)) |
520 | continue; |
521 | if (C == getColor(Register: Reg)) |
522 | return Reg; |
523 | } |
524 | |
525 | return -1; |
526 | } |
527 | |
528 | bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, |
529 | MachineBasicBlock &MBB) { |
530 | bool Changed = false; |
531 | LLVM_DEBUG(dbgs() << " - colorChain(" << G->str() << ", " |
532 | << ColorNames[(int)C] << ")\n" ); |
533 | |
534 | // Try and obtain a free register of the right class. Without a register |
535 | // to play with we cannot continue. |
536 | int Reg = scavengeRegister(G, C, MBB); |
537 | if (Reg == -1) { |
538 | LLVM_DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n" ); |
539 | return false; |
540 | } |
541 | LLVM_DEBUG(dbgs() << " - Scavenged register: " << printReg(Reg, TRI) << "\n" ); |
542 | |
543 | std::map<unsigned, unsigned> Substs; |
544 | for (MachineInstr &I : *G) { |
545 | if (!G->contains(MI&: I) && (&I != G->getKill() || G->isKillImmutable())) |
546 | continue; |
547 | |
548 | // I is a member of G, or I is a mutable instruction that kills G. |
549 | |
550 | std::vector<unsigned> ToErase; |
551 | for (auto &U : I.operands()) { |
552 | if (U.isReg() && U.isUse() && Substs.find(x: U.getReg()) != Substs.end()) { |
553 | Register OrigReg = U.getReg(); |
554 | U.setReg(Substs[OrigReg]); |
555 | if (U.isKill()) |
556 | // Don't erase straight away, because there may be other operands |
557 | // that also reference this substitution! |
558 | ToErase.push_back(x: OrigReg); |
559 | } else if (U.isRegMask()) { |
560 | for (auto J : Substs) { |
561 | if (U.clobbersPhysReg(PhysReg: J.first)) |
562 | ToErase.push_back(x: J.first); |
563 | } |
564 | } |
565 | } |
566 | // Now it's safe to remove the substs identified earlier. |
567 | for (auto J : ToErase) |
568 | Substs.erase(x: J); |
569 | |
570 | // Only change the def if this isn't the last instruction. |
571 | if (&I != G->getKill()) { |
572 | MachineOperand &MO = I.getOperand(i: 0); |
573 | |
574 | bool Change = TransformAll || getColor(Register: MO.getReg()) != C; |
575 | if (G->requiresFixup() && &I == G->getLast()) |
576 | Change = false; |
577 | |
578 | if (Change) { |
579 | Substs[MO.getReg()] = Reg; |
580 | MO.setReg(Reg); |
581 | |
582 | Changed = true; |
583 | } |
584 | } |
585 | } |
586 | assert(Substs.size() == 0 && "No substitutions should be left active!" ); |
587 | |
588 | if (G->getKill()) { |
589 | LLVM_DEBUG(dbgs() << " - Kill instruction seen.\n" ); |
590 | } else { |
591 | // We didn't have a kill instruction, but we didn't seem to need to change |
592 | // the destination register anyway. |
593 | LLVM_DEBUG(dbgs() << " - Destination register not changed.\n" ); |
594 | } |
595 | return Changed; |
596 | } |
597 | |
598 | void AArch64A57FPLoadBalancing::scanInstruction( |
599 | MachineInstr *MI, unsigned Idx, std::map<unsigned, Chain *> &ActiveChains, |
600 | std::vector<std::unique_ptr<Chain>> &AllChains) { |
601 | // Inspect "MI", updating ActiveChains and AllChains. |
602 | |
603 | if (isMul(MI)) { |
604 | |
605 | for (auto &I : MI->uses()) |
606 | maybeKillChain(MO&: I, Idx, RegChains&: ActiveChains); |
607 | for (auto &I : MI->defs()) |
608 | maybeKillChain(MO&: I, Idx, RegChains&: ActiveChains); |
609 | |
610 | // Create a new chain. Multiplies don't require forwarding so can go on any |
611 | // unit. |
612 | Register DestReg = MI->getOperand(i: 0).getReg(); |
613 | |
614 | LLVM_DEBUG(dbgs() << "New chain started for register " |
615 | << printReg(DestReg, TRI) << " at " << *MI); |
616 | |
617 | auto G = std::make_unique<Chain>(args&: MI, args&: Idx, args: getColor(Register: DestReg)); |
618 | ActiveChains[DestReg] = G.get(); |
619 | AllChains.push_back(x: std::move(G)); |
620 | |
621 | } else if (isMla(MI)) { |
622 | |
623 | // It is beneficial to keep MLAs on the same functional unit as their |
624 | // accumulator operand. |
625 | Register DestReg = MI->getOperand(i: 0).getReg(); |
626 | Register AccumReg = MI->getOperand(i: 3).getReg(); |
627 | |
628 | maybeKillChain(MO&: MI->getOperand(i: 1), Idx, RegChains&: ActiveChains); |
629 | maybeKillChain(MO&: MI->getOperand(i: 2), Idx, RegChains&: ActiveChains); |
630 | if (DestReg != AccumReg) |
631 | maybeKillChain(MO&: MI->getOperand(i: 0), Idx, RegChains&: ActiveChains); |
632 | |
633 | if (ActiveChains.find(x: AccumReg) != ActiveChains.end()) { |
634 | LLVM_DEBUG(dbgs() << "Chain found for accumulator register " |
635 | << printReg(AccumReg, TRI) << " in MI " << *MI); |
636 | |
637 | // For simplicity we only chain together sequences of MULs/MLAs where the |
638 | // accumulator register is killed on each instruction. This means we don't |
639 | // need to track other uses of the registers we want to rewrite. |
640 | // |
641 | // FIXME: We could extend to handle the non-kill cases for more coverage. |
642 | if (MI->getOperand(i: 3).isKill()) { |
643 | // Add to chain. |
644 | LLVM_DEBUG(dbgs() << "Instruction was successfully added to chain.\n" ); |
645 | ActiveChains[AccumReg]->add(MI, Idx, C: getColor(Register: DestReg)); |
646 | // Handle cases where the destination is not the same as the accumulator. |
647 | if (DestReg != AccumReg) { |
648 | ActiveChains[DestReg] = ActiveChains[AccumReg]; |
649 | ActiveChains.erase(x: AccumReg); |
650 | } |
651 | return; |
652 | } |
653 | |
654 | LLVM_DEBUG( |
655 | dbgs() << "Cannot add to chain because accumulator operand wasn't " |
656 | << "marked <kill>!\n" ); |
657 | maybeKillChain(MO&: MI->getOperand(i: 3), Idx, RegChains&: ActiveChains); |
658 | } |
659 | |
660 | LLVM_DEBUG(dbgs() << "Creating new chain for dest register " |
661 | << printReg(DestReg, TRI) << "\n" ); |
662 | auto G = std::make_unique<Chain>(args&: MI, args&: Idx, args: getColor(Register: DestReg)); |
663 | ActiveChains[DestReg] = G.get(); |
664 | AllChains.push_back(x: std::move(G)); |
665 | |
666 | } else { |
667 | |
668 | // Non-MUL or MLA instruction. Invalidate any chain in the uses or defs |
669 | // lists. |
670 | for (auto &I : MI->uses()) |
671 | maybeKillChain(MO&: I, Idx, RegChains&: ActiveChains); |
672 | for (auto &I : MI->defs()) |
673 | maybeKillChain(MO&: I, Idx, RegChains&: ActiveChains); |
674 | |
675 | } |
676 | } |
677 | |
678 | void AArch64A57FPLoadBalancing:: |
679 | maybeKillChain(MachineOperand &MO, unsigned Idx, |
680 | std::map<unsigned, Chain*> &ActiveChains) { |
681 | // Given an operand and the set of active chains (keyed by register), |
682 | // determine if a chain should be ended and remove from ActiveChains. |
683 | MachineInstr *MI = MO.getParent(); |
684 | |
685 | if (MO.isReg()) { |
686 | |
687 | // If this is a KILL of a current chain, record it. |
688 | if (MO.isKill() && ActiveChains.find(x: MO.getReg()) != ActiveChains.end()) { |
689 | LLVM_DEBUG(dbgs() << "Kill seen for chain " << printReg(MO.getReg(), TRI) |
690 | << "\n" ); |
691 | ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied()); |
692 | } |
693 | ActiveChains.erase(x: MO.getReg()); |
694 | |
695 | } else if (MO.isRegMask()) { |
696 | |
697 | for (auto I = ActiveChains.begin(), E = ActiveChains.end(); |
698 | I != E;) { |
699 | if (MO.clobbersPhysReg(PhysReg: I->first)) { |
700 | LLVM_DEBUG(dbgs() << "Kill (regmask) seen for chain " |
701 | << printReg(I->first, TRI) << "\n" ); |
702 | I->second->setKill(MI, Idx, /*Immutable=*/true); |
703 | ActiveChains.erase(position: I++); |
704 | } else |
705 | ++I; |
706 | } |
707 | |
708 | } |
709 | } |
710 | |
711 | Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) { |
712 | if ((TRI->getEncodingValue(Reg) % 2) == 0) |
713 | return Color::Even; |
714 | else |
715 | return Color::Odd; |
716 | } |
717 | |
718 | // Factory function used by AArch64TargetMachine to add the pass to the passmanager. |
719 | FunctionPass *llvm::createAArch64A57FPLoadBalancing() { |
720 | return new AArch64A57FPLoadBalancing(); |
721 | } |
722 | |