1 | //===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // For best-case performance on Cortex-A57, we should try to use a balanced |
9 | // mix of odd and even D-registers when performing a critical sequence of |
10 | // independent, non-quadword FP/ASIMD floating-point multiply or |
11 | // multiply-accumulate operations. |
12 | // |
13 | // This pass attempts to detect situations where the register allocation may |
14 | // adversely affect this load balancing and to change the registers used so as |
15 | // to better utilize the CPU. |
16 | // |
17 | // Ideally we'd just take each multiply or multiply-accumulate in turn and |
18 | // allocate it alternating even or odd registers. However, multiply-accumulates |
19 | // are most efficiently performed in the same functional unit as their |
20 | // accumulation operand. Therefore this pass tries to find maximal sequences |
21 | // ("Chains") of multiply-accumulates linked via their accumulation operand, |
22 | // and assign them all the same "color" (oddness/evenness). |
23 | // |
24 | // This optimization affects S-register and D-register floating point |
25 | // multiplies and FMADD/FMAs, as well as vector (floating point only) muls and |
26 | // FMADD/FMA. Q register instructions (and 128-bit vector instructions) are |
27 | // not affected. |
28 | //===----------------------------------------------------------------------===// |
29 | |
30 | #include "AArch64.h" |
31 | #include "AArch64InstrInfo.h" |
32 | #include "AArch64Subtarget.h" |
33 | #include "llvm/ADT/EquivalenceClasses.h" |
34 | #include "llvm/CodeGen/MachineFunction.h" |
35 | #include "llvm/CodeGen/MachineFunctionPass.h" |
36 | #include "llvm/CodeGen/MachineInstr.h" |
37 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
38 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
39 | #include "llvm/CodeGen/RegisterClassInfo.h" |
40 | #include "llvm/CodeGen/RegisterScavenging.h" |
41 | #include "llvm/Support/CommandLine.h" |
42 | #include "llvm/Support/Debug.h" |
43 | #include "llvm/Support/raw_ostream.h" |
44 | using namespace llvm; |
45 | |
46 | #define DEBUG_TYPE "aarch64-a57-fp-load-balancing" |
47 | |
48 | // Enforce the algorithm to use the scavenged register even when the original |
49 | // destination register is the correct color. Used for testing. |
50 | static cl::opt<bool> |
51 | TransformAll("aarch64-a57-fp-load-balancing-force-all" , |
52 | cl::desc("Always modify dest registers regardless of color" ), |
53 | cl::init(Val: false), cl::Hidden); |
54 | |
55 | // Never use the balance information obtained from chains - return a specific |
56 | // color always. Used for testing. |
57 | static cl::opt<unsigned> |
58 | OverrideBalance("aarch64-a57-fp-load-balancing-override" , |
59 | cl::desc("Ignore balance information, always return " |
60 | "(1: Even, 2: Odd)." ), |
61 | cl::init(Val: 0), cl::Hidden); |
62 | |
63 | //===----------------------------------------------------------------------===// |
64 | // Helper functions |
65 | |
66 | // Is the instruction a type of multiply on 64-bit (or 32-bit) FPRs? |
67 | static bool isMul(MachineInstr *MI) { |
68 | switch (MI->getOpcode()) { |
69 | case AArch64::FMULSrr: |
70 | case AArch64::FNMULSrr: |
71 | case AArch64::FMULDrr: |
72 | case AArch64::FNMULDrr: |
73 | return true; |
74 | default: |
75 | return false; |
76 | } |
77 | } |
78 | |
79 | // Is the instruction a type of FP multiply-accumulate on 64-bit (or 32-bit) FPRs? |
80 | static bool isMla(MachineInstr *MI) { |
81 | switch (MI->getOpcode()) { |
82 | case AArch64::FMSUBSrrr: |
83 | case AArch64::FMADDSrrr: |
84 | case AArch64::FNMSUBSrrr: |
85 | case AArch64::FNMADDSrrr: |
86 | case AArch64::FMSUBDrrr: |
87 | case AArch64::FMADDDrrr: |
88 | case AArch64::FNMSUBDrrr: |
89 | case AArch64::FNMADDDrrr: |
90 | return true; |
91 | default: |
92 | return false; |
93 | } |
94 | } |
95 | |
96 | //===----------------------------------------------------------------------===// |
97 | |
98 | namespace { |
99 | /// A "color", which is either even or odd. Yes, these aren't really colors |
100 | /// but the algorithm is conceptually doing two-color graph coloring. |
101 | enum class Color { Even, Odd }; |
102 | #ifndef NDEBUG |
103 | static const char *ColorNames[2] = { "Even" , "Odd" }; |
104 | #endif |
105 | |
106 | class Chain; |
107 | |
108 | class AArch64A57FPLoadBalancing : public MachineFunctionPass { |
109 | MachineRegisterInfo *MRI; |
110 | const TargetRegisterInfo *TRI; |
111 | RegisterClassInfo RCI; |
112 | |
113 | public: |
114 | static char ID; |
115 | explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) { |
116 | initializeAArch64A57FPLoadBalancingPass(*PassRegistry::getPassRegistry()); |
117 | } |
118 | |
119 | bool runOnMachineFunction(MachineFunction &F) override; |
120 | |
121 | MachineFunctionProperties getRequiredProperties() const override { |
122 | return MachineFunctionProperties().set( |
123 | MachineFunctionProperties::Property::NoVRegs); |
124 | } |
125 | |
126 | StringRef getPassName() const override { |
127 | return "A57 FP Anti-dependency breaker" ; |
128 | } |
129 | |
130 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
131 | AU.setPreservesCFG(); |
132 | MachineFunctionPass::getAnalysisUsage(AU); |
133 | } |
134 | |
135 | private: |
136 | bool runOnBasicBlock(MachineBasicBlock &MBB); |
137 | bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB, |
138 | int &Balance); |
139 | bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB); |
140 | int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB); |
141 | void scanInstruction(MachineInstr *MI, unsigned Idx, |
142 | std::map<unsigned, Chain*> &Active, |
143 | std::vector<std::unique_ptr<Chain>> &AllChains); |
144 | void maybeKillChain(MachineOperand &MO, unsigned Idx, |
145 | std::map<unsigned, Chain*> &RegChains); |
146 | Color getColor(unsigned Register); |
147 | Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L); |
148 | }; |
149 | } |
150 | |
151 | char AArch64A57FPLoadBalancing::ID = 0; |
152 | |
153 | INITIALIZE_PASS_BEGIN(AArch64A57FPLoadBalancing, DEBUG_TYPE, |
154 | "AArch64 A57 FP Load-Balancing" , false, false) |
155 | INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE, |
156 | "AArch64 A57 FP Load-Balancing" , false, false) |
157 | |
158 | namespace { |
159 | /// A Chain is a sequence of instructions that are linked together by |
160 | /// an accumulation operand. For example: |
161 | /// |
162 | /// fmul def d0, ? |
163 | /// fmla def d1, ?, ?, killed d0 |
164 | /// fmla def d2, ?, ?, killed d1 |
165 | /// |
166 | /// There may be other instructions interleaved in the sequence that |
167 | /// do not belong to the chain. These other instructions must not use |
168 | /// the "chain" register at any point. |
169 | /// |
170 | /// We currently only support chains where the "chain" operand is killed |
171 | /// at each link in the chain for simplicity. |
172 | /// A chain has three important instructions - Start, Last and Kill. |
173 | /// * The start instruction is the first instruction in the chain. |
174 | /// * Last is the final instruction in the chain. |
175 | /// * Kill may or may not be defined. If defined, Kill is the instruction |
176 | /// where the outgoing value of the Last instruction is killed. |
177 | /// This information is important as if we know the outgoing value is |
178 | /// killed with no intervening uses, we can safely change its register. |
179 | /// |
180 | /// Without a kill instruction, we must assume the outgoing value escapes |
181 | /// beyond our model and either must not change its register or must |
182 | /// create a fixup FMOV to keep the old register value consistent. |
183 | /// |
184 | class Chain { |
185 | public: |
186 | /// The important (marker) instructions. |
187 | MachineInstr *StartInst, *LastInst, *KillInst; |
188 | /// The index, from the start of the basic block, that each marker |
189 | /// appears. These are stored so we can do quick interval tests. |
190 | unsigned StartInstIdx, LastInstIdx, KillInstIdx; |
191 | /// All instructions in the chain. |
192 | std::set<MachineInstr*> Insts; |
193 | /// True if KillInst cannot be modified. If this is true, |
194 | /// we cannot change LastInst's outgoing register. |
195 | /// This will be true for tied values and regmasks. |
196 | bool KillIsImmutable; |
197 | /// The "color" of LastInst. This will be the preferred chain color, |
198 | /// as changing intermediate nodes is easy but changing the last |
199 | /// instruction can be more tricky. |
200 | Color LastColor; |
201 | |
202 | Chain(MachineInstr *MI, unsigned Idx, Color C) |
203 | : StartInst(MI), LastInst(MI), KillInst(nullptr), |
204 | StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0), |
205 | LastColor(C) { |
206 | Insts.insert(x: MI); |
207 | } |
208 | |
209 | /// Add a new instruction into the chain. The instruction's dest operand |
210 | /// has the given color. |
211 | void add(MachineInstr *MI, unsigned Idx, Color C) { |
212 | LastInst = MI; |
213 | LastInstIdx = Idx; |
214 | LastColor = C; |
215 | assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) && |
216 | "Chain: broken invariant. A Chain can only be killed after its last " |
217 | "def" ); |
218 | |
219 | Insts.insert(x: MI); |
220 | } |
221 | |
222 | /// Return true if MI is a member of the chain. |
223 | bool contains(MachineInstr &MI) { return Insts.count(x: &MI) > 0; } |
224 | |
225 | /// Return the number of instructions in the chain. |
226 | unsigned size() const { |
227 | return Insts.size(); |
228 | } |
229 | |
230 | /// Inform the chain that its last active register (the dest register of |
231 | /// LastInst) is killed by MI with no intervening uses or defs. |
232 | void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) { |
233 | KillInst = MI; |
234 | KillInstIdx = Idx; |
235 | KillIsImmutable = Immutable; |
236 | assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) && |
237 | "Chain: broken invariant. A Chain can only be killed after its last " |
238 | "def" ); |
239 | } |
240 | |
241 | /// Return the first instruction in the chain. |
242 | MachineInstr *getStart() const { return StartInst; } |
243 | /// Return the last instruction in the chain. |
244 | MachineInstr *getLast() const { return LastInst; } |
245 | /// Return the "kill" instruction (as set with setKill()) or NULL. |
246 | MachineInstr *getKill() const { return KillInst; } |
247 | /// Return an instruction that can be used as an iterator for the end |
248 | /// of the chain. This is the maximum of KillInst (if set) and LastInst. |
249 | MachineBasicBlock::iterator end() const { |
250 | return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst); |
251 | } |
252 | MachineBasicBlock::iterator begin() const { return getStart(); } |
253 | |
254 | /// Can the Kill instruction (assuming one exists) be modified? |
255 | bool isKillImmutable() const { return KillIsImmutable; } |
256 | |
257 | /// Return the preferred color of this chain. |
258 | Color getPreferredColor() { |
259 | if (OverrideBalance != 0) |
260 | return OverrideBalance == 1 ? Color::Even : Color::Odd; |
261 | return LastColor; |
262 | } |
263 | |
264 | /// Return true if this chain (StartInst..KillInst) overlaps with Other. |
265 | bool rangeOverlapsWith(const Chain &Other) const { |
266 | unsigned End = KillInst ? KillInstIdx : LastInstIdx; |
267 | unsigned OtherEnd = Other.KillInst ? |
268 | Other.KillInstIdx : Other.LastInstIdx; |
269 | |
270 | return StartInstIdx <= OtherEnd && Other.StartInstIdx <= End; |
271 | } |
272 | |
273 | /// Return true if this chain starts before Other. |
274 | bool startsBefore(const Chain *Other) const { |
275 | return StartInstIdx < Other->StartInstIdx; |
276 | } |
277 | |
278 | /// Return true if the group will require a fixup MOV at the end. |
279 | bool requiresFixup() const { |
280 | return (getKill() && isKillImmutable()) || !getKill(); |
281 | } |
282 | |
283 | /// Return a simple string representation of the chain. |
284 | std::string str() const { |
285 | std::string S; |
286 | raw_string_ostream OS(S); |
287 | |
288 | OS << "{" ; |
289 | StartInst->print(OS, /* SkipOpers= */IsStandalone: true); |
290 | OS << " -> " ; |
291 | LastInst->print(OS, /* SkipOpers= */IsStandalone: true); |
292 | if (KillInst) { |
293 | OS << " (kill @ " ; |
294 | KillInst->print(OS, /* SkipOpers= */IsStandalone: true); |
295 | OS << ")" ; |
296 | } |
297 | OS << "}" ; |
298 | |
299 | return OS.str(); |
300 | } |
301 | |
302 | }; |
303 | |
304 | } // end anonymous namespace |
305 | |
306 | //===----------------------------------------------------------------------===// |
307 | |
308 | bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) { |
309 | if (skipFunction(F: F.getFunction())) |
310 | return false; |
311 | |
312 | if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps()) |
313 | return false; |
314 | |
315 | bool Changed = false; |
316 | LLVM_DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n" ); |
317 | |
318 | MRI = &F.getRegInfo(); |
319 | TRI = F.getRegInfo().getTargetRegisterInfo(); |
320 | RCI.runOnMachineFunction(MF: F); |
321 | |
322 | for (auto &MBB : F) { |
323 | Changed |= runOnBasicBlock(MBB); |
324 | } |
325 | |
326 | return Changed; |
327 | } |
328 | |
329 | bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) { |
330 | bool Changed = false; |
331 | LLVM_DEBUG(dbgs() << "Running on MBB: " << MBB |
332 | << " - scanning instructions...\n" ); |
333 | |
334 | // First, scan the basic block producing a set of chains. |
335 | |
336 | // The currently "active" chains - chains that can be added to and haven't |
337 | // been killed yet. This is keyed by register - all chains can only have one |
338 | // "link" register between each inst in the chain. |
339 | std::map<unsigned, Chain*> ActiveChains; |
340 | std::vector<std::unique_ptr<Chain>> AllChains; |
341 | unsigned Idx = 0; |
342 | for (auto &MI : MBB) |
343 | scanInstruction(MI: &MI, Idx: Idx++, Active&: ActiveChains, AllChains); |
344 | |
345 | LLVM_DEBUG(dbgs() << "Scan complete, " << AllChains.size() |
346 | << " chains created.\n" ); |
347 | |
348 | // Group the chains into disjoint sets based on their liveness range. This is |
349 | // a poor-man's version of graph coloring. Ideally we'd create an interference |
350 | // graph and perform full-on graph coloring on that, but; |
351 | // (a) That's rather heavyweight for only two colors. |
352 | // (b) We expect multiple disjoint interference regions - in practice the live |
353 | // range of chains is quite small and they are clustered between loads |
354 | // and stores. |
355 | EquivalenceClasses<Chain*> EC; |
356 | for (auto &I : AllChains) |
357 | EC.insert(Data: I.get()); |
358 | |
359 | for (auto &I : AllChains) |
360 | for (auto &J : AllChains) |
361 | if (I != J && I->rangeOverlapsWith(Other: *J)) |
362 | EC.unionSets(V1: I.get(), V2: J.get()); |
363 | LLVM_DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n" ); |
364 | |
365 | // Now we assume that every member of an equivalence class interferes |
366 | // with every other member of that class, and with no members of other classes. |
367 | |
368 | // Convert the EquivalenceClasses to a simpler set of sets. |
369 | std::vector<std::vector<Chain*> > V; |
370 | for (auto I = EC.begin(), E = EC.end(); I != E; ++I) { |
371 | std::vector<Chain*> Cs(EC.member_begin(I), EC.member_end()); |
372 | if (Cs.empty()) continue; |
373 | V.push_back(x: std::move(Cs)); |
374 | } |
375 | |
376 | // Now we have a set of sets, order them by start address so |
377 | // we can iterate over them sequentially. |
378 | llvm::sort(C&: V, |
379 | Comp: [](const std::vector<Chain *> &A, const std::vector<Chain *> &B) { |
380 | return A.front()->startsBefore(Other: B.front()); |
381 | }); |
382 | |
383 | // As we only have two colors, we can track the global (BB-level) balance of |
384 | // odds versus evens. We aim to keep this near zero to keep both execution |
385 | // units fed. |
386 | // Positive means we're even-heavy, negative we're odd-heavy. |
387 | // |
388 | // FIXME: If chains have interdependencies, for example: |
389 | // mul r0, r1, r2 |
390 | // mul r3, r0, r1 |
391 | // We do not model this and may color each one differently, assuming we'll |
392 | // get ILP when we obviously can't. This hasn't been seen to be a problem |
393 | // in practice so far, so we simplify the algorithm by ignoring it. |
394 | int Parity = 0; |
395 | |
396 | for (auto &I : V) |
397 | Changed |= colorChainSet(GV: std::move(I), MBB, Balance&: Parity); |
398 | |
399 | return Changed; |
400 | } |
401 | |
402 | Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor, |
403 | std::vector<Chain*> &L) { |
404 | if (L.empty()) |
405 | return nullptr; |
406 | |
407 | // We try and get the best candidate from L to color next, given that our |
408 | // preferred color is "PreferredColor". L is ordered from larger to smaller |
409 | // chains. It is beneficial to color the large chains before the small chains, |
410 | // but if we can't find a chain of the maximum length with the preferred color, |
411 | // we fuzz the size and look for slightly smaller chains before giving up and |
412 | // returning a chain that must be recolored. |
413 | |
414 | // FIXME: Does this need to be configurable? |
415 | const unsigned SizeFuzz = 1; |
416 | unsigned MinSize = L.front()->size() - SizeFuzz; |
417 | for (auto I = L.begin(), E = L.end(); I != E; ++I) { |
418 | if ((*I)->size() <= MinSize) { |
419 | // We've gone past the size limit. Return the previous item. |
420 | Chain *Ch = *--I; |
421 | L.erase(position: I); |
422 | return Ch; |
423 | } |
424 | |
425 | if ((*I)->getPreferredColor() == PreferredColor) { |
426 | Chain *Ch = *I; |
427 | L.erase(position: I); |
428 | return Ch; |
429 | } |
430 | } |
431 | |
432 | // Bailout case - just return the first item. |
433 | Chain *Ch = L.front(); |
434 | L.erase(position: L.begin()); |
435 | return Ch; |
436 | } |
437 | |
438 | bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV, |
439 | MachineBasicBlock &MBB, |
440 | int &Parity) { |
441 | bool Changed = false; |
442 | LLVM_DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n" ); |
443 | |
444 | // Sort by descending size order so that we allocate the most important |
445 | // sets first. |
446 | // Tie-break equivalent sizes by sorting chains requiring fixups before |
447 | // those without fixups. The logic here is that we should look at the |
448 | // chains that we cannot change before we look at those we can, |
449 | // so the parity counter is updated and we know what color we should |
450 | // change them to! |
451 | // Final tie-break with instruction order so pass output is stable (i.e. not |
452 | // dependent on malloc'd pointer values). |
453 | llvm::sort(C&: GV, Comp: [](const Chain *G1, const Chain *G2) { |
454 | if (G1->size() != G2->size()) |
455 | return G1->size() > G2->size(); |
456 | if (G1->requiresFixup() != G2->requiresFixup()) |
457 | return G1->requiresFixup() > G2->requiresFixup(); |
458 | // Make sure startsBefore() produces a stable final order. |
459 | assert((G1 == G2 || (G1->startsBefore(G2) ^ G2->startsBefore(G1))) && |
460 | "Starts before not total order!" ); |
461 | return G1->startsBefore(Other: G2); |
462 | }); |
463 | |
464 | Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd; |
465 | while (Chain *G = getAndEraseNext(PreferredColor, L&: GV)) { |
466 | // Start off by assuming we'll color to our own preferred color. |
467 | Color C = PreferredColor; |
468 | if (Parity == 0) |
469 | // But if we really don't care, use the chain's preferred color. |
470 | C = G->getPreferredColor(); |
471 | |
472 | LLVM_DEBUG(dbgs() << " - Parity=" << Parity |
473 | << ", Color=" << ColorNames[(int)C] << "\n" ); |
474 | |
475 | // If we'll need a fixup FMOV, don't bother. Testing has shown that this |
476 | // happens infrequently and when it does it has at least a 50% chance of |
477 | // slowing code down instead of speeding it up. |
478 | if (G->requiresFixup() && C != G->getPreferredColor()) { |
479 | C = G->getPreferredColor(); |
480 | LLVM_DEBUG(dbgs() << " - " << G->str() |
481 | << " - not worthwhile changing; " |
482 | "color remains " |
483 | << ColorNames[(int)C] << "\n" ); |
484 | } |
485 | |
486 | Changed |= colorChain(G, C, MBB); |
487 | |
488 | Parity += (C == Color::Even) ? G->size() : -G->size(); |
489 | PreferredColor = Parity < 0 ? Color::Even : Color::Odd; |
490 | } |
491 | |
492 | return Changed; |
493 | } |
494 | |
495 | int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, |
496 | MachineBasicBlock &MBB) { |
497 | // Can we find an appropriate register that is available throughout the life |
498 | // of the chain? Simulate liveness backwards until the end of the chain. |
499 | LiveRegUnits Units(*TRI); |
500 | Units.addLiveOuts(MBB); |
501 | MachineBasicBlock::iterator I = MBB.end(); |
502 | MachineBasicBlock::iterator ChainEnd = G->end(); |
503 | while (I != ChainEnd) { |
504 | --I; |
505 | Units.stepBackward(MI: *I); |
506 | } |
507 | |
508 | // Check which register units are alive throughout the chain. |
509 | MachineBasicBlock::iterator ChainBegin = G->begin(); |
510 | assert(ChainBegin != ChainEnd && "Chain should contain instructions" ); |
511 | do { |
512 | --I; |
513 | Units.accumulate(MI: *I); |
514 | } while (I != ChainBegin); |
515 | |
516 | // Make sure we allocate in-order, to get the cheapest registers first. |
517 | unsigned RegClassID = ChainBegin->getDesc().operands()[0].RegClass; |
518 | auto Ord = RCI.getOrder(RC: TRI->getRegClass(i: RegClassID)); |
519 | for (auto Reg : Ord) { |
520 | if (!Units.available(Reg)) |
521 | continue; |
522 | if (C == getColor(Register: Reg)) |
523 | return Reg; |
524 | } |
525 | |
526 | return -1; |
527 | } |
528 | |
529 | bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, |
530 | MachineBasicBlock &MBB) { |
531 | bool Changed = false; |
532 | LLVM_DEBUG(dbgs() << " - colorChain(" << G->str() << ", " |
533 | << ColorNames[(int)C] << ")\n" ); |
534 | |
535 | // Try and obtain a free register of the right class. Without a register |
536 | // to play with we cannot continue. |
537 | int Reg = scavengeRegister(G, C, MBB); |
538 | if (Reg == -1) { |
539 | LLVM_DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n" ); |
540 | return false; |
541 | } |
542 | LLVM_DEBUG(dbgs() << " - Scavenged register: " << printReg(Reg, TRI) << "\n" ); |
543 | |
544 | std::map<unsigned, unsigned> Substs; |
545 | for (MachineInstr &I : *G) { |
546 | if (!G->contains(MI&: I) && (&I != G->getKill() || G->isKillImmutable())) |
547 | continue; |
548 | |
549 | // I is a member of G, or I is a mutable instruction that kills G. |
550 | |
551 | std::vector<unsigned> ToErase; |
552 | for (auto &U : I.operands()) { |
553 | if (U.isReg() && U.isUse() && Substs.find(x: U.getReg()) != Substs.end()) { |
554 | Register OrigReg = U.getReg(); |
555 | U.setReg(Substs[OrigReg]); |
556 | if (U.isKill()) |
557 | // Don't erase straight away, because there may be other operands |
558 | // that also reference this substitution! |
559 | ToErase.push_back(x: OrigReg); |
560 | } else if (U.isRegMask()) { |
561 | for (auto J : Substs) { |
562 | if (U.clobbersPhysReg(PhysReg: J.first)) |
563 | ToErase.push_back(x: J.first); |
564 | } |
565 | } |
566 | } |
567 | // Now it's safe to remove the substs identified earlier. |
568 | for (auto J : ToErase) |
569 | Substs.erase(x: J); |
570 | |
571 | // Only change the def if this isn't the last instruction. |
572 | if (&I != G->getKill()) { |
573 | MachineOperand &MO = I.getOperand(i: 0); |
574 | |
575 | bool Change = TransformAll || getColor(Register: MO.getReg()) != C; |
576 | if (G->requiresFixup() && &I == G->getLast()) |
577 | Change = false; |
578 | |
579 | if (Change) { |
580 | Substs[MO.getReg()] = Reg; |
581 | MO.setReg(Reg); |
582 | |
583 | Changed = true; |
584 | } |
585 | } |
586 | } |
587 | assert(Substs.size() == 0 && "No substitutions should be left active!" ); |
588 | |
589 | if (G->getKill()) { |
590 | LLVM_DEBUG(dbgs() << " - Kill instruction seen.\n" ); |
591 | } else { |
592 | // We didn't have a kill instruction, but we didn't seem to need to change |
593 | // the destination register anyway. |
594 | LLVM_DEBUG(dbgs() << " - Destination register not changed.\n" ); |
595 | } |
596 | return Changed; |
597 | } |
598 | |
599 | void AArch64A57FPLoadBalancing::scanInstruction( |
600 | MachineInstr *MI, unsigned Idx, std::map<unsigned, Chain *> &ActiveChains, |
601 | std::vector<std::unique_ptr<Chain>> &AllChains) { |
602 | // Inspect "MI", updating ActiveChains and AllChains. |
603 | |
604 | if (isMul(MI)) { |
605 | |
606 | for (auto &I : MI->uses()) |
607 | maybeKillChain(MO&: I, Idx, RegChains&: ActiveChains); |
608 | for (auto &I : MI->defs()) |
609 | maybeKillChain(MO&: I, Idx, RegChains&: ActiveChains); |
610 | |
611 | // Create a new chain. Multiplies don't require forwarding so can go on any |
612 | // unit. |
613 | Register DestReg = MI->getOperand(i: 0).getReg(); |
614 | |
615 | LLVM_DEBUG(dbgs() << "New chain started for register " |
616 | << printReg(DestReg, TRI) << " at " << *MI); |
617 | |
618 | auto G = std::make_unique<Chain>(args&: MI, args&: Idx, args: getColor(Register: DestReg)); |
619 | ActiveChains[DestReg] = G.get(); |
620 | AllChains.push_back(x: std::move(G)); |
621 | |
622 | } else if (isMla(MI)) { |
623 | |
624 | // It is beneficial to keep MLAs on the same functional unit as their |
625 | // accumulator operand. |
626 | Register DestReg = MI->getOperand(i: 0).getReg(); |
627 | Register AccumReg = MI->getOperand(i: 3).getReg(); |
628 | |
629 | maybeKillChain(MO&: MI->getOperand(i: 1), Idx, RegChains&: ActiveChains); |
630 | maybeKillChain(MO&: MI->getOperand(i: 2), Idx, RegChains&: ActiveChains); |
631 | if (DestReg != AccumReg) |
632 | maybeKillChain(MO&: MI->getOperand(i: 0), Idx, RegChains&: ActiveChains); |
633 | |
634 | if (ActiveChains.find(x: AccumReg) != ActiveChains.end()) { |
635 | LLVM_DEBUG(dbgs() << "Chain found for accumulator register " |
636 | << printReg(AccumReg, TRI) << " in MI " << *MI); |
637 | |
638 | // For simplicity we only chain together sequences of MULs/MLAs where the |
639 | // accumulator register is killed on each instruction. This means we don't |
640 | // need to track other uses of the registers we want to rewrite. |
641 | // |
642 | // FIXME: We could extend to handle the non-kill cases for more coverage. |
643 | if (MI->getOperand(i: 3).isKill()) { |
644 | // Add to chain. |
645 | LLVM_DEBUG(dbgs() << "Instruction was successfully added to chain.\n" ); |
646 | ActiveChains[AccumReg]->add(MI, Idx, C: getColor(Register: DestReg)); |
647 | // Handle cases where the destination is not the same as the accumulator. |
648 | if (DestReg != AccumReg) { |
649 | ActiveChains[DestReg] = ActiveChains[AccumReg]; |
650 | ActiveChains.erase(x: AccumReg); |
651 | } |
652 | return; |
653 | } |
654 | |
655 | LLVM_DEBUG( |
656 | dbgs() << "Cannot add to chain because accumulator operand wasn't " |
657 | << "marked <kill>!\n" ); |
658 | maybeKillChain(MO&: MI->getOperand(i: 3), Idx, RegChains&: ActiveChains); |
659 | } |
660 | |
661 | LLVM_DEBUG(dbgs() << "Creating new chain for dest register " |
662 | << printReg(DestReg, TRI) << "\n" ); |
663 | auto G = std::make_unique<Chain>(args&: MI, args&: Idx, args: getColor(Register: DestReg)); |
664 | ActiveChains[DestReg] = G.get(); |
665 | AllChains.push_back(x: std::move(G)); |
666 | |
667 | } else { |
668 | |
669 | // Non-MUL or MLA instruction. Invalidate any chain in the uses or defs |
670 | // lists. |
671 | for (auto &I : MI->uses()) |
672 | maybeKillChain(MO&: I, Idx, RegChains&: ActiveChains); |
673 | for (auto &I : MI->defs()) |
674 | maybeKillChain(MO&: I, Idx, RegChains&: ActiveChains); |
675 | |
676 | } |
677 | } |
678 | |
679 | void AArch64A57FPLoadBalancing:: |
680 | maybeKillChain(MachineOperand &MO, unsigned Idx, |
681 | std::map<unsigned, Chain*> &ActiveChains) { |
682 | // Given an operand and the set of active chains (keyed by register), |
683 | // determine if a chain should be ended and remove from ActiveChains. |
684 | MachineInstr *MI = MO.getParent(); |
685 | |
686 | if (MO.isReg()) { |
687 | |
688 | // If this is a KILL of a current chain, record it. |
689 | if (MO.isKill() && ActiveChains.find(x: MO.getReg()) != ActiveChains.end()) { |
690 | LLVM_DEBUG(dbgs() << "Kill seen for chain " << printReg(MO.getReg(), TRI) |
691 | << "\n" ); |
692 | ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied()); |
693 | } |
694 | ActiveChains.erase(x: MO.getReg()); |
695 | |
696 | } else if (MO.isRegMask()) { |
697 | |
698 | for (auto I = ActiveChains.begin(), E = ActiveChains.end(); |
699 | I != E;) { |
700 | if (MO.clobbersPhysReg(PhysReg: I->first)) { |
701 | LLVM_DEBUG(dbgs() << "Kill (regmask) seen for chain " |
702 | << printReg(I->first, TRI) << "\n" ); |
703 | I->second->setKill(MI, Idx, /*Immutable=*/true); |
704 | ActiveChains.erase(position: I++); |
705 | } else |
706 | ++I; |
707 | } |
708 | |
709 | } |
710 | } |
711 | |
712 | Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) { |
713 | if ((TRI->getEncodingValue(RegNo: Reg) % 2) == 0) |
714 | return Color::Even; |
715 | else |
716 | return Color::Odd; |
717 | } |
718 | |
719 | // Factory function used by AArch64TargetMachine to add the pass to the passmanager. |
720 | FunctionPass *llvm::createAArch64A57FPLoadBalancing() { |
721 | return new AArch64A57FPLoadBalancing(); |
722 | } |
723 | |