1 | //===-- AArch64StackTaggingPreRA.cpp --- Stack Tagging for AArch64 -----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "AArch64.h" |
10 | #include "AArch64InstrInfo.h" |
11 | #include "AArch64MachineFunctionInfo.h" |
12 | #include "llvm/ADT/SetVector.h" |
13 | #include "llvm/ADT/Statistic.h" |
14 | #include "llvm/CodeGen/MachineFrameInfo.h" |
15 | #include "llvm/CodeGen/MachineFunction.h" |
16 | #include "llvm/CodeGen/MachineFunctionPass.h" |
17 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
18 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
19 | #include "llvm/CodeGen/MachineTraceMetrics.h" |
20 | #include "llvm/CodeGen/Passes.h" |
21 | #include "llvm/CodeGen/TargetInstrInfo.h" |
22 | #include "llvm/CodeGen/TargetRegisterInfo.h" |
23 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
24 | #include "llvm/Support/CommandLine.h" |
25 | #include "llvm/Support/Debug.h" |
26 | #include "llvm/Support/raw_ostream.h" |
27 | |
28 | using namespace llvm; |
29 | |
30 | #define DEBUG_TYPE "aarch64-stack-tagging-pre-ra" |
31 | |
// Policy for rewriting loads/stores through a tagged pointer into direct
// (tag-unchecked) frame-index accesses; selected by -stack-tagging-unchecked-ld-st.
enum UncheckedLdStMode { UncheckedNever, UncheckedSafe, UncheckedAlways };
33 | |
// Command-line control for the unchecked load/store rewrite. The default,
// "safe", only applies it when mayUseUncheckedLoadStore() can prove the whole
// frame is within immediate-offset range (see that function for the rationale).
static cl::opt<UncheckedLdStMode> ClUncheckedLdSt(
    "stack-tagging-unchecked-ld-st" , cl::Hidden, cl::init(Val: UncheckedSafe),
    cl::desc(
        "Unconditionally apply unchecked-ld-st optimization (even for large "
        "stack frames, or in the presence of variable sized allocas)." ),
    cl::values(
        clEnumValN(UncheckedNever, "never" , "never apply unchecked-ld-st" ),
        clEnumValN(
            UncheckedSafe, "safe" ,
            "apply unchecked-ld-st when the target is definitely within range" ),
        clEnumValN(UncheckedAlways, "always" , "always apply unchecked-ld-st" )));
45 | |
// Enables findFirstSlotCandidate(): pin one tagged slot to offset 0 of the
// tagged base pointer so its ADDG (with zero offsets) can be removed.
static cl::opt<bool>
    ClFirstSlot("stack-tagging-first-slot-opt" , cl::Hidden, cl::init(Val: true),
                cl::desc("Apply first slot optimization for stack tagging "
                         "(eliminate ADDG Rt, Rn, 0, 0)." ));
50 | |
namespace {

// Pre-regalloc machine pass that optimizes TAGPstack instructions (tagged
// stack-slot address materialization): it rewrites tag-unchecked loads/stores
// to reference frame slots directly, and tries to pin one tagged slot to
// offset 0 of the tagged base pointer so its TAGPstack folds into a COPY.
class AArch64StackTaggingPreRA : public MachineFunctionPass {
  MachineFunction *MF;
  AArch64FunctionInfo *AFI;
  MachineFrameInfo *MFI;
  MachineRegisterInfo *MRI;
  const AArch64RegisterInfo *TRI;
  const AArch64InstrInfo *TII;

  // All TAGPstack instructions found in the current function.
  SmallVector<MachineInstr*, 16> ReTags;

public:
  static char ID;
  AArch64StackTaggingPreRA() : MachineFunctionPass(ID) {}

  // True when the unchecked-ld-st rewrite is both enabled and safe here.
  bool mayUseUncheckedLoadStore();
  // Rewrite unchecked memory uses of \p TaggedReg to frame index \p FI.
  void uncheckUsesOf(unsigned TaggedReg, int FI);
  // Apply uncheckUsesOf to every recorded TAGPstack.
  void uncheckLoadsAndStores();
  // Pick the tagged slot (if any) to place at offset 0 from the tagged base.
  std::optional<int> findFirstSlotCandidate();

  bool runOnMachineFunction(MachineFunction &Func) override;
  StringRef getPassName() const override {
    return "AArch64 Stack Tagging PreRA" ;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
} // end anonymous namespace
83 | |
char AArch64StackTaggingPreRA::ID = 0;

INITIALIZE_PASS_BEGIN(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra" ,
                      "AArch64 Stack Tagging PreRA Pass" , false, false)
INITIALIZE_PASS_END(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra" ,
                    "AArch64 Stack Tagging PreRA Pass" , false, false)

// Factory used by the AArch64 target machine to add this pass to the
// codegen pipeline.
FunctionPass *llvm::createAArch64StackTaggingPreRAPass() {
  return new AArch64StackTaggingPreRA();
}
94 | |
95 | static bool isUncheckedLoadOrStoreOpcode(unsigned Opcode) { |
96 | switch (Opcode) { |
97 | case AArch64::LDRBBui: |
98 | case AArch64::LDRHHui: |
99 | case AArch64::LDRWui: |
100 | case AArch64::LDRXui: |
101 | |
102 | case AArch64::LDRBui: |
103 | case AArch64::LDRHui: |
104 | case AArch64::LDRSui: |
105 | case AArch64::LDRDui: |
106 | case AArch64::LDRQui: |
107 | |
108 | case AArch64::LDRSHWui: |
109 | case AArch64::LDRSHXui: |
110 | |
111 | case AArch64::LDRSBWui: |
112 | case AArch64::LDRSBXui: |
113 | |
114 | case AArch64::LDRSWui: |
115 | |
116 | case AArch64::STRBBui: |
117 | case AArch64::STRHHui: |
118 | case AArch64::STRWui: |
119 | case AArch64::STRXui: |
120 | |
121 | case AArch64::STRBui: |
122 | case AArch64::STRHui: |
123 | case AArch64::STRSui: |
124 | case AArch64::STRDui: |
125 | case AArch64::STRQui: |
126 | |
127 | case AArch64::LDPWi: |
128 | case AArch64::LDPXi: |
129 | case AArch64::LDPSi: |
130 | case AArch64::LDPDi: |
131 | case AArch64::LDPQi: |
132 | |
133 | case AArch64::LDPSWi: |
134 | |
135 | case AArch64::STPWi: |
136 | case AArch64::STPXi: |
137 | case AArch64::STPSi: |
138 | case AArch64::STPDi: |
139 | case AArch64::STPQi: |
140 | return true; |
141 | default: |
142 | return false; |
143 | } |
144 | } |
145 | |
146 | bool AArch64StackTaggingPreRA::mayUseUncheckedLoadStore() { |
147 | if (ClUncheckedLdSt == UncheckedNever) |
148 | return false; |
149 | else if (ClUncheckedLdSt == UncheckedAlways) |
150 | return true; |
151 | |
152 | // This estimate can be improved if we had harder guarantees about stack frame |
153 | // layout. With LocalStackAllocation we can estimate SP offset to any |
154 | // preallocated slot. AArch64FrameLowering::orderFrameObjects could put tagged |
155 | // objects ahead of non-tagged ones, but that's not always desirable. |
156 | // |
157 | // Underestimating SP offset here may require the use of LDG to materialize |
158 | // the tagged address of the stack slot, along with a scratch register |
159 | // allocation (post-regalloc!). |
160 | // |
161 | // For now we do the safe thing here and require that the entire stack frame |
162 | // is within range of the shortest of the unchecked instructions. |
163 | unsigned FrameSize = 0; |
164 | for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) |
165 | FrameSize += MFI->getObjectSize(ObjectIdx: i); |
166 | bool EntireFrameReachableFromSP = FrameSize < 0xf00; |
167 | return !MFI->hasVarSizedObjects() && EntireFrameReachableFromSP; |
168 | } |
169 | |
// Rewrite all tag-unchecked memory uses of \p TaggedReg (the result of a
// TAGPstack) to address frame index \p FI directly, bypassing the tagged
// pointer. Recurses through virtual-register COPYs so uses reached via
// copies of the tagged address are rewritten as well.
void AArch64StackTaggingPreRA::uncheckUsesOf(unsigned TaggedReg, int FI) {
  // ChangeToFrameIndex mutates the use list we are walking, so iterate with
  // an early-increment range.
  for (MachineInstr &UseI :
       llvm::make_early_inc_range(Range: MRI->use_instructions(Reg: TaggedReg))) {
    if (isUncheckedLoadOrStoreOpcode(Opcode: UseI.getOpcode())) {
      // FI operand is always the one before the immediate offset.
      unsigned OpIdx = TII->getLoadStoreImmIdx(Opc: UseI.getOpcode()) - 1;
      // Only rewrite when the tagged register is the base address; it could
      // also appear as the stored value, which must stay as-is.
      if (UseI.getOperand(i: OpIdx).isReg() &&
          UseI.getOperand(i: OpIdx).getReg() == TaggedReg) {
        UseI.getOperand(i: OpIdx).ChangeToFrameIndex(Idx: FI);
        // NOTE(review): MO_TAGGED marks the frame-index operand as a tagged
        // slot access — presumably consumed during frame-index elimination;
        // confirm against AArch64 frame lowering.
        UseI.getOperand(i: OpIdx).setTargetFlags(AArch64II::MO_TAGGED);
      }
    } else if (UseI.isCopy() && UseI.getOperand(i: 0).getReg().isVirtual()) {
      // Follow copies into other virtual registers.
      uncheckUsesOf(TaggedReg: UseI.getOperand(i: 0).getReg(), FI);
    }
  }
}
186 | |
187 | void AArch64StackTaggingPreRA::uncheckLoadsAndStores() { |
188 | for (auto *I : ReTags) { |
189 | Register TaggedReg = I->getOperand(i: 0).getReg(); |
190 | int FI = I->getOperand(i: 1).getIndex(); |
191 | uncheckUsesOf(TaggedReg, FI); |
192 | } |
193 | } |
194 | |
namespace {
// A (frame index, tag offset) pair identifying one retagged stack slot.
struct SlotWithTag {
  int FI;
  int Tag;
  SlotWithTag(int FI, int Tag) : FI(FI), Tag(Tag) {}
  // Extract the pair from a TAGPstack instruction: operand 1 is the frame
  // index, operand 4 the tag-offset immediate.
  explicit SlotWithTag(const MachineInstr &MI)
      : FI(MI.getOperand(i: 1).getIndex()), Tag(MI.getOperand(i: 4).getImm()) {}
  bool operator==(const SlotWithTag &Other) const {
    return FI == Other.FI && Tag == Other.Tag;
  }
};
} // namespace
207 | |
namespace llvm {
// DenseMap key traits so SlotWithTag can be used in RetagScore (see
// findFirstSlotCandidate). The {-2,-2}/{-3,-3} sentinels are chosen to avoid
// the {-1,-1} "invalid" value that findFirstSlotCandidate uses itself.
template <> struct DenseMapInfo<SlotWithTag> {
  static inline SlotWithTag getEmptyKey() { return {-2, -2}; }
  static inline SlotWithTag getTombstoneKey() { return {-3, -3}; }
  static unsigned getHashValue(const SlotWithTag &V) {
    // Combine the hashes of both fields so (FI, Tag) pairs spread well.
    return hash_combine(args: DenseMapInfo<int>::getHashValue(Val: V.FI),
                        args: DenseMapInfo<int>::getHashValue(Val: V.Tag));
  }
  static bool isEqual(const SlotWithTag &A, const SlotWithTag &B) {
    return A == B;
  }
};
} // namespace llvm
221 | |
222 | static bool isSlotPreAllocated(MachineFrameInfo *MFI, int FI) { |
223 | return MFI->getUseLocalStackAllocationBlock() && |
224 | MFI->isObjectPreAllocated(ObjectIdx: FI); |
225 | } |
226 | |
// Pin one of the tagged slots to offset 0 from the tagged base pointer.
// This would make its address available in a virtual register (IRG's def), as
// opposed to requiring an ADDG instruction to materialize. This effectively
// eliminates a vreg (by replacing it with direct uses of IRG, which is usually
// live almost everywhere anyway), and therefore needs to happen before
// regalloc.
//
// Returns the chosen frame index, or std::nullopt when the optimization is
// disabled or no suitable slot exists. May rewrite the tag-offset immediates
// of TAGPstack instructions in ReTags (swapping tags between two slots).
std::optional<int> AArch64StackTaggingPreRA::findFirstSlotCandidate() {
  // Find the best (FI, Tag) pair to pin to offset 0.
  // Looking at the possible uses of a tagged address, the advantage of pinning
  // is:
  // - COPY to physical register.
  //   Does not matter, this would trade a MOV instruction for an ADDG.
  // - ST*G matter, but those mostly appear near the function prologue where all
  //   the tagged addresses need to be materialized anyway; also, counting ST*G
  //   uses would overweight large allocas that require more than one ST*G
  //   instruction.
  // - Load/Store instructions in the address operand do not require a tagged
  //   pointer, so they also do not benefit. These operands have already been
  //   eliminated (see uncheckLoadsAndStores) so all remaining load/store
  //   instructions count.
  // - Any other instruction may benefit from being pinned to offset 0.
  LLVM_DEBUG(dbgs() << "AArch64StackTaggingPreRA::findFirstSlotCandidate\n" );
  if (!ClFirstSlot)
    return std::nullopt;

  DenseMap<SlotWithTag, int> RetagScore;
  SlotWithTag MaxScoreST{-1, -1};
  int MaxScore = -1;
  for (auto *I : ReTags) {
    SlotWithTag ST{*I};
    // Slots already placed by LocalStackAllocation cannot be moved to
    // offset 0.
    if (isSlotPreAllocated(MFI, FI: ST.FI))
      continue;

    Register RetagReg = I->getOperand(i: 0).getReg();
    if (!RetagReg.isVirtual())
      continue;

    // Score = number of non-COPY, non-ST*G uses reachable from the tagged
    // address through chains of virtual-register copies.
    int Score = 0;
    SmallVector<Register, 8> WorkList;
    WorkList.push_back(Elt: RetagReg);

    while (!WorkList.empty()) {
      Register UseReg = WorkList.pop_back_val();
      for (auto &UseI : MRI->use_instructions(Reg: UseReg)) {
        unsigned Opcode = UseI.getOpcode();
        // Tag-store instructions do not count (see the rationale above).
        if (Opcode == AArch64::STGi || Opcode == AArch64::ST2Gi ||
            Opcode == AArch64::STZGi || Opcode == AArch64::STZ2Gi ||
            Opcode == AArch64::STGPi || Opcode == AArch64::STGloop ||
            Opcode == AArch64::STZGloop || Opcode == AArch64::STGloop_wback ||
            Opcode == AArch64::STZGloop_wback)
          continue;
        if (UseI.isCopy()) {
          // Follow copies into virtual registers rather than counting them.
          Register DstReg = UseI.getOperand(i: 0).getReg();
          if (DstReg.isVirtual())
            WorkList.push_back(Elt: DstReg);
          continue;
        }
        LLVM_DEBUG(dbgs() << "[" << ST.FI << ":" << ST.Tag << "] use of "
                          << printReg(UseReg) << " in " << UseI << "\n" );
        Score++;
      }
    }

    // Accumulate: the same (FI, Tag) pair may be materialized by several
    // TAGPstack instructions. Ties go to the higher frame index.
    int TotalScore = RetagScore[ST] += Score;
    if (TotalScore > MaxScore ||
        (TotalScore == MaxScore && ST.FI > MaxScoreST.FI)) {
      MaxScore = TotalScore;
      MaxScoreST = ST;
    }
  }

  // No candidate found (all slots preallocated or not virtual-reg defined).
  if (MaxScoreST.FI < 0)
    return std::nullopt;

  // If FI's tag is already 0, we are done.
  if (MaxScoreST.Tag == 0)
    return MaxScoreST.FI;

  // Otherwise, find a random victim pair (FI, Tag) where Tag == 0.
  SlotWithTag SwapST{-1, -1};
  for (auto *I : ReTags) {
    SlotWithTag ST{*I};
    if (ST.Tag == 0) {
      SwapST = ST;
      break;
    }
  }

  // Swap tags between the victim and the highest scoring pair.
  // If SwapWith is still (-1, -1), that's fine, too - we'll simply take tag for
  // the highest score slot without changing anything else.
  for (auto *&I : ReTags) {
    SlotWithTag ST{*I};
    MachineOperand &TagOp = I->getOperand(i: 4);
    if (ST == MaxScoreST) {
      TagOp.setImm(0);
    } else if (ST == SwapST) {
      TagOp.setImm(MaxScoreST.Tag);
    }
  }
  return MaxScoreST.FI;
}
329 | |
// Pass driver: collect all TAGPstack instructions, optionally rewrite
// tag-unchecked loads/stores to direct frame references, choose a base slot
// to pin at offset 0, and fold that slot's zero-offset TAGPstack into a COPY
// of the tagged base pointer.
bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
  MF = &Func;
  MRI = &MF->getRegInfo();
  AFI = MF->getInfo<AArch64FunctionInfo>();
  TII = static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF->getSubtarget().getRegisterInfo());
  MFI = &MF->getFrameInfo();
  ReTags.clear();

  // Runs pre-regalloc: the register-copy chasing below relies on SSA form.
  assert(MRI->isSSA());

  LLVM_DEBUG(dbgs() << "********** AArch64 Stack Tagging PreRA **********\n"
                    << "********** Function: " << MF->getName() << '\n');

  // Collect every TAGPstack and the set of distinct tagged frame slots.
  SmallSetVector<int, 8> TaggedSlots;
  for (auto &BB : *MF) {
    for (auto &I : BB) {
      if (I.getOpcode() == AArch64::TAGPstack) {
        ReTags.push_back(Elt: &I);
        int FI = I.getOperand(i: 1).getIndex();
        TaggedSlots.insert(X: FI);
        // There should be no offsets in TAGP yet.
        assert(I.getOperand(2).getImm() == 0);
      }
    }
  }

  // Take over from SSP. It does nothing for tagged slots, and should not really
  // have been enabled in the first place.
  for (int FI : TaggedSlots)
    MFI->setObjectSSPLayout(ObjectIdx: FI, Kind: MachineFrameInfo::SSPLK_None);

  if (ReTags.empty())
    return false;

  if (mayUseUncheckedLoadStore())
    uncheckLoadsAndStores();

  // Find a slot that is used with zero tag offset, like ADDG #fi, 0.
  // If the base tagged pointer is set up to the address of this slot,
  // the ADDG instruction can be eliminated.
  std::optional<int> BaseSlot = findFirstSlotCandidate();
  if (BaseSlot)
    AFI->setTaggedBasePointerIndex(*BaseSlot);

  // The base slot's zero-offset TAGPstack is now redundant: its result equals
  // the tagged base pointer itself, so replace it with a plain COPY.
  for (auto *I : ReTags) {
    int FI = I->getOperand(i: 1).getIndex();
    int Tag = I->getOperand(i: 4).getImm();
    Register Base = I->getOperand(i: 3).getReg();
    if (Tag == 0 && FI == BaseSlot) {
      BuildMI(BB&: *I->getParent(), I, MIMD: {}, MCID: TII->get(Opcode: AArch64::COPY),
              DestReg: I->getOperand(i: 0).getReg())
          .addReg(RegNo: Base);
      I->eraseFromParent();
    }
  }

  return true;
}
390 | |