//===-- AArch64StackTaggingPreRA.cpp --- Stack Tagging for AArch64 -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-stack-tagging-pre-ra"

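// On MTE targets, a load or store that addresses memory through SP with an
// immediate offset is tag-unchecked. Rewriting an access off a tagged pointer
// into an SP-relative (frame index) form therefore removes its runtime tag
// check; the modes below control when that rewrite is applied.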
enum UncheckedLdStMode { UncheckedNever, UncheckedSafe, UncheckedAlways };

static cl::opt<UncheckedLdStMode> ClUncheckedLdSt(
    "stack-tagging-unchecked-ld-st", cl::Hidden, cl::init(UncheckedSafe),
    cl::desc(
        "Unconditionally apply unchecked-ld-st optimization (even for large "
        "stack frames, or in the presence of variable sized allocas)."),
    cl::values(
        clEnumValN(UncheckedNever, "never", "never apply unchecked-ld-st"),
        clEnumValN(
            UncheckedSafe, "safe",
            "apply unchecked-ld-st when the target is definitely within range"),
        clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st")));

static cl::opt<bool>
    ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true),
                cl::desc("Apply first slot optimization for stack tagging "
                         "(eliminate ADDG Rt, Rn, 0, 0)."));

namespace {

class AArch64StackTaggingPreRA : public MachineFunctionPass {
  MachineFunction *MF;
  AArch64FunctionInfo *AFI;
  MachineFrameInfo *MFI;
  MachineRegisterInfo *MRI;
  const AArch64RegisterInfo *TRI;
  const AArch64InstrInfo *TII;

  SmallVector<MachineInstr *, 16> ReTags;

public:
  static char ID;
  AArch64StackTaggingPreRA() : MachineFunctionPass(ID) {}

  bool mayUseUncheckedLoadStore();
  void uncheckUsesOf(unsigned TaggedReg, int FI);
  void uncheckLoadsAndStores();
  std::optional<int> findFirstSlotCandidate();

  bool runOnMachineFunction(MachineFunction &Func) override;
  StringRef getPassName() const override {
    return "AArch64 Stack Tagging PreRA";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
} // end anonymous namespace

char AArch64StackTaggingPreRA::ID = 0;

INITIALIZE_PASS_BEGIN(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra",
                      "AArch64 Stack Tagging PreRA Pass", false, false)
INITIALIZE_PASS_END(AArch64StackTaggingPreRA, "aarch64-stack-tagging-pre-ra",
                    "AArch64 Stack Tagging PreRA Pass", false, false)

FunctionPass *llvm::createAArch64StackTaggingPreRAPass() {
  return new AArch64StackTaggingPreRA();
}

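// The scaled unsigned-immediate and paired load/store forms whose base
// register operand can be rewritten to a frame index, turning the access into
// an SP-relative, tag-unchecked one.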
static bool isUncheckedLoadOrStoreOpcode(unsigned Opcode) {
  switch (Opcode) {
  case AArch64::LDRBBui:
  case AArch64::LDRHHui:
  case AArch64::LDRWui:
  case AArch64::LDRXui:

  case AArch64::LDRBui:
  case AArch64::LDRHui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:

  case AArch64::LDRSHWui:
  case AArch64::LDRSHXui:

  case AArch64::LDRSBWui:
  case AArch64::LDRSBXui:

  case AArch64::LDRSWui:

  case AArch64::STRBBui:
  case AArch64::STRHHui:
  case AArch64::STRWui:
  case AArch64::STRXui:

  case AArch64::STRBui:
  case AArch64::STRHui:
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:

  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::LDPSi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:

  case AArch64::LDPSWi:

  case AArch64::STPWi:
  case AArch64::STPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
    return true;
  default:
    return false;
  }
}

bool AArch64StackTaggingPreRA::mayUseUncheckedLoadStore() {
  if (ClUncheckedLdSt == UncheckedNever)
    return false;
  else if (ClUncheckedLdSt == UncheckedAlways)
    return true;

  // This estimate could be improved if we had harder guarantees about stack
  // frame layout. With LocalStackAllocation we can estimate the SP offset to
  // any preallocated slot. AArch64FrameLowering::orderFrameObjects could put
  // tagged objects ahead of non-tagged ones, but that's not always desirable.
  //
  // Underestimating the SP offset here may require the use of LDG to
  // materialize the tagged address of the stack slot, along with a scratch
  // register allocation (post-regalloc!).
  //
  // For now we do the safe thing and require that the entire stack frame is
  // within range of the shortest of the unchecked instructions.
  unsigned FrameSize = 0;
  for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i)
    FrameSize += MFI->getObjectSize(i);
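  // 0xf00 (3840) is presumably chosen to keep all SP offsets below 4096, the
  // reach of the byte-sized forms with a 12-bit unsigned scaled immediate,
  // with some slack to spare.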
  bool EntireFrameReachableFromSP = FrameSize < 0xf00;
  return !MFI->hasVarSizedObjects() && EntireFrameReachableFromSP;
}

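// Rewrite tag-checked uses of TaggedReg (the result of a TAGPstack, possibly
// forwarded through COPYs) into frame-index-based, SP-relative accesses.
// Schematically (hypothetical MIR, for illustration only):
//   %tagged = TAGPstack %stack.0, 0, %base, 1
//   STRXui %val, %tagged, 2          ; tag-checked store
// becomes
//   STRXui %val, %stack.0, 2         ; SP-relative, tag-unchecked
// where the new frame-index operand is marked with the MO_TAGGED target flag.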
void AArch64StackTaggingPreRA::uncheckUsesOf(unsigned TaggedReg, int FI) {
  for (MachineInstr &UseI :
       llvm::make_early_inc_range(MRI->use_instructions(TaggedReg))) {
    if (isUncheckedLoadOrStoreOpcode(UseI.getOpcode())) {
      // FI operand is always the one before the immediate offset.
      unsigned OpIdx = TII->getLoadStoreImmIdx(UseI.getOpcode()) - 1;
      if (UseI.getOperand(OpIdx).isReg() &&
          UseI.getOperand(OpIdx).getReg() == TaggedReg) {
        UseI.getOperand(OpIdx).ChangeToFrameIndex(FI);
        UseI.getOperand(OpIdx).setTargetFlags(AArch64II::MO_TAGGED);
      }
    } else if (UseI.isCopy() && UseI.getOperand(0).getReg().isVirtual()) {
      uncheckUsesOf(UseI.getOperand(0).getReg(), FI);
    }
  }
}

void AArch64StackTaggingPreRA::uncheckLoadsAndStores() {
  for (auto *I : ReTags) {
    Register TaggedReg = I->getOperand(0).getReg();
    int FI = I->getOperand(1).getIndex();
    uncheckUsesOf(TaggedReg, FI);
  }
}

namespace {
struct SlotWithTag {
  int FI;
  int Tag;
  SlotWithTag(int FI, int Tag) : FI(FI), Tag(Tag) {}
  explicit SlotWithTag(const MachineInstr &MI)
      : FI(MI.getOperand(1).getIndex()), Tag(MI.getOperand(4).getImm()) {}
  bool operator==(const SlotWithTag &Other) const {
    return FI == Other.FI && Tag == Other.Tag;
  }
};
} // namespace

namespace llvm {
template <> struct DenseMapInfo<SlotWithTag> {
  static inline SlotWithTag getEmptyKey() { return {-2, -2}; }
  static inline SlotWithTag getTombstoneKey() { return {-3, -3}; }
  static unsigned getHashValue(const SlotWithTag &V) {
    return hash_combine(DenseMapInfo<int>::getHashValue(V.FI),
                        DenseMapInfo<int>::getHashValue(V.Tag));
  }
  static bool isEqual(const SlotWithTag &A, const SlotWithTag &B) {
    return A == B;
  }
};
} // namespace llvm

static bool isSlotPreAllocated(MachineFrameInfo *MFI, int FI) {
  return MFI->getUseLocalStackAllocationBlock() &&
         MFI->isObjectPreAllocated(FI);
}

// Pin one of the tagged slots to offset 0 from the tagged base pointer.
// This makes its address available in a virtual register (IRG's def), as
// opposed to requiring an ADDG instruction to materialize it. This effectively
// eliminates a vreg (by replacing it with direct uses of IRG, which is usually
// live almost everywhere anyway), and therefore needs to happen before
// regalloc.
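// Illustrative sketch (hypothetical assembly), with the chosen slot pinned to
// offset 0 from the tagged base pointer:
//   irg  x0, sp            // tagged base; doubles as the pinned slot's address
//   addg x1, x0, #16, #1   // other slots still need an ADDG
// i.e. no "addg xN, x0, #0, #0" has to be materialized for the pinned slot.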
std::optional<int> AArch64StackTaggingPreRA::findFirstSlotCandidate() {
  // Find the best (FI, Tag) pair to pin to offset 0.
  // Looking at the possible uses of a tagged address, the advantage of pinning
  // is:
  // - COPY to physical register.
  //   Does not matter; this would trade a MOV instruction for an ADDG.
  // - ST*G instructions matter, but those mostly appear near the function
  //   prologue where all the tagged addresses need to be materialized anyway;
  //   also, counting ST*G uses would overweight large allocas that require
  //   more than one ST*G instruction.
  // - Load/store instructions in the address operand do not require a tagged
  //   pointer, so they do not benefit either. Those operands have already been
  //   eliminated (see uncheckLoadsAndStores), so all remaining load/store
  //   instructions count.
  // - Any other instruction may benefit from being pinned to offset 0.
  LLVM_DEBUG(dbgs() << "AArch64StackTaggingPreRA::findFirstSlotCandidate\n");
  if (!ClFirstSlot)
    return std::nullopt;

  DenseMap<SlotWithTag, int> RetagScore;
  SlotWithTag MaxScoreST{-1, -1};
  int MaxScore = -1;
  for (auto *I : ReTags) {
    SlotWithTag ST{*I};
    if (isSlotPreAllocated(MFI, ST.FI))
      continue;

    Register RetagReg = I->getOperand(0).getReg();
    if (!RetagReg.isVirtual())
      continue;

    int Score = 0;
    SmallVector<Register, 8> WorkList;
    WorkList.push_back(RetagReg);

    while (!WorkList.empty()) {
      Register UseReg = WorkList.pop_back_val();
      for (auto &UseI : MRI->use_instructions(UseReg)) {
        unsigned Opcode = UseI.getOpcode();
        if (Opcode == AArch64::STGi || Opcode == AArch64::ST2Gi ||
            Opcode == AArch64::STZGi || Opcode == AArch64::STZ2Gi ||
            Opcode == AArch64::STGPi || Opcode == AArch64::STGloop ||
            Opcode == AArch64::STZGloop || Opcode == AArch64::STGloop_wback ||
            Opcode == AArch64::STZGloop_wback)
          continue;
        if (UseI.isCopy()) {
          Register DstReg = UseI.getOperand(0).getReg();
          if (DstReg.isVirtual())
            WorkList.push_back(DstReg);
          continue;
        }
        LLVM_DEBUG(dbgs() << "[" << ST.FI << ":" << ST.Tag << "] use of "
                          << printReg(UseReg) << " in " << UseI << "\n");
        Score++;
      }
    }

    int TotalScore = RetagScore[ST] += Score;
    if (TotalScore > MaxScore ||
        (TotalScore == MaxScore && ST.FI > MaxScoreST.FI)) {
      MaxScore = TotalScore;
      MaxScoreST = ST;
    }
  }

  if (MaxScoreST.FI < 0)
    return std::nullopt;

  // If FI's tag is already 0, we are done.
  if (MaxScoreST.Tag == 0)
    return MaxScoreST.FI;

  // Otherwise, find an arbitrary victim pair (FI, Tag) where Tag == 0.
  SlotWithTag SwapST{-1, -1};
  for (auto *I : ReTags) {
    SlotWithTag ST{*I};
    if (ST.Tag == 0) {
      SwapST = ST;
      break;
    }
  }

  // Swap tags between the victim and the highest-scoring pair.
  // If SwapST is still (-1, -1), that's fine, too - we'll simply take tag 0
  // for the highest-scoring slot without changing anything else.
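  // For example, if MaxScoreST is (FI=5, Tag=3) and SwapST is (FI=2, Tag=0),
  // the loop below makes them (FI=5, Tag=0) and (FI=2, Tag=3).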
  for (auto *I : ReTags) {
    SlotWithTag ST{*I};
    MachineOperand &TagOp = I->getOperand(4);
    if (ST == MaxScoreST) {
      TagOp.setImm(0);
    } else if (ST == SwapST) {
      TagOp.setImm(MaxScoreST.Tag);
    }
  }
  return MaxScoreST.FI;
}

bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
  MF = &Func;
  MRI = &MF->getRegInfo();
  AFI = MF->getInfo<AArch64FunctionInfo>();
  TII = static_cast<const AArch64InstrInfo *>(
      MF->getSubtarget().getInstrInfo());
  TRI = static_cast<const AArch64RegisterInfo *>(
      MF->getSubtarget().getRegisterInfo());
  MFI = &MF->getFrameInfo();
  ReTags.clear();

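  // The pass runs before register allocation, while the function is still in
  // SSA form; the vreg use-list walks above and below rely on that.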
  assert(MRI->isSSA());

  LLVM_DEBUG(dbgs() << "********** AArch64 Stack Tagging PreRA **********\n"
                    << "********** Function: " << MF->getName() << '\n');

  SmallSetVector<int, 8> TaggedSlots;
  for (auto &BB : *MF) {
    for (auto &I : BB) {
      if (I.getOpcode() == AArch64::TAGPstack) {
        ReTags.push_back(&I);
        int FI = I.getOperand(1).getIndex();
        TaggedSlots.insert(FI);
        // There should be no offsets in TAGPstack yet.
        assert(I.getOperand(2).getImm() == 0);
      }
    }
  }

  // Take over from SSP. It does nothing for tagged slots, and should not
  // really have been enabled in the first place.
  for (int FI : TaggedSlots)
    MFI->setObjectSSPLayout(FI, MachineFrameInfo::SSPLK_None);

  if (ReTags.empty())
    return false;

  if (mayUseUncheckedLoadStore())
    uncheckLoadsAndStores();

  // Find a slot that is used with zero tag offset, like ADDG #fi, 0.
  // If the base tagged pointer is set up to the address of this slot,
  // the ADDG instruction can be eliminated.
  std::optional<int> BaseSlot = findFirstSlotCandidate();
  if (BaseSlot)
    AFI->setTaggedBasePointerIndex(*BaseSlot);

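  // The TAGPstack computing the base slot's address with tag offset 0 now
  // produces exactly the tagged base pointer, so replace it with a plain COPY.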
  for (auto *I : ReTags) {
    int FI = I->getOperand(1).getIndex();
    int Tag = I->getOperand(4).getImm();
    Register Base = I->getOperand(3).getReg();
    if (Tag == 0 && FI == BaseSlot) {
      BuildMI(*I->getParent(), I, {}, TII->get(AArch64::COPY),
              I->getOperand(0).getReg())
          .addReg(Base);
      I->eraseFromParent();
    }
  }

  return true;
}