1 | //===- ARM64.cpp ----------------------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "Arch/ARM64Common.h" |
10 | #include "InputFiles.h" |
11 | #include "Symbols.h" |
12 | #include "SyntheticSections.h" |
13 | #include "Target.h" |
14 | |
15 | #include "lld/Common/ErrorHandler.h" |
16 | #include "mach-o/compact_unwind_encoding.h" |
17 | #include "llvm/ADT/SmallVector.h" |
18 | #include "llvm/BinaryFormat/MachO.h" |
19 | #include "llvm/Support/Endian.h" |
20 | #include "llvm/Support/LEB128.h" |
21 | #include "llvm/Support/MathExtras.h" |
22 | |
23 | using namespace llvm; |
24 | using namespace llvm::MachO; |
25 | using namespace llvm::support::endian; |
26 | using namespace lld; |
27 | using namespace lld::macho; |
28 | |
namespace {

// Mach-O arm64 target. Shared 32-/64-bit relocation logic lives in
// ARM64Common; this subclass supplies the concrete instruction sequences
// (stubs, thunks) and the linker-optimization-hint rewriting pass.
struct ARM64 : ARM64Common {
  ARM64();
  // Writes a lazy-binding stub that jumps through the pointer at the given VA.
  void writeStub(uint8_t *buf, const Symbol &, uint64_t) const override;
  // Writes the shared stub-helper prologue that tail-calls dyld_stub_binder.
  void writeStubHelperHeader(uint8_t *buf) const override;
  // Writes one per-symbol stub-helper entry branching to the header.
  void writeStubHelperEntry(uint8_t *buf, const Symbol &,
                            uint64_t entryAddr) const override;

  // Writes an objc_msgSend stub (fast or small flavor, per config) and
  // advances stubOffset by the stub's size.
  void writeObjCMsgSendStub(uint8_t *buf, Symbol *sym, uint64_t stubsAddr,
                            uint64_t &stubOffset, uint64_t selrefVA,
                            Symbol *objcMsgSend) const override;
  // Fills in a range-extension thunk that branches to funcSym.
  void populateThunk(InputSection *thunk, Symbol *funcSym) override;
  // Applies LC_LINKER_OPTIMIZATION_HINT transformations to the output buffer.
  void applyOptimizationHints(uint8_t *, const ObjFile &) const override;

  // ICF-safe thunks: a single direct branch to targetSym.
  void initICFSafeThunkBody(InputSection *thunk,
                            Symbol *targetSym) const override;
  Symbol *getThunkBranchTarget(InputSection *thunk) const override;
  uint32_t getICFSafeThunkSize() const override;
};

} // namespace
51 | |
52 | // Random notes on reloc types: |
53 | // ADDEND always pairs with BRANCH26, PAGE21, or PAGEOFF12 |
54 | // POINTER_TO_GOT: ld64 supports a 4-byte pc-relative form as well as an 8-byte |
55 | // absolute version of this relocation. The semantics of the absolute relocation |
56 | // are weird -- it results in the value of the GOT slot being written, instead |
57 | // of the address. Let's not support it unless we find a real-world use case. |
58 | static constexpr std::array<RelocAttrs, 11> relocAttrsArray{._M_elems: { |
59 | #define B(x) RelocAttrBits::x |
60 | {.name: "UNSIGNED" , |
61 | B(UNSIGNED) | B(ABSOLUTE) | B(EXTERN) | B(LOCAL) | B(BYTE4) | B(BYTE8)}, |
62 | {.name: "SUBTRACTOR" , B(SUBTRAHEND) | B(EXTERN) | B(BYTE4) | B(BYTE8)}, |
63 | {.name: "BRANCH26" , B(PCREL) | B(EXTERN) | B(BRANCH) | B(BYTE4)}, |
64 | {.name: "PAGE21" , B(PCREL) | B(EXTERN) | B(BYTE4)}, |
65 | {.name: "PAGEOFF12" , B(ABSOLUTE) | B(EXTERN) | B(BYTE4)}, |
66 | {.name: "GOT_LOAD_PAGE21" , B(PCREL) | B(EXTERN) | B(GOT) | B(BYTE4)}, |
67 | {.name: "GOT_LOAD_PAGEOFF12" , |
68 | B(ABSOLUTE) | B(EXTERN) | B(GOT) | B(LOAD) | B(BYTE4)}, |
69 | {.name: "POINTER_TO_GOT" , B(PCREL) | B(EXTERN) | B(GOT) | B(POINTER) | B(BYTE4)}, |
70 | {.name: "TLVP_LOAD_PAGE21" , B(PCREL) | B(EXTERN) | B(TLV) | B(BYTE4)}, |
71 | {.name: "TLVP_LOAD_PAGEOFF12" , |
72 | B(ABSOLUTE) | B(EXTERN) | B(TLV) | B(LOAD) | B(BYTE4)}, |
73 | {.name: "ADDEND" , B(ADDEND)}, |
74 | #undef B |
75 | }}; |
76 | |
// Lazy-binding stub: load the __la_symbol_ptr slot for the symbol and jump
// through it.
static constexpr uint32_t stubCode[] = {
    0x90000010, // 00: adrp  x16, __la_symbol_ptr@page
    0xf9400210, // 04: ldr   x16, [x16, __la_symbol_ptr@pageoff]
    0xd61f0200, // 08: br    x16
};
82 | |
83 | void ARM64::writeStub(uint8_t *buf8, const Symbol &sym, |
84 | uint64_t pointerVA) const { |
85 | ::writeStub(buf8, stubCode, sym, pointerVA); |
86 | } |
87 | |
// Shared stub-helper prologue: materialize _dyld_private, push it together
// with the entry's branch data, and tail-call dyld_stub_binder. The array
// identifier had been dropped; it must be `stubHelperHeaderCode`, which is
// referenced by writeStubHelperHeader() and the ARM64 constructor.
static constexpr uint32_t stubHelperHeaderCode[] = {
    0x90000011, // 00: adrp  x17, _dyld_private@page
    0x91000231, // 04: add   x17, x17, _dyld_private@pageoff
    0xa9bf47f0, // 08: stp   x16/x17, [sp, #-16]!
    0x90000010, // 0c: adrp  x16, dyld_stub_binder@page
    0xf9400210, // 10: ldr   x16, [x16, dyld_stub_binder@pageoff]
    0xd61f0200, // 14: br    x16
};
96 | |
97 | void ARM64::(uint8_t *buf8) const { |
98 | ::writeStubHelperHeader<LP64>(buf8, stubHelperHeaderCode); |
99 | } |
100 | |
// Per-symbol stub-helper entry: load the entry's data word (patched at link
// time) into w16, then branch to the shared stub-helper header.
static constexpr uint32_t stubHelperEntryCode[] = {
    0x18000050, // 00: ldr  w16, l0
    0x14000000, // 04: b    stubHelperHeader
    0x00000000, // 08: l0: .long 0
};
106 | |
107 | void ARM64::writeStubHelperEntry(uint8_t *buf8, const Symbol &sym, |
108 | uint64_t entryVA) const { |
109 | ::writeStubHelperEntry(buf8, stubHelperEntryCode, sym, entryVA); |
110 | } |
111 | |
// "Fast" objc stub: load the selector into x1, then tail-call _objc_msgSend
// through its GOT entry. Trailing brk instructions pad the stub to the
// 32-byte objcStubsFastAlignment set in the constructor.
static constexpr uint32_t objcStubsFastCode[] = {
    0x90000001, // adrp  x1, __objc_selrefs@page
    0xf9400021, // ldr   x1, [x1, @selector("foo")@pageoff]
    0x90000010, // adrp  x16, _got@page
    0xf9400210, // ldr   x16, [x16, _objc_msgSend@pageoff]
    0xd61f0200, // br    x16
    0xd4200020, // brk   #0x1
    0xd4200020, // brk   #0x1
    0xd4200020, // brk   #0x1
};
122 | |
// "Small" objc stub: load the selector into x1, then branch directly to
// _objc_msgSend (or to its stub when it is not defined in this image).
static constexpr uint32_t objcStubsSmallCode[] = {
    0x90000001, // adrp  x1, __objc_selrefs@page
    0xf9400021, // ldr   x1, [x1, @selector("foo")@pageoff]
    0x14000000, // b     _objc_msgSend
};
128 | |
129 | void ARM64::writeObjCMsgSendStub(uint8_t *buf, Symbol *sym, uint64_t stubsAddr, |
130 | uint64_t &stubOffset, uint64_t selrefVA, |
131 | Symbol *objcMsgSend) const { |
132 | uint64_t objcMsgSendAddr; |
133 | uint64_t objcStubSize; |
134 | uint64_t objcMsgSendIndex; |
135 | |
136 | if (config->objcStubsMode == ObjCStubsMode::fast) { |
137 | objcStubSize = target->objcStubsFastSize; |
138 | objcMsgSendAddr = in.got->addr; |
139 | objcMsgSendIndex = objcMsgSend->gotIndex; |
140 | ::writeObjCMsgSendFastStub<LP64>(buf, objcStubsFastCode, sym, stubsAddr, |
141 | stubOffset, selrefVA, gotAddr: objcMsgSendAddr, |
142 | msgSendIndex: objcMsgSendIndex); |
143 | } else { |
144 | assert(config->objcStubsMode == ObjCStubsMode::small); |
145 | objcStubSize = target->objcStubsSmallSize; |
146 | if (auto *d = dyn_cast<Defined>(Val: objcMsgSend)) { |
147 | objcMsgSendAddr = d->getVA(); |
148 | objcMsgSendIndex = 0; |
149 | } else { |
150 | objcMsgSendAddr = in.stubs->addr; |
151 | objcMsgSendIndex = objcMsgSend->stubsIndex; |
152 | } |
153 | ::writeObjCMsgSendSmallStub<LP64>(buf, objcStubsSmallCode, sym, stubsAddr, |
154 | stubOffset, selrefVA, msgSendAddr: objcMsgSendAddr, |
155 | msgSendIndex: objcMsgSendIndex); |
156 | } |
157 | stubOffset += objcStubSize; |
158 | } |
159 | |
// A thunk is the relaxed variation of stubCode. We don't need the
// extra indirection through a lazy pointer because the target address
// is known at link time.
static constexpr uint32_t thunkCode[] = {
    0x90000010, // 00: adrp  x16, <thunk.ptr>@page
    0x91000210, // 04: add   x16, x16, <thunk.ptr>@pageoff
    0xd61f0200, // 08: br    x16
};
168 | |
169 | void ARM64::populateThunk(InputSection *thunk, Symbol *funcSym) { |
170 | thunk->align = 4; |
171 | thunk->data = {reinterpret_cast<const uint8_t *>(thunkCode), |
172 | sizeof(thunkCode)}; |
173 | thunk->relocs.emplace_back(/*type=*/args: ARM64_RELOC_PAGEOFF12, |
174 | /*pcrel=*/args: false, /*length=*/args: 2, |
175 | /*offset=*/args: 4, /*addend=*/args: 0, |
176 | /*referent=*/args&: funcSym); |
177 | thunk->relocs.emplace_back(/*type=*/args: ARM64_RELOC_PAGE21, |
178 | /*pcrel=*/args: true, /*length=*/args: 2, |
179 | /*offset=*/args: 0, /*addend=*/args: 0, |
180 | /*referent=*/args&: funcSym); |
181 | } |
// Just a single direct branch to the target function.
static constexpr uint32_t icfSafeThunkCode[] = {
    0x14000000, // 00: b target
};
186 | |
187 | void ARM64::initICFSafeThunkBody(InputSection *thunk, Symbol *targetSym) const { |
188 | // The base data here will not be itself modified, we'll just be adding a |
189 | // reloc below. So we can directly use the constexpr above as the data. |
190 | thunk->data = {reinterpret_cast<const uint8_t *>(icfSafeThunkCode), |
191 | sizeof(icfSafeThunkCode)}; |
192 | |
193 | thunk->relocs.emplace_back(/*type=*/args: ARM64_RELOC_BRANCH26, |
194 | /*pcrel=*/args: true, /*length=*/args: 2, |
195 | /*offset=*/args: 0, /*addend=*/args: 0, |
196 | /*referent=*/args&: targetSym); |
197 | } |
198 | |
199 | Symbol *ARM64::getThunkBranchTarget(InputSection *thunk) const { |
200 | assert(thunk->relocs.size() == 1 && |
201 | "expected a single reloc on ARM64 ICF thunk" ); |
202 | auto &reloc = thunk->relocs[0]; |
203 | assert(isa<Symbol *>(reloc.referent) && |
204 | "ARM64 thunk reloc is expected to point to a Symbol" ); |
205 | |
206 | return cast<Symbol *>(Val&: reloc.referent); |
207 | } |
208 | |
// An ICF-safe thunk is exactly the single branch instruction above.
uint32_t ARM64::getICFSafeThunkSize() const { return sizeof(icfSafeThunkCode); }
210 | |
// Sets the arm64-specific sizes, alignments, reloc types, and branch ranges
// consumed by the generic Mach-O linking code.
ARM64::ARM64() : ARM64Common(LP64()) {
  cpuType = CPU_TYPE_ARM64;
  cpuSubtype = CPU_SUBTYPE_ARM64_ALL;

  stubSize = sizeof(stubCode);
  thunkSize = sizeof(thunkCode);

  objcStubsFastSize = sizeof(objcStubsFastCode);
  objcStubsFastAlignment = 32;
  objcStubsSmallSize = sizeof(objcStubsSmallCode);
  objcStubsSmallAlignment = 4;

  // Branch immediate is two's complement 26 bits, which is implicitly
  // multiplied by 4 (since all functions are 4-aligned): the branch range
  // is -4*(2**(26-1))..4*(2**(26-1) - 1).
  backwardBranchRange = 128 * 1024 * 1024;
  forwardBranchRange = backwardBranchRange - 4;

  modeDwarfEncoding = UNWIND_ARM64_MODE_DWARF;
  subtractorRelocType = ARM64_RELOC_SUBTRACTOR;
  unsignedRelocType = ARM64_RELOC_UNSIGNED;

  stubHelperHeaderSize = sizeof(stubHelperHeaderCode);
  stubHelperEntrySize = sizeof(stubHelperEntryCode);

  relocAttrs = {relocAttrsArray.data(), relocAttrsArray.size()};
}
238 | |
namespace {
// Decoded `adrp xN, _foo@PAGE` instruction.
struct Adrp {
  uint32_t destRegister;
  int64_t addend; // page delta in bytes (multiple of 4096)
};

// Decoded `add xM, xN, #imm` instruction.
struct Add {
  uint8_t destRegister;
  uint8_t srcRegister;
  uint32_t addend; // unscaled 12-bit immediate
};

// How a load extends its result into the destination register.
enum ExtendType { ZeroExtend = 1, Sign64 = 2, Sign32 = 3 };

// Decoded immediate-offset load instruction.
struct Ldr {
  uint8_t destRegister;
  uint8_t baseRegister;
  uint8_t p2Size; // log2 of the access size in bytes
  bool isFloat;   // SIMD&FP register load
  ExtendType extendType;
  int64_t offset; // byte offset (already scaled by the access size)
};
} // namespace
262 | |
263 | static bool parseAdrp(uint32_t insn, Adrp &adrp) { |
264 | if ((insn & 0x9f000000) != 0x90000000) |
265 | return false; |
266 | adrp.destRegister = insn & 0x1f; |
267 | uint64_t immHi = (insn >> 5) & 0x7ffff; |
268 | uint64_t immLo = (insn >> 29) & 0x3; |
269 | adrp.addend = SignExtend64<21>(x: immLo | (immHi << 2)) * 4096; |
270 | return true; |
271 | } |
272 | |
273 | static bool parseAdd(uint32_t insn, Add &add) { |
274 | if ((insn & 0xffc00000) != 0x91000000) |
275 | return false; |
276 | add.destRegister = insn & 0x1f; |
277 | add.srcRegister = (insn >> 5) & 0x1f; |
278 | add.addend = (insn >> 10) & 0xfff; |
279 | return true; |
280 | } |
281 | |
// Decodes an immediate-offset load (LDR/LDRB/LDRH, LDRS*, or SIMD&FP LDR)
// into `ldr`; returns false for any other instruction. Note that the
// register fields are written before the opcode is validated, so they are
// only meaningful when true is returned.
static bool parseLdr(uint32_t insn, Ldr &ldr) {
  ldr.destRegister = insn & 0x1f;
  ldr.baseRegister = (insn >> 5) & 0x1f;
  uint8_t size = insn >> 30;
  uint8_t opc = (insn >> 22) & 3;

  if ((insn & 0x3fc00000) == 0x39400000) {
    // LDR (immediate), LDRB (immediate), LDRH (immediate)
    ldr.p2Size = size;
    ldr.extendType = ZeroExtend;
    ldr.isFloat = false;
  } else if ((insn & 0x3f800000) == 0x39800000) {
    // LDRSB (immediate), LDRSH (immediate), LDRSW (immediate)
    ldr.p2Size = size;
    ldr.extendType = static_cast<ExtendType>(opc);
    ldr.isFloat = false;
  } else if ((insn & 0x3f400000) == 0x3d400000) {
    // LDR (immediate, SIMD&FP)
    ldr.extendType = ZeroExtend;
    ldr.isFloat = true;
    if (opc == 1)
      ldr.p2Size = size;
    else if (size == 0 && opc == 3)
      // 128-bit (Q register) load.
      ldr.p2Size = 4;
    else
      return false;
  } else {
    return false;
  }
  // The 12-bit immediate is scaled by the access size.
  ldr.offset = ((insn >> 10) & 0xfff) << ldr.p2Size;
  return true;
}
314 | |
315 | static bool isValidAdrOffset(int32_t delta) { return isInt<21>(x: delta); } |
316 | |
317 | static void writeAdr(void *loc, uint32_t dest, int32_t delta) { |
318 | assert(isValidAdrOffset(delta)); |
319 | uint32_t opcode = 0x10000000; |
320 | uint32_t immHi = (delta & 0x001ffffc) << 3; |
321 | uint32_t immLo = (delta & 0x00000003) << 29; |
322 | write32le(P: loc, V: opcode | immHi | immLo | dest); |
323 | } |
324 | |
325 | static void writeNop(void *loc) { write32le(P: loc, V: 0xd503201f); } |
326 | |
327 | static bool isLiteralLdrEligible(const Ldr &ldr) { |
328 | return ldr.p2Size > 1 && isShiftedInt<19, 2>(x: ldr.offset); |
329 | } |
330 | |
331 | static void writeLiteralLdr(void *loc, const Ldr &ldr) { |
332 | assert(isLiteralLdrEligible(ldr)); |
333 | uint32_t imm19 = (ldr.offset / 4 & maskTrailingOnes<uint32_t>(N: 19)) << 5; |
334 | uint32_t opcode; |
335 | switch (ldr.p2Size) { |
336 | case 2: |
337 | if (ldr.isFloat) |
338 | opcode = 0x1c000000; |
339 | else |
340 | opcode = ldr.extendType == Sign64 ? 0x98000000 : 0x18000000; |
341 | break; |
342 | case 3: |
343 | opcode = ldr.isFloat ? 0x5c000000 : 0x58000000; |
344 | break; |
345 | case 4: |
346 | opcode = 0x9c000000; |
347 | break; |
348 | default: |
349 | llvm_unreachable("Invalid literal ldr size" ); |
350 | } |
351 | write32le(P: loc, V: opcode | imm19 | ldr.destRegister); |
352 | } |
353 | |
354 | static bool isImmediateLdrEligible(const Ldr &ldr) { |
355 | // Note: We deviate from ld64's behavior, which converts to immediate loads |
356 | // only if ldr.offset < 4096, even though the offset is divided by the load's |
357 | // size in the 12-bit immediate operand. Only the unsigned offset variant is |
358 | // supported. |
359 | |
360 | uint32_t size = 1 << ldr.p2Size; |
361 | return ldr.offset >= 0 && (ldr.offset % size) == 0 && |
362 | isUInt<12>(x: ldr.offset >> ldr.p2Size); |
363 | } |
364 | |
365 | static void writeImmediateLdr(void *loc, const Ldr &ldr) { |
366 | assert(isImmediateLdrEligible(ldr)); |
367 | uint32_t opcode = 0x39000000; |
368 | if (ldr.isFloat) { |
369 | opcode |= 0x04000000; |
370 | assert(ldr.extendType == ZeroExtend); |
371 | } |
372 | opcode |= ldr.destRegister; |
373 | opcode |= ldr.baseRegister << 5; |
374 | uint8_t size, opc; |
375 | if (ldr.p2Size == 4) { |
376 | size = 0; |
377 | opc = 3; |
378 | } else { |
379 | opc = ldr.extendType; |
380 | size = ldr.p2Size; |
381 | } |
382 | uint32_t immBits = ldr.offset >> ldr.p2Size; |
383 | write32le(P: loc, V: opcode | (immBits << 10) | (opc << 22) | (size << 30)); |
384 | } |
385 | |
386 | // Transforms a pair of adrp+add instructions into an adr instruction if the |
387 | // target is within the +/- 1 MiB range allowed by the adr's 21 bit signed |
388 | // immediate offset. |
389 | // |
390 | // adrp xN, _foo@PAGE |
391 | // add xM, xN, _foo@PAGEOFF |
392 | // -> |
393 | // adr xM, _foo |
394 | // nop |
395 | static bool applyAdrpAdd(uint8_t *buf, const ConcatInputSection *isec, |
396 | uint64_t offset1, uint64_t offset2) { |
397 | uint32_t ins1 = read32le(P: buf + offset1); |
398 | uint32_t ins2 = read32le(P: buf + offset2); |
399 | Adrp adrp; |
400 | Add add; |
401 | if (!parseAdrp(insn: ins1, adrp) || !parseAdd(insn: ins2, add)) |
402 | return false; |
403 | if (adrp.destRegister != add.srcRegister) |
404 | return false; |
405 | |
406 | uint64_t addr1 = isec->getVA() + offset1; |
407 | uint64_t referent = pageBits(address: addr1) + adrp.addend + add.addend; |
408 | int64_t delta = referent - addr1; |
409 | if (!isValidAdrOffset(delta)) |
410 | return false; |
411 | |
412 | writeAdr(loc: buf + offset1, dest: add.destRegister, delta); |
413 | writeNop(loc: buf + offset2); |
414 | return true; |
415 | } |
416 | |
417 | // Transforms two adrp instructions into a single adrp if their referent |
418 | // addresses are located on the same 4096 byte page. |
419 | // |
420 | // adrp xN, _foo@PAGE |
421 | // adrp xN, _bar@PAGE |
422 | // -> |
423 | // adrp xN, _foo@PAGE |
424 | // nop |
425 | static void applyAdrpAdrp(uint8_t *buf, const ConcatInputSection *isec, |
426 | uint64_t offset1, uint64_t offset2) { |
427 | uint32_t ins1 = read32le(P: buf + offset1); |
428 | uint32_t ins2 = read32le(P: buf + offset2); |
429 | Adrp adrp1, adrp2; |
430 | if (!parseAdrp(insn: ins1, adrp&: adrp1) || !parseAdrp(insn: ins2, adrp&: adrp2)) |
431 | return; |
432 | if (adrp1.destRegister != adrp2.destRegister) |
433 | return; |
434 | |
435 | uint64_t page1 = pageBits(address: offset1 + isec->getVA()) + adrp1.addend; |
436 | uint64_t page2 = pageBits(address: offset2 + isec->getVA()) + adrp2.addend; |
437 | if (page1 != page2) |
438 | return; |
439 | |
440 | writeNop(loc: buf + offset2); |
441 | } |
442 | |
443 | // Transforms a pair of adrp+ldr (immediate) instructions into an ldr (literal) |
444 | // load from a PC-relative address if it is 4-byte aligned and within +/- 1 MiB, |
445 | // as ldr can encode a signed 19-bit offset that gets multiplied by 4. |
446 | // |
447 | // adrp xN, _foo@PAGE |
448 | // ldr xM, [xN, _foo@PAGEOFF] |
449 | // -> |
450 | // nop |
451 | // ldr xM, _foo |
452 | static void applyAdrpLdr(uint8_t *buf, const ConcatInputSection *isec, |
453 | uint64_t offset1, uint64_t offset2) { |
454 | uint32_t ins1 = read32le(P: buf + offset1); |
455 | uint32_t ins2 = read32le(P: buf + offset2); |
456 | Adrp adrp; |
457 | Ldr ldr; |
458 | if (!parseAdrp(insn: ins1, adrp) || !parseLdr(insn: ins2, ldr)) |
459 | return; |
460 | if (adrp.destRegister != ldr.baseRegister) |
461 | return; |
462 | |
463 | uint64_t addr1 = isec->getVA() + offset1; |
464 | uint64_t addr2 = isec->getVA() + offset2; |
465 | uint64_t referent = pageBits(address: addr1) + adrp.addend + ldr.offset; |
466 | ldr.offset = referent - addr2; |
467 | if (!isLiteralLdrEligible(ldr)) |
468 | return; |
469 | |
470 | writeNop(loc: buf + offset1); |
471 | writeLiteralLdr(loc: buf + offset2, ldr); |
472 | } |
473 | |
474 | // GOT loads are emitted by the compiler as a pair of adrp and ldr instructions, |
475 | // but they may be changed to adrp+add by relaxGotLoad(). This hint performs |
476 | // the AdrpLdr or AdrpAdd transformation depending on whether it was relaxed. |
477 | static void applyAdrpLdrGot(uint8_t *buf, const ConcatInputSection *isec, |
478 | uint64_t offset1, uint64_t offset2) { |
479 | uint32_t ins2 = read32le(P: buf + offset2); |
480 | Add add; |
481 | Ldr ldr; |
482 | if (parseAdd(insn: ins2, add)) |
483 | applyAdrpAdd(buf, isec, offset1, offset2); |
484 | else if (parseLdr(insn: ins2, ldr)) |
485 | applyAdrpLdr(buf, isec, offset1, offset2); |
486 | } |
487 | |
488 | // Optimizes an adrp+add+ldr sequence used for loading from a local symbol's |
489 | // address by loading directly if it's close enough, or to an adrp(p)+ldr |
490 | // sequence if it's not. |
491 | // |
492 | // adrp x0, _foo@PAGE |
493 | // add x1, x0, _foo@PAGEOFF |
494 | // ldr x2, [x1, #off] |
495 | static void applyAdrpAddLdr(uint8_t *buf, const ConcatInputSection *isec, |
496 | uint64_t offset1, uint64_t offset2, |
497 | uint64_t offset3) { |
498 | uint32_t ins1 = read32le(P: buf + offset1); |
499 | uint32_t ins2 = read32le(P: buf + offset2); |
500 | uint32_t ins3 = read32le(P: buf + offset3); |
501 | Adrp adrp; |
502 | Add add; |
503 | Ldr ldr; |
504 | if (!parseAdrp(insn: ins1, adrp) || !parseAdd(insn: ins2, add) || !parseLdr(insn: ins3, ldr)) |
505 | return; |
506 | if (adrp.destRegister != add.srcRegister) |
507 | return; |
508 | if (add.destRegister != ldr.baseRegister) |
509 | return; |
510 | |
511 | // Load from the target address directly. |
512 | // nop |
513 | // nop |
514 | // ldr x2, [_foo + #off] |
515 | uint64_t addr1 = isec->getVA() + offset1; |
516 | uint64_t addr3 = isec->getVA() + offset3; |
517 | uint64_t referent = pageBits(address: addr1) + adrp.addend + add.addend; |
518 | Ldr literalLdr = ldr; |
519 | literalLdr.offset += referent - addr3; |
520 | if (isLiteralLdrEligible(ldr: literalLdr)) { |
521 | writeNop(loc: buf + offset1); |
522 | writeNop(loc: buf + offset2); |
523 | writeLiteralLdr(loc: buf + offset3, ldr: literalLdr); |
524 | return; |
525 | } |
526 | |
527 | if (applyAdrpAdd(buf, isec, offset1, offset2)) |
528 | return; |
529 | |
530 | // Move the target's page offset into the ldr's immediate offset. |
531 | // adrp x0, _foo@PAGE |
532 | // nop |
533 | // ldr x2, [x0, _foo@PAGEOFF + #off] |
534 | Ldr immediateLdr = ldr; |
535 | immediateLdr.baseRegister = adrp.destRegister; |
536 | immediateLdr.offset += add.addend; |
537 | if (isImmediateLdrEligible(ldr: immediateLdr)) { |
538 | writeNop(loc: buf + offset2); |
539 | writeImmediateLdr(loc: buf + offset3, ldr: immediateLdr); |
540 | return; |
541 | } |
542 | } |
543 | |
544 | // Relaxes a GOT-indirect load. |
545 | // If the referenced symbol is external and its GOT entry is within +/- 1 MiB, |
546 | // the GOT entry can be loaded with a single literal ldr instruction. |
547 | // If the referenced symbol is local and thus has been relaxed to adrp+add+ldr, |
548 | // we perform the AdrpAddLdr transformation. |
549 | static void applyAdrpLdrGotLdr(uint8_t *buf, const ConcatInputSection *isec, |
550 | uint64_t offset1, uint64_t offset2, |
551 | uint64_t offset3) { |
552 | uint32_t ins2 = read32le(P: buf + offset2); |
553 | Add add; |
554 | Ldr ldr2; |
555 | |
556 | if (parseAdd(insn: ins2, add)) { |
557 | applyAdrpAddLdr(buf, isec, offset1, offset2, offset3); |
558 | } else if (parseLdr(insn: ins2, ldr&: ldr2)) { |
559 | // adrp x1, _foo@GOTPAGE |
560 | // ldr x2, [x1, _foo@GOTPAGEOFF] |
561 | // ldr x3, [x2, #off] |
562 | uint32_t ins3 = read32le(P: buf + offset3); |
563 | Ldr ldr3; |
564 | if (!parseLdr(insn: ins3, ldr&: ldr3)) |
565 | return; |
566 | if (ldr3.baseRegister != ldr2.destRegister) |
567 | return; |
568 | // Loads from the GOT must be pointer sized. |
569 | if (ldr2.p2Size != 3 || ldr2.isFloat) |
570 | return; |
571 | applyAdrpLdr(buf, isec, offset1, offset2); |
572 | } |
573 | } |
574 | |
575 | template <typename Callback> |
576 | static void forEachHint(ArrayRef<uint8_t> data, Callback callback) { |
577 | std::array<uint64_t, 3> args; |
578 | |
579 | auto readNext = [&]() -> uint64_t { |
580 | unsigned int n = 0; |
581 | uint64_t value = decodeULEB128(p: data.data(), n: &n, end: data.end()); |
582 | data = data.drop_front(N: n); |
583 | return value; |
584 | }; |
585 | |
586 | while (!data.empty()) { |
587 | uint64_t type = readNext(); |
588 | if (type == 0) |
589 | break; |
590 | |
591 | uint64_t argCount = readNext(); |
592 | for (unsigned i = 0; i < argCount; ++i) { |
593 | uint64_t arg = readNext(); |
594 | if (i < 3) |
595 | args[i] = arg; |
596 | } |
597 | // All known LOH types as of 2022-09 have 3 or fewer arguments; skip others. |
598 | if (argCount > 3) |
599 | continue; |
600 | callback(type, ArrayRef(args.data(), argCount)); |
601 | } |
602 | } |
603 | |
604 | // On RISC architectures like arm64, materializing a memory address generally |
605 | // takes multiple instructions. If the referenced symbol is located close enough |
606 | // in memory, fewer instructions are needed. |
607 | // |
608 | // Linker optimization hints record where addresses are computed. After |
609 | // addresses have been assigned, if possible, we change them to a shorter |
610 | // sequence of instructions. The size of the binary is not modified; the |
611 | // eliminated instructions are replaced with NOPs. This still leads to faster |
612 | // code as the CPU can skip over NOPs quickly. |
613 | // |
614 | // LOHs are specified by the LC_LINKER_OPTIMIZATION_HINTS load command, which |
615 | // points to a sequence of ULEB128-encoded numbers. Each entry specifies a |
616 | // transformation kind, and 2 or 3 addresses where the instructions are located. |
617 | void ARM64::applyOptimizationHints(uint8_t *outBuf, const ObjFile &obj) const { |
618 | ArrayRef<uint8_t> data = obj.getOptimizationHints(); |
619 | if (data.empty()) |
620 | return; |
621 | |
622 | const ConcatInputSection *section = nullptr; |
623 | uint64_t sectionAddr = 0; |
624 | uint8_t *buf = nullptr; |
625 | |
626 | auto findSection = [&](uint64_t addr) { |
627 | if (section && addr >= sectionAddr && |
628 | addr < sectionAddr + section->getSize()) |
629 | return true; |
630 | |
631 | if (obj.sections.empty()) |
632 | return false; |
633 | auto secIt = std::prev(x: llvm::upper_bound( |
634 | Range: obj.sections, Value&: addr, |
635 | C: [](uint64_t off, const Section *sec) { return off < sec->addr; })); |
636 | const Section *sec = *secIt; |
637 | |
638 | if (sec->subsections.empty()) |
639 | return false; |
640 | auto subsecIt = std::prev(x: llvm::upper_bound( |
641 | Range: sec->subsections, Value: addr - sec->addr, |
642 | C: [](uint64_t off, Subsection subsec) { return off < subsec.offset; })); |
643 | const Subsection &subsec = *subsecIt; |
644 | const ConcatInputSection *isec = |
645 | dyn_cast_or_null<ConcatInputSection>(Val: subsec.isec); |
646 | if (!isec || isec->shouldOmitFromOutput()) |
647 | return false; |
648 | |
649 | section = isec; |
650 | sectionAddr = subsec.offset + sec->addr; |
651 | buf = outBuf + section->outSecOff + section->parent->fileOff; |
652 | return true; |
653 | }; |
654 | |
655 | auto isValidOffset = [&](uint64_t offset) { |
656 | if (offset < sectionAddr || offset >= sectionAddr + section->getSize()) { |
657 | error(msg: toString(file: &obj) + |
658 | ": linker optimization hint spans multiple sections" ); |
659 | return false; |
660 | } |
661 | return true; |
662 | }; |
663 | |
664 | bool hasAdrpAdrp = false; |
665 | forEachHint(data, callback: [&](uint64_t kind, ArrayRef<uint64_t> args) { |
666 | if (kind == LOH_ARM64_ADRP_ADRP) { |
667 | hasAdrpAdrp = true; |
668 | return; |
669 | } |
670 | |
671 | if (!findSection(args[0])) |
672 | return; |
673 | switch (kind) { |
674 | case LOH_ARM64_ADRP_ADD: |
675 | if (isValidOffset(args[1])) |
676 | applyAdrpAdd(buf, isec: section, offset1: args[0] - sectionAddr, |
677 | offset2: args[1] - sectionAddr); |
678 | break; |
679 | case LOH_ARM64_ADRP_LDR: |
680 | if (isValidOffset(args[1])) |
681 | applyAdrpLdr(buf, isec: section, offset1: args[0] - sectionAddr, |
682 | offset2: args[1] - sectionAddr); |
683 | break; |
684 | case LOH_ARM64_ADRP_LDR_GOT: |
685 | if (isValidOffset(args[1])) |
686 | applyAdrpLdrGot(buf, isec: section, offset1: args[0] - sectionAddr, |
687 | offset2: args[1] - sectionAddr); |
688 | break; |
689 | case LOH_ARM64_ADRP_ADD_LDR: |
690 | if (isValidOffset(args[1]) && isValidOffset(args[2])) |
691 | applyAdrpAddLdr(buf, isec: section, offset1: args[0] - sectionAddr, |
692 | offset2: args[1] - sectionAddr, offset3: args[2] - sectionAddr); |
693 | break; |
694 | case LOH_ARM64_ADRP_LDR_GOT_LDR: |
695 | if (isValidOffset(args[1]) && isValidOffset(args[2])) |
696 | applyAdrpLdrGotLdr(buf, isec: section, offset1: args[0] - sectionAddr, |
697 | offset2: args[1] - sectionAddr, offset3: args[2] - sectionAddr); |
698 | break; |
699 | case LOH_ARM64_ADRP_ADD_STR: |
700 | case LOH_ARM64_ADRP_LDR_GOT_STR: |
701 | // TODO: Implement these |
702 | break; |
703 | } |
704 | }); |
705 | |
706 | if (!hasAdrpAdrp) |
707 | return; |
708 | |
709 | // AdrpAdrp optimization hints are performed in a second pass because they |
710 | // might interfere with other transformations. For instance, consider the |
711 | // following input: |
712 | // |
713 | // adrp x0, _foo@PAGE |
714 | // add x1, x0, _foo@PAGEOFF |
715 | // adrp x0, _bar@PAGE |
716 | // add x2, x0, _bar@PAGEOFF |
717 | // |
718 | // If we perform the AdrpAdrp relaxation first, we get: |
719 | // |
720 | // adrp x0, _foo@PAGE |
721 | // add x1, x0, _foo@PAGEOFF |
722 | // nop |
723 | // add x2, x0, _bar@PAGEOFF |
724 | // |
725 | // If we then apply AdrpAdd to the first two instructions, the add will have a |
726 | // garbage value in x0: |
727 | // |
728 | // adr x1, _foo |
729 | // nop |
730 | // nop |
731 | // add x2, x0, _bar@PAGEOFF |
732 | forEachHint(data, callback: [&](uint64_t kind, ArrayRef<uint64_t> args) { |
733 | if (kind != LOH_ARM64_ADRP_ADRP) |
734 | return; |
735 | if (!findSection(args[0])) |
736 | return; |
737 | if (isValidOffset(args[1])) |
738 | applyAdrpAdrp(buf, isec: section, offset1: args[0] - sectionAddr, offset2: args[1] - sectionAddr); |
739 | }); |
740 | } |
741 | |
742 | TargetInfo *macho::createARM64TargetInfo() { |
743 | static ARM64 t; |
744 | return &t; |
745 | } |
746 | |