1//===- LinkerOptimizationHints.cpp ----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "LinkerOptimizationHints.h"
10
11#include "Arch/ARM64Common.h"
12#include "lld/Common/ErrorHandler.h"
13#include "llvm/ADT/ArrayRef.h"
14#include "llvm/BinaryFormat/MachO.h"
15#include "llvm/Support/Endian.h"
16#include "llvm/Support/LEB128.h"
17#include "llvm/Support/MathExtras.h"
18
19using namespace llvm;
20using namespace llvm::MachO;
21using namespace llvm::support::endian;
22using namespace lld;
23using namespace lld::macho;
24
25namespace {
26struct Adrp {
27 uint32_t destRegister;
28 int64_t addend;
29};
30
31struct Add {
32 uint8_t destRegister;
33 uint8_t srcRegister;
34 uint32_t addend;
35};
36
37enum ExtendType { ZeroExtend = 1, Sign64 = 2, Sign32 = 3 };
38
39struct Ldr {
40 uint8_t destRegister;
41 uint8_t baseRegister;
42 uint8_t p2Size;
43 bool isFloat;
44 ExtendType extendType;
45 int64_t offset;
46};
47} // namespace
48
49static bool parseAdrp(uint32_t insn, Adrp &adrp) {
50 if ((insn & 0x9f000000) != 0x90000000)
51 return false;
52 adrp.destRegister = insn & 0x1f;
53 uint64_t immHi = (insn >> 5) & 0x7ffff;
54 uint64_t immLo = (insn >> 29) & 0x3;
55 adrp.addend = SignExtend64<21>(x: immLo | (immHi << 2)) * 4096;
56 return true;
57}
58
59static bool parseAdd(uint32_t insn, Add &add) {
60 if ((insn & 0xffc00000) != 0x91000000)
61 return false;
62 add.destRegister = insn & 0x1f;
63 add.srcRegister = (insn >> 5) & 0x1f;
64 add.addend = (insn >> 10) & 0xfff;
65 return true;
66}
67
68static bool parseLdr(uint32_t insn, Ldr &ldr) {
69 ldr.destRegister = insn & 0x1f;
70 ldr.baseRegister = (insn >> 5) & 0x1f;
71 uint8_t size = insn >> 30;
72 uint8_t opc = (insn >> 22) & 3;
73
74 if ((insn & 0x3fc00000) == 0x39400000) {
75 // LDR (immediate), LDRB (immediate), LDRH (immediate)
76 ldr.p2Size = size;
77 ldr.extendType = ZeroExtend;
78 ldr.isFloat = false;
79 } else if ((insn & 0x3f800000) == 0x39800000) {
80 // LDRSB (immediate), LDRSH (immediate), LDRSW (immediate)
81 ldr.p2Size = size;
82 ldr.extendType = static_cast<ExtendType>(opc);
83 ldr.isFloat = false;
84 } else if ((insn & 0x3f400000) == 0x3d400000) {
85 // LDR (immediate, SIMD&FP)
86 ldr.extendType = ZeroExtend;
87 ldr.isFloat = true;
88 if (opc == 1)
89 ldr.p2Size = size;
90 else if (size == 0 && opc == 3)
91 ldr.p2Size = 4;
92 else
93 return false;
94 } else {
95 return false;
96 }
97 ldr.offset = ((insn >> 10) & 0xfff) << ldr.p2Size;
98 return true;
99}
100
101static bool isValidAdrOffset(int32_t delta) { return isInt<21>(x: delta); }
102
103static void writeAdr(void *loc, uint32_t dest, int32_t delta) {
104 assert(isValidAdrOffset(delta));
105 uint32_t opcode = 0x10000000;
106 uint32_t immHi = (delta & 0x001ffffc) << 3;
107 uint32_t immLo = (delta & 0x00000003) << 29;
108 write32le(P: loc, V: opcode | immHi | immLo | dest);
109}
110
111static void writeNop(void *loc) { write32le(P: loc, V: 0xd503201f); }
112
113static bool isLiteralLdrEligible(const Ldr &ldr) {
114 return ldr.p2Size > 1 && isShiftedInt<19, 2>(x: ldr.offset);
115}
116
117static void writeLiteralLdr(void *loc, const Ldr &ldr) {
118 assert(isLiteralLdrEligible(ldr));
119 uint32_t imm19 = (ldr.offset / 4 & maskTrailingOnes<uint32_t>(N: 19)) << 5;
120 uint32_t opcode;
121 switch (ldr.p2Size) {
122 case 2:
123 if (ldr.isFloat)
124 opcode = 0x1c000000;
125 else
126 opcode = ldr.extendType == Sign64 ? 0x98000000 : 0x18000000;
127 break;
128 case 3:
129 opcode = ldr.isFloat ? 0x5c000000 : 0x58000000;
130 break;
131 case 4:
132 opcode = 0x9c000000;
133 break;
134 default:
135 llvm_unreachable("Invalid literal ldr size");
136 }
137 write32le(P: loc, V: opcode | imm19 | ldr.destRegister);
138}
139
140static bool isImmediateLdrEligible(const Ldr &ldr) {
141 // Note: We deviate from ld64's behavior, which converts to immediate loads
142 // only if ldr.offset < 4096, even though the offset is divided by the load's
143 // size in the 12-bit immediate operand. Only the unsigned offset variant is
144 // supported.
145
146 uint32_t size = 1 << ldr.p2Size;
147 return ldr.offset >= 0 && (ldr.offset % size) == 0 &&
148 isUInt<12>(x: ldr.offset >> ldr.p2Size);
149}
150
151static void writeImmediateLdr(void *loc, const Ldr &ldr) {
152 assert(isImmediateLdrEligible(ldr));
153 uint32_t opcode = 0x39000000;
154 if (ldr.isFloat) {
155 opcode |= 0x04000000;
156 assert(ldr.extendType == ZeroExtend);
157 }
158 opcode |= ldr.destRegister;
159 opcode |= ldr.baseRegister << 5;
160 uint8_t size, opc;
161 if (ldr.p2Size == 4) {
162 size = 0;
163 opc = 3;
164 } else {
165 opc = ldr.extendType;
166 size = ldr.p2Size;
167 }
168 uint32_t immBits = ldr.offset >> ldr.p2Size;
169 write32le(P: loc, V: opcode | (immBits << 10) | (opc << 22) | (size << 30));
170}
171
172// Transforms a pair of adrp+add instructions into an adr instruction if the
173// target is within the +/- 1 MiB range allowed by the adr's 21 bit signed
174// immediate offset.
175//
176// adrp xN, _foo@PAGE
177// add xM, xN, _foo@PAGEOFF
178// ->
179// adr xM, _foo
180// nop
181static bool applyAdrpAdd(uint8_t *buf, const ConcatInputSection *isec,
182 uint64_t offset1, uint64_t offset2) {
183 uint32_t ins1 = read32le(P: buf + offset1);
184 uint32_t ins2 = read32le(P: buf + offset2);
185 Adrp adrp;
186 Add add;
187 if (!parseAdrp(insn: ins1, adrp) || !parseAdd(insn: ins2, add))
188 return false;
189 if (adrp.destRegister != add.srcRegister)
190 return false;
191
192 uint64_t addr1 = isec->getVA() + offset1;
193 uint64_t referent = lld::macho::pageBits(address: addr1) + adrp.addend + add.addend;
194 int64_t delta = referent - addr1;
195 if (!isValidAdrOffset(delta))
196 return false;
197
198 writeAdr(loc: buf + offset1, dest: add.destRegister, delta);
199 writeNop(loc: buf + offset2);
200 return true;
201}
202
203// Transforms two adrp instructions into a single adrp if their referent
204// addresses are located on the same 4096 byte page.
205//
206// adrp xN, _foo@PAGE
207// adrp xN, _bar@PAGE
208// ->
209// adrp xN, _foo@PAGE
210// nop
211static void applyAdrpAdrp(uint8_t *buf, const ConcatInputSection *isec,
212 uint64_t offset1, uint64_t offset2) {
213 uint32_t ins1 = read32le(P: buf + offset1);
214 uint32_t ins2 = read32le(P: buf + offset2);
215 Adrp adrp1, adrp2;
216 if (!parseAdrp(insn: ins1, adrp&: adrp1) || !parseAdrp(insn: ins2, adrp&: adrp2))
217 return;
218 if (adrp1.destRegister != adrp2.destRegister)
219 return;
220
221 uint64_t page1 = pageBits(address: offset1 + isec->getVA()) + adrp1.addend;
222 uint64_t page2 = pageBits(address: offset2 + isec->getVA()) + adrp2.addend;
223 if (page1 != page2)
224 return;
225
226 writeNop(loc: buf + offset2);
227}
228
229// Transforms a pair of adrp+ldr (immediate) instructions into an ldr (literal)
230// load from a PC-relative address if it is 4-byte aligned and within +/- 1 MiB,
231// as ldr can encode a signed 19-bit offset that gets multiplied by 4.
232//
233// adrp xN, _foo@PAGE
234// ldr xM, [xN, _foo@PAGEOFF]
235// ->
236// nop
237// ldr xM, _foo
238static void applyAdrpLdr(uint8_t *buf, const ConcatInputSection *isec,
239 uint64_t offset1, uint64_t offset2) {
240 uint32_t ins1 = read32le(P: buf + offset1);
241 uint32_t ins2 = read32le(P: buf + offset2);
242 Adrp adrp;
243 Ldr ldr;
244 if (!parseAdrp(insn: ins1, adrp) || !parseLdr(insn: ins2, ldr))
245 return;
246 if (adrp.destRegister != ldr.baseRegister)
247 return;
248
249 uint64_t addr1 = isec->getVA() + offset1;
250 uint64_t addr2 = isec->getVA() + offset2;
251 uint64_t referent = pageBits(address: addr1) + adrp.addend + ldr.offset;
252 ldr.offset = referent - addr2;
253 if (!isLiteralLdrEligible(ldr))
254 return;
255
256 writeNop(loc: buf + offset1);
257 writeLiteralLdr(loc: buf + offset2, ldr);
258}
259
260// GOT loads are emitted by the compiler as a pair of adrp and ldr instructions,
261// but they may be changed to adrp+add by relaxGotLoad(). This hint performs
262// the AdrpLdr or AdrpAdd transformation depending on whether it was relaxed.
263static void applyAdrpLdrGot(uint8_t *buf, const ConcatInputSection *isec,
264 uint64_t offset1, uint64_t offset2) {
265 uint32_t ins2 = read32le(P: buf + offset2);
266 Add add;
267 Ldr ldr;
268 if (parseAdd(insn: ins2, add))
269 applyAdrpAdd(buf, isec, offset1, offset2);
270 else if (parseLdr(insn: ins2, ldr))
271 applyAdrpLdr(buf, isec, offset1, offset2);
272}
273
274// Optimizes an adrp+add+ldr sequence used for loading from a local symbol's
275// address by loading directly if it's close enough, or to an adrp(p)+ldr
276// sequence if it's not.
277//
278// adrp x0, _foo@PAGE
279// add x1, x0, _foo@PAGEOFF
280// ldr x2, [x1, #off]
281static void applyAdrpAddLdr(uint8_t *buf, const ConcatInputSection *isec,
282 uint64_t offset1, uint64_t offset2,
283 uint64_t offset3) {
284 uint32_t ins1 = read32le(P: buf + offset1);
285 uint32_t ins2 = read32le(P: buf + offset2);
286 uint32_t ins3 = read32le(P: buf + offset3);
287 Adrp adrp;
288 Add add;
289 Ldr ldr;
290 if (!parseAdrp(insn: ins1, adrp) || !parseAdd(insn: ins2, add) || !parseLdr(insn: ins3, ldr))
291 return;
292 if (adrp.destRegister != add.srcRegister)
293 return;
294 if (add.destRegister != ldr.baseRegister)
295 return;
296
297 // Load from the target address directly.
298 // nop
299 // nop
300 // ldr x2, [_foo + #off]
301 uint64_t addr1 = isec->getVA() + offset1;
302 uint64_t addr3 = isec->getVA() + offset3;
303 uint64_t referent = pageBits(address: addr1) + adrp.addend + add.addend;
304 Ldr literalLdr = ldr;
305 literalLdr.offset += referent - addr3;
306 if (isLiteralLdrEligible(ldr: literalLdr)) {
307 writeNop(loc: buf + offset1);
308 writeNop(loc: buf + offset2);
309 writeLiteralLdr(loc: buf + offset3, ldr: literalLdr);
310 return;
311 }
312
313 if (applyAdrpAdd(buf, isec, offset1, offset2))
314 return;
315
316 // Move the target's page offset into the ldr's immediate offset.
317 // adrp x0, _foo@PAGE
318 // nop
319 // ldr x2, [x0, _foo@PAGEOFF + #off]
320 Ldr immediateLdr = ldr;
321 immediateLdr.baseRegister = adrp.destRegister;
322 immediateLdr.offset += add.addend;
323 if (isImmediateLdrEligible(ldr: immediateLdr)) {
324 writeNop(loc: buf + offset2);
325 writeImmediateLdr(loc: buf + offset3, ldr: immediateLdr);
326 return;
327 }
328}
329
330// Relaxes a GOT-indirect load.
331// If the referenced symbol is external and its GOT entry is within +/- 1 MiB,
332// the GOT entry can be loaded with a single literal ldr instruction.
333// If the referenced symbol is local and thus has been relaxed to adrp+add+ldr,
334// we perform the AdrpAddLdr transformation.
335static void applyAdrpLdrGotLdr(uint8_t *buf, const ConcatInputSection *isec,
336 uint64_t offset1, uint64_t offset2,
337 uint64_t offset3) {
338 uint32_t ins2 = read32le(P: buf + offset2);
339 Add add;
340 Ldr ldr2;
341
342 if (parseAdd(insn: ins2, add)) {
343 applyAdrpAddLdr(buf, isec, offset1, offset2, offset3);
344 } else if (parseLdr(insn: ins2, ldr&: ldr2)) {
345 // adrp x1, _foo@GOTPAGE
346 // ldr x2, [x1, _foo@GOTPAGEOFF]
347 // ldr x3, [x2, #off]
348 uint32_t ins3 = read32le(P: buf + offset3);
349 Ldr ldr3;
350 if (!parseLdr(insn: ins3, ldr&: ldr3))
351 return;
352 if (ldr3.baseRegister != ldr2.destRegister)
353 return;
354 applyAdrpLdr(buf, isec, offset1, offset2);
355 }
356}
357
358template <typename Callback>
359static void forEachHint(ArrayRef<uint8_t> data, Callback callback) {
360 std::array<uint64_t, 3> args;
361
362 auto readNext = [&]() -> uint64_t {
363 unsigned int n = 0;
364 uint64_t value = decodeULEB128(p: data.data(), n: &n, end: data.end());
365 data = data.drop_front(N: n);
366 return value;
367 };
368
369 while (!data.empty()) {
370 uint64_t type = readNext();
371 if (type == 0)
372 break;
373
374 uint64_t argCount = readNext();
375 for (unsigned i = 0; i < argCount; ++i) {
376 uint64_t arg = readNext();
377 if (i < 3)
378 args[i] = arg;
379 }
380 // All known LOH types as of 2022-09 have 3 or fewer arguments; skip others.
381 if (argCount > 3)
382 continue;
383 callback(type, ArrayRef(args.data(), argCount));
384 }
385}
386
387// On RISC architectures like arm64, materializing a memory address generally
388// takes multiple instructions. If the referenced symbol is located close enough
389// in memory, fewer instructions are needed.
390//
391// Linker optimization hints record where addresses are computed. After
392// addresses have been assigned, if possible, we change them to a shorter
393// sequence of instructions. The size of the binary is not modified; the
394// eliminated instructions are replaced with NOPs. This still leads to faster
395// code as the CPU can skip over NOPs quickly.
396//
397// LOHs are specified by the LC_LINKER_OPTIMIZATION_HINTS load command, which
398// points to a sequence of ULEB128-encoded numbers. Each entry specifies a
399// transformation kind, and 2 or 3 addresses where the instructions are located.
400void macho::applyOptimizationHints(uint8_t *outBuf, const ObjFile &obj) {
401 ArrayRef<uint8_t> data = obj.getOptimizationHints();
402 if (data.empty())
403 return;
404
405 const ConcatInputSection *section = nullptr;
406 uint64_t sectionAddr = 0;
407 uint8_t *buf = nullptr;
408
409 auto findSection = [&](uint64_t addr) {
410 if (section && addr >= sectionAddr &&
411 addr < sectionAddr + section->getSize())
412 return true;
413
414 if (obj.sections.empty())
415 return false;
416 auto secIt = std::prev(x: llvm::upper_bound(
417 Range: obj.sections, Value&: addr,
418 C: [](uint64_t off, const Section *sec) { return off < sec->addr; }));
419 const Section *sec = *secIt;
420
421 if (sec->subsections.empty())
422 return false;
423 auto subsecIt = std::prev(x: llvm::upper_bound(
424 Range: sec->subsections, Value: addr - sec->addr,
425 C: [](uint64_t off, Subsection subsec) { return off < subsec.offset; }));
426 const Subsection &subsec = *subsecIt;
427 const ConcatInputSection *isec =
428 dyn_cast_or_null<ConcatInputSection>(Val: subsec.isec);
429 if (!isec || isec->shouldOmitFromOutput())
430 return false;
431
432 section = isec;
433 sectionAddr = subsec.offset + sec->addr;
434 buf = outBuf + section->outSecOff + section->parent->fileOff;
435 return true;
436 };
437
438 auto isValidOffset = [&](uint64_t offset) {
439 if (offset < sectionAddr || offset >= sectionAddr + section->getSize()) {
440 error(msg: toString(file: &obj) +
441 ": linker optimization hint spans multiple sections");
442 return false;
443 }
444 return true;
445 };
446
447 bool hasAdrpAdrp = false;
448 forEachHint(data, callback: [&](uint64_t kind, ArrayRef<uint64_t> args) {
449 if (kind == LOH_ARM64_ADRP_ADRP) {
450 hasAdrpAdrp = true;
451 return;
452 }
453
454 if (!findSection(args[0]))
455 return;
456 switch (kind) {
457 case LOH_ARM64_ADRP_ADD:
458 if (isValidOffset(args[1]))
459 applyAdrpAdd(buf, isec: section, offset1: args[0] - sectionAddr,
460 offset2: args[1] - sectionAddr);
461 break;
462 case LOH_ARM64_ADRP_LDR:
463 if (isValidOffset(args[1]))
464 applyAdrpLdr(buf, isec: section, offset1: args[0] - sectionAddr,
465 offset2: args[1] - sectionAddr);
466 break;
467 case LOH_ARM64_ADRP_LDR_GOT:
468 if (isValidOffset(args[1]))
469 applyAdrpLdrGot(buf, isec: section, offset1: args[0] - sectionAddr,
470 offset2: args[1] - sectionAddr);
471 break;
472 case LOH_ARM64_ADRP_ADD_LDR:
473 if (isValidOffset(args[1]) && isValidOffset(args[2]))
474 applyAdrpAddLdr(buf, isec: section, offset1: args[0] - sectionAddr,
475 offset2: args[1] - sectionAddr, offset3: args[2] - sectionAddr);
476 break;
477 case LOH_ARM64_ADRP_LDR_GOT_LDR:
478 if (isValidOffset(args[1]) && isValidOffset(args[2]))
479 applyAdrpLdrGotLdr(buf, isec: section, offset1: args[0] - sectionAddr,
480 offset2: args[1] - sectionAddr, offset3: args[2] - sectionAddr);
481 break;
482 case LOH_ARM64_ADRP_ADD_STR:
483 case LOH_ARM64_ADRP_LDR_GOT_STR:
484 // TODO: Implement these
485 break;
486 }
487 });
488
489 if (!hasAdrpAdrp)
490 return;
491
492 // AdrpAdrp optimization hints are performed in a second pass because they
493 // might interfere with other transformations. For instance, consider the
494 // following input:
495 //
496 // adrp x0, _foo@PAGE
497 // add x1, x0, _foo@PAGEOFF
498 // adrp x0, _bar@PAGE
499 // add x2, x0, _bar@PAGEOFF
500 //
501 // If we perform the AdrpAdrp relaxation first, we get:
502 //
503 // adrp x0, _foo@PAGE
504 // add x1, x0, _foo@PAGEOFF
505 // nop
506 // add x2, x0, _bar@PAGEOFF
507 //
508 // If we then apply AdrpAdd to the first two instructions, the add will have a
509 // garbage value in x0:
510 //
511 // adr x1, _foo
512 // nop
513 // nop
514 // add x2, x0, _bar@PAGEOFF
515 forEachHint(data, callback: [&](uint64_t kind, ArrayRef<uint64_t> args) {
516 if (kind != LOH_ARM64_ADRP_ADRP)
517 return;
518 if (!findSection(args[0]))
519 return;
520 if (isValidOffset(args[1]))
521 applyAdrpAdrp(buf, isec: section, offset1: args[0] - sectionAddr, offset2: args[1] - sectionAddr);
522 });
523}
524