1//===- AArch64.cpp --------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "InputFiles.h"
10#include "OutputSections.h"
11#include "Symbols.h"
12#include "SyntheticSections.h"
13#include "Target.h"
14#include "TargetImpl.h"
15#include "llvm/BinaryFormat/ELF.h"
16#include "llvm/Support/Endian.h"
17
18using namespace llvm;
19using namespace llvm::support::endian;
20using namespace llvm::ELF;
21using namespace lld;
22using namespace lld::elf;
23
24// Page(Expr) is the page address of the expression Expr, defined
25// as (Expr & ~0xFFF). (This applies even if the machine page size
26// supported by the platform has a different value.)
27uint64_t elf::getAArch64Page(uint64_t expr) {
28 return expr & ~static_cast<uint64_t>(0xFFF);
29}
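// For illustration (not part of the linker logic): getAArch64Page(0x21234)
// returns 0x21000, so an ADRP/ADD pair materialises an address as
// Page(S + A) - Page(P) in the ADRP plus the low 12 bits (here 0x234) in the
// ADD.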
30
// A BTI landing pad is a valid target for an indirect branch when Branch
// Target Identification has been enabled. As linker-generated branches go
// via x16, the BTI landing pads are defined as: BTI C, BTI J, BTI JC,
// PACIASP, PACIBSP.
35bool elf::isAArch64BTILandingPad(Ctx &ctx, Symbol &s, int64_t a) {
36 // PLT entries accessed indirectly have a BTI c.
37 if (s.isInPlt(ctx))
38 return true;
  Defined *d = dyn_cast<Defined>(&s);
  if (!isa_and_nonnull<InputSection>(d->section))
    // All places that we cannot disassemble are responsible for making
    // the target a BTI landing pad.
    return true;
  InputSection *isec = cast<InputSection>(d->section);
45 uint64_t off = d->value + a;
46 // Likely user error, but protect ourselves against out of bounds
47 // access.
48 if (off >= isec->getSize())
49 return true;
  const uint8_t *buf = isec->content().begin();
  const uint32_t instr = read32le(buf + off);
  // All BTI instructions are HINT instructions, which all have the same
  // encoding apart from bits [11:5].
  if ((instr & 0xd503201f) == 0xd503201f &&
      is_contained({/*PACIASP*/ 0xd503233f, /*PACIBSP*/ 0xd503237f,
                    /*BTI C*/ 0xd503245f, /*BTI J*/ 0xd503249f,
                    /*BTI JC*/ 0xd50324df},
                   instr))
59 return true;
60 return false;
61}
62
63namespace {
64class AArch64 : public TargetInfo {
65public:
66 AArch64(Ctx &);
67 RelExpr getRelExpr(RelType type, const Symbol &s,
68 const uint8_t *loc) const override;
69 RelType getDynRel(RelType type) const override;
70 int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override;
71 void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
72 void writeIgotPlt(uint8_t *buf, const Symbol &s) const override;
73 void writePltHeader(uint8_t *buf) const override;
74 void writePlt(uint8_t *buf, const Symbol &sym,
75 uint64_t pltEntryAddr) const override;
76 bool needsThunk(RelExpr expr, RelType type, const InputFile *file,
77 uint64_t branchAddr, const Symbol &s,
78 int64_t a) const override;
79 uint32_t getThunkSectionSpacing() const override;
80 bool inBranchRange(RelType type, uint64_t src, uint64_t dst) const override;
81 bool usesOnlyLowPageBits(RelType type) const override;
82 void relocate(uint8_t *loc, const Relocation &rel,
83 uint64_t val) const override;
84 RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override;
85 void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override;
86 void applyBranchToBranchOpt() const override;
87
88private:
89 void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
90 void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
91 void relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
92};
93
94struct AArch64Relaxer {
95 Ctx &ctx;
96 bool safeToRelaxAdrpLdr = false;
97
98 AArch64Relaxer(Ctx &ctx, ArrayRef<Relocation> relocs);
99 bool tryRelaxAdrpAdd(const Relocation &adrpRel, const Relocation &addRel,
100 uint64_t secAddr, uint8_t *buf) const;
101 bool tryRelaxAdrpLdr(const Relocation &adrpRel, const Relocation &ldrRel,
102 uint64_t secAddr, uint8_t *buf) const;
103};
104} // namespace
105
// Return bits [start, end] (inclusive) of val, shifted right by start bits.
// For instance, getBits(0xF0, 4, 8) returns 0xF.
108static uint64_t getBits(uint64_t val, int start, int end) {
109 uint64_t mask = ((uint64_t)1 << (end + 1 - start)) - 1;
110 return (val >> start) & mask;
111}
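// For illustration: getBits(read32le(buf), 5, 20) builds the mask
// (1 << 16) - 1 = 0xFFFF and returns the 16-bit field starting at bit 5,
// which is how the MOVW immediates are extracted below.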
112
113AArch64::AArch64(Ctx &ctx) : TargetInfo(ctx) {
114 copyRel = R_AARCH64_COPY;
115 relativeRel = R_AARCH64_RELATIVE;
116 iRelativeRel = R_AARCH64_IRELATIVE;
117 gotRel = R_AARCH64_GLOB_DAT;
118 pltRel = R_AARCH64_JUMP_SLOT;
119 symbolicRel = R_AARCH64_ABS64;
120 tlsDescRel = R_AARCH64_TLSDESC;
121 tlsGotRel = R_AARCH64_TLS_TPREL64;
122 pltHeaderSize = 32;
123 pltEntrySize = 16;
124 ipltEntrySize = 16;
125 defaultMaxPageSize = 65536;
126
127 // Align to the 2 MiB page size (known as a superpage or huge page).
128 // FreeBSD automatically promotes 2 MiB-aligned allocations.
129 defaultImageBase = 0x200000;
130
131 needsThunks = true;
132}
133
134RelExpr AArch64::getRelExpr(RelType type, const Symbol &s,
135 const uint8_t *loc) const {
136 switch (type) {
137 case R_AARCH64_ABS16:
138 case R_AARCH64_ABS32:
139 case R_AARCH64_ABS64:
140 case R_AARCH64_ADD_ABS_LO12_NC:
141 case R_AARCH64_LDST128_ABS_LO12_NC:
142 case R_AARCH64_LDST16_ABS_LO12_NC:
143 case R_AARCH64_LDST32_ABS_LO12_NC:
144 case R_AARCH64_LDST64_ABS_LO12_NC:
145 case R_AARCH64_LDST8_ABS_LO12_NC:
146 case R_AARCH64_MOVW_SABS_G0:
147 case R_AARCH64_MOVW_SABS_G1:
148 case R_AARCH64_MOVW_SABS_G2:
149 case R_AARCH64_MOVW_UABS_G0:
150 case R_AARCH64_MOVW_UABS_G0_NC:
151 case R_AARCH64_MOVW_UABS_G1:
152 case R_AARCH64_MOVW_UABS_G1_NC:
153 case R_AARCH64_MOVW_UABS_G2:
154 case R_AARCH64_MOVW_UABS_G2_NC:
155 case R_AARCH64_MOVW_UABS_G3:
156 return R_ABS;
157 case R_AARCH64_AUTH_ABS64:
158 return RE_AARCH64_AUTH;
159 case R_AARCH64_TLSDESC_ADR_PAGE21:
160 return RE_AARCH64_TLSDESC_PAGE;
161 case R_AARCH64_AUTH_TLSDESC_ADR_PAGE21:
162 return RE_AARCH64_AUTH_TLSDESC_PAGE;
163 case R_AARCH64_TLSDESC_LD64_LO12:
164 case R_AARCH64_TLSDESC_ADD_LO12:
165 return R_TLSDESC;
166 case R_AARCH64_AUTH_TLSDESC_LD64_LO12:
167 case R_AARCH64_AUTH_TLSDESC_ADD_LO12:
168 return RE_AARCH64_AUTH_TLSDESC;
169 case R_AARCH64_TLSDESC_CALL:
170 return R_TLSDESC_CALL;
171 case R_AARCH64_TLSLE_ADD_TPREL_HI12:
172 case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
173 case R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC:
174 case R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC:
175 case R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC:
176 case R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC:
177 case R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC:
178 case R_AARCH64_TLSLE_MOVW_TPREL_G0:
179 case R_AARCH64_TLSLE_MOVW_TPREL_G0_NC:
180 case R_AARCH64_TLSLE_MOVW_TPREL_G1:
181 case R_AARCH64_TLSLE_MOVW_TPREL_G1_NC:
182 case R_AARCH64_TLSLE_MOVW_TPREL_G2:
183 return R_TPREL;
184 case R_AARCH64_CALL26:
185 case R_AARCH64_CONDBR19:
186 case R_AARCH64_JUMP26:
187 case R_AARCH64_TSTBR14:
188 return R_PLT_PC;
189 case R_AARCH64_PLT32:
190 const_cast<Symbol &>(s).thunkAccessed = true;
191 return R_PLT_PC;
192 case R_AARCH64_PREL16:
193 case R_AARCH64_PREL32:
194 case R_AARCH64_PREL64:
195 case R_AARCH64_ADR_PREL_LO21:
196 case R_AARCH64_LD_PREL_LO19:
197 case R_AARCH64_MOVW_PREL_G0:
198 case R_AARCH64_MOVW_PREL_G0_NC:
199 case R_AARCH64_MOVW_PREL_G1:
200 case R_AARCH64_MOVW_PREL_G1_NC:
201 case R_AARCH64_MOVW_PREL_G2:
202 case R_AARCH64_MOVW_PREL_G2_NC:
203 case R_AARCH64_MOVW_PREL_G3:
204 return R_PC;
205 case R_AARCH64_ADR_PREL_PG_HI21:
206 case R_AARCH64_ADR_PREL_PG_HI21_NC:
207 return RE_AARCH64_PAGE_PC;
208 case R_AARCH64_LD64_GOT_LO12_NC:
209 case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
210 return R_GOT;
211 case R_AARCH64_AUTH_LD64_GOT_LO12_NC:
212 case R_AARCH64_AUTH_GOT_ADD_LO12_NC:
213 return RE_AARCH64_AUTH_GOT;
214 case R_AARCH64_AUTH_GOT_LD_PREL19:
215 case R_AARCH64_AUTH_GOT_ADR_PREL_LO21:
216 return RE_AARCH64_AUTH_GOT_PC;
217 case R_AARCH64_LD64_GOTPAGE_LO15:
218 return RE_AARCH64_GOT_PAGE;
219 case R_AARCH64_ADR_GOT_PAGE:
220 case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21:
221 return RE_AARCH64_GOT_PAGE_PC;
222 case R_AARCH64_AUTH_ADR_GOT_PAGE:
223 return RE_AARCH64_AUTH_GOT_PAGE_PC;
224 case R_AARCH64_GOTPCREL32:
225 case R_AARCH64_GOT_LD_PREL19:
226 return R_GOT_PC;
227 case R_AARCH64_NONE:
228 return R_NONE;
229 default:
230 Err(ctx) << getErrorLoc(ctx, loc) << "unknown relocation (" << type.v
231 << ") against symbol " << &s;
232 return R_NONE;
233 }
234}
235
236RelExpr AArch64::adjustTlsExpr(RelType type, RelExpr expr) const {
237 if (expr == R_RELAX_TLS_GD_TO_IE) {
238 if (type == R_AARCH64_TLSDESC_ADR_PAGE21)
239 return RE_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC;
240 return R_RELAX_TLS_GD_TO_IE_ABS;
241 }
242 return expr;
243}
244
245bool AArch64::usesOnlyLowPageBits(RelType type) const {
246 switch (type) {
247 default:
248 return false;
249 case R_AARCH64_ADD_ABS_LO12_NC:
250 case R_AARCH64_LD64_GOT_LO12_NC:
251 case R_AARCH64_LDST128_ABS_LO12_NC:
252 case R_AARCH64_LDST16_ABS_LO12_NC:
253 case R_AARCH64_LDST32_ABS_LO12_NC:
254 case R_AARCH64_LDST64_ABS_LO12_NC:
255 case R_AARCH64_LDST8_ABS_LO12_NC:
256 case R_AARCH64_TLSDESC_ADD_LO12:
257 case R_AARCH64_TLSDESC_LD64_LO12:
258 case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
259 return true;
260 }
261}
262
263RelType AArch64::getDynRel(RelType type) const {
264 if (type == R_AARCH64_ABS64 || type == R_AARCH64_AUTH_ABS64)
265 return type;
266 return R_AARCH64_NONE;
267}
268
269int64_t AArch64::getImplicitAddend(const uint8_t *buf, RelType type) const {
270 switch (type) {
271 case R_AARCH64_TLSDESC:
272 return read64(ctx, p: buf + 8);
273 case R_AARCH64_NONE:
274 case R_AARCH64_GLOB_DAT:
275 case R_AARCH64_AUTH_GLOB_DAT:
276 case R_AARCH64_JUMP_SLOT:
277 return 0;
278 case R_AARCH64_ABS16:
279 case R_AARCH64_PREL16:
280 return SignExtend64<16>(x: read16(ctx, p: buf));
281 case R_AARCH64_ABS32:
282 case R_AARCH64_PREL32:
283 return SignExtend64<32>(x: read32(ctx, p: buf));
284 case R_AARCH64_ABS64:
285 case R_AARCH64_PREL64:
286 case R_AARCH64_RELATIVE:
287 case R_AARCH64_IRELATIVE:
288 case R_AARCH64_TLS_TPREL64:
289 return read64(ctx, p: buf);
290
291 // The following relocation types all point at instructions, and
292 // relocate an immediate field in the instruction.
293 //
294 // The general rule, from AAELF64 §5.7.2 "Addends and PC-bias",
295 // says: "If the relocation relocates an instruction the immediate
296 // field of the instruction is extracted, scaled as required by
297 // the instruction field encoding, and sign-extended to 64 bits".
298
299 // The R_AARCH64_MOVW family operates on wide MOV/MOVK/MOVZ
300 // instructions, which have a 16-bit immediate field with its low
301 // bit in bit 5 of the instruction encoding. When the immediate
302 // field is used as an implicit addend for REL-type relocations,
303 // it is treated as added to the low bits of the output value, not
304 // shifted depending on the relocation type.
305 //
306 // This allows REL relocations to express the requirement 'please
307 // add 12345 to this symbol value and give me the four 16-bit
308 // chunks of the result', by putting the same addend 12345 in all
309 // four instructions. Carries between the 16-bit chunks are
310 // handled correctly, because the whole 64-bit addition is done
311 // once per relocation.
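  //
  // For example (illustrative): if all four MOVZ/MOVK instructions that
  // build a 64-bit constant carry the same implicit addend 12345, each
  // relocation independently computes S + 12345, and relocate() then stores
  // the appropriate G0/G1/G2/G3 16-bit chunk of that full sum, so carries
  // between chunks come out right.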
312 case R_AARCH64_MOVW_UABS_G0:
313 case R_AARCH64_MOVW_UABS_G0_NC:
314 case R_AARCH64_MOVW_UABS_G1:
315 case R_AARCH64_MOVW_UABS_G1_NC:
316 case R_AARCH64_MOVW_UABS_G2:
317 case R_AARCH64_MOVW_UABS_G2_NC:
318 case R_AARCH64_MOVW_UABS_G3:
319 return SignExtend64<16>(x: getBits(val: read32le(P: buf), start: 5, end: 20));
320
321 // R_AARCH64_TSTBR14 points at a TBZ or TBNZ instruction, which
322 // has a 14-bit offset measured in instructions, i.e. shifted left
323 // by 2.
324 case R_AARCH64_TSTBR14:
325 return SignExtend64<16>(x: getBits(val: read32le(P: buf), start: 5, end: 18) << 2);
326
327 // R_AARCH64_CONDBR19 operates on the ordinary B.cond instruction,
328 // which has a 19-bit offset measured in instructions.
329 //
330 // R_AARCH64_LD_PREL_LO19 operates on the LDR (literal)
331 // instruction, which also has a 19-bit offset, measured in 4-byte
332 // chunks. So the calculation is the same as for
333 // R_AARCH64_CONDBR19.
334 case R_AARCH64_CONDBR19:
335 case R_AARCH64_LD_PREL_LO19:
336 return SignExtend64<21>(x: getBits(val: read32le(P: buf), start: 5, end: 23) << 2);
337
338 // R_AARCH64_ADD_ABS_LO12_NC operates on ADD (immediate). The
339 // immediate can optionally be shifted left by 12 bits, but this
340 // relocation is intended for the case where it is not.
341 case R_AARCH64_ADD_ABS_LO12_NC:
342 return SignExtend64<12>(x: getBits(val: read32le(P: buf), start: 10, end: 21));
343
344 // R_AARCH64_ADR_PREL_LO21 operates on an ADR instruction, whose
345 // 21-bit immediate is split between two bits high up in the word
346 // (in fact the two _lowest_ order bits of the value) and 19 bits
347 // lower down.
348 //
349 // R_AARCH64_ADR_PREL_PG_HI21[_NC] operate on an ADRP instruction,
350 // which encodes the immediate in the same way, but will shift it
351 // left by 12 bits when the instruction executes. For the same
352 // reason as the MOVW family, we don't apply that left shift here.
353 case R_AARCH64_ADR_PREL_LO21:
354 case R_AARCH64_ADR_PREL_PG_HI21:
355 case R_AARCH64_ADR_PREL_PG_HI21_NC:
356 return SignExtend64<21>(x: (getBits(val: read32le(P: buf), start: 5, end: 23) << 2) |
357 getBits(val: read32le(P: buf), start: 29, end: 30));
358
359 // R_AARCH64_{JUMP,CALL}26 operate on B and BL, which have a
360 // 26-bit offset measured in instructions.
361 case R_AARCH64_JUMP26:
362 case R_AARCH64_CALL26:
363 return SignExtend64<28>(x: getBits(val: read32le(P: buf), start: 0, end: 25) << 2);
364
365 default:
366 InternalErr(ctx, buf) << "cannot read addend for relocation " << type;
367 return 0;
368 }
369}
370
371void AArch64::writeGotPlt(uint8_t *buf, const Symbol &) const {
372 write64(ctx, p: buf, v: ctx.in.plt->getVA());
373}
374
375void AArch64::writeIgotPlt(uint8_t *buf, const Symbol &s) const {
376 if (ctx.arg.writeAddends)
377 write64(ctx, p: buf, v: s.getVA(ctx));
378}
379
380void AArch64::writePltHeader(uint8_t *buf) const {
381 const uint8_t pltData[] = {
382 0xf0, 0x7b, 0xbf, 0xa9, // stp x16, x30, [sp,#-16]!
383 0x10, 0x00, 0x00, 0x90, // adrp x16, Page(&(.got.plt[2]))
384 0x11, 0x02, 0x40, 0xf9, // ldr x17, [x16, Offset(&(.got.plt[2]))]
385 0x10, 0x02, 0x00, 0x91, // add x16, x16, Offset(&(.got.plt[2]))
386 0x20, 0x02, 0x1f, 0xd6, // br x17
387 0x1f, 0x20, 0x03, 0xd5, // nop
388 0x1f, 0x20, 0x03, 0xd5, // nop
389 0x1f, 0x20, 0x03, 0xd5 // nop
390 };
391 memcpy(dest: buf, src: pltData, n: sizeof(pltData));
392
393 uint64_t got = ctx.in.gotPlt->getVA();
394 uint64_t plt = ctx.in.plt->getVA();
395 relocateNoSym(loc: buf + 4, type: R_AARCH64_ADR_PREL_PG_HI21,
396 val: getAArch64Page(expr: got + 16) - getAArch64Page(expr: plt + 4));
397 relocateNoSym(loc: buf + 8, type: R_AARCH64_LDST64_ABS_LO12_NC, val: got + 16);
398 relocateNoSym(loc: buf + 12, type: R_AARCH64_ADD_ABS_LO12_NC, val: got + 16);
399}
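// For illustration only, with hypothetical addresses: if .plt were at 0x210000
// and .got.plt at 0x230000, the adrp above would receive
// Page(0x230010) - Page(0x210004) = 0x20000 and the ldr/add would receive the
// low 12 bits of &(.got.plt[2]), i.e. 0x010.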
400
401void AArch64::writePlt(uint8_t *buf, const Symbol &sym,
402 uint64_t pltEntryAddr) const {
403 const uint8_t inst[] = {
404 0x10, 0x00, 0x00, 0x90, // adrp x16, Page(&(.got.plt[n]))
405 0x11, 0x02, 0x40, 0xf9, // ldr x17, [x16, Offset(&(.got.plt[n]))]
406 0x10, 0x02, 0x00, 0x91, // add x16, x16, Offset(&(.got.plt[n]))
407 0x20, 0x02, 0x1f, 0xd6 // br x17
408 };
409 memcpy(dest: buf, src: inst, n: sizeof(inst));
410
411 uint64_t gotPltEntryAddr = sym.getGotPltVA(ctx);
412 relocateNoSym(loc: buf, type: R_AARCH64_ADR_PREL_PG_HI21,
413 val: getAArch64Page(expr: gotPltEntryAddr) - getAArch64Page(expr: pltEntryAddr));
414 relocateNoSym(loc: buf + 4, type: R_AARCH64_LDST64_ABS_LO12_NC, val: gotPltEntryAddr);
415 relocateNoSym(loc: buf + 8, type: R_AARCH64_ADD_ABS_LO12_NC, val: gotPltEntryAddr);
416}
417
418bool AArch64::needsThunk(RelExpr expr, RelType type, const InputFile *file,
419 uint64_t branchAddr, const Symbol &s,
420 int64_t a) const {
  // If s is an undefined weak symbol and does not have a PLT entry then it
  // will be resolved as a branch to the next instruction. If it is hidden, its
  // binding has been converted to local, so we just check isUndefined() here.
  // An undefined non-weak symbol will have been reported as an error already.
425 if (s.isUndefined() && !s.isInPlt(ctx))
426 return false;
  // ELF for the Arm 64-bit Architecture, section "Call and Jump relocations",
  // only permits range extension thunks for the R_AARCH64_CALL26 and
  // R_AARCH64_JUMP26 relocation types.
430 if (type != R_AARCH64_CALL26 && type != R_AARCH64_JUMP26 &&
431 type != R_AARCH64_PLT32)
432 return false;
433 uint64_t dst = expr == R_PLT_PC ? s.getPltVA(ctx) : s.getVA(ctx, addend: a);
434 return !inBranchRange(type, src: branchAddr, dst);
435}
436
uint32_t AArch64::getThunkSectionSpacing() const {
  // See comment in Arch/ARM.cpp for a more detailed explanation of
  // getThunkSectionSpacing(). For AArch64 the only branches we are permitted
  // to thunk have a range of +/- 128 MiB.
  return (128 * 1024 * 1024) - 0x30000;
}
443
444bool AArch64::inBranchRange(RelType type, uint64_t src, uint64_t dst) const {
445 if (type != R_AARCH64_CALL26 && type != R_AARCH64_JUMP26 &&
446 type != R_AARCH64_PLT32)
447 return true;
448 // The AArch64 call and unconditional branch instructions have a range of
449 // +/- 128 MiB. The PLT32 relocation supports a range up to +/- 2 GiB.
450 uint64_t range =
451 type == R_AARCH64_PLT32 ? (UINT64_C(1) << 31) : (128 * 1024 * 1024);
452 if (dst > src) {
453 // Immediate of branch is signed.
454 range -= 4;
455 return dst - src <= range;
456 }
457 return src - dst <= range;
458}
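// For illustration: with R_AARCH64_CALL26, a forward branch is in range when
// dst - src <= 128 MiB - 4 (the positive half of the signed 28-bit offset),
// and a backward branch when src - dst <= 128 MiB; R_AARCH64_PLT32 extends
// this to +/- 2 GiB.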
459
460static void write32AArch64Addr(uint8_t *l, uint64_t imm) {
461 uint32_t immLo = (imm & 0x3) << 29;
462 uint32_t immHi = (imm & 0x1FFFFC) << 3;
463 uint64_t mask = (0x3 << 29) | (0x1FFFFC << 3);
464 write32le(P: l, V: (read32le(P: l) & ~mask) | immLo | immHi);
465}
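// For illustration: for imm = 0x12345, immLo places the low two bits (0b01) in
// bits [30:29] and immHi places bits [20:2] of imm (0x48D1) in bits [23:5],
// matching the split immediate encoding of ADR and ADRP.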
466
467static void writeMaskedBits32le(uint8_t *p, int32_t v, uint32_t mask) {
468 write32le(P: p, V: (read32le(P: p) & ~mask) | v);
469}
470
// Update the immediate field in an AArch64 ldr, str, or add instruction.
472static void write32Imm12(uint8_t *l, uint64_t imm) {
473 writeMaskedBits32le(p: l, v: (imm & 0xFFF) << 10, mask: 0xFFF << 10);
474}
475
476// Update the immediate field in an AArch64 movk, movn or movz instruction
477// for a signed relocation, and update the opcode of a movn or movz instruction
478// to match the sign of the operand.
479static void writeSMovWImm(uint8_t *loc, uint32_t imm) {
480 uint32_t inst = read32le(P: loc);
481 // Opcode field is bits 30, 29, with 10 = movz, 00 = movn and 11 = movk.
482 if (!(inst & (1 << 29))) {
483 // movn or movz.
484 if (imm & 0x10000) {
485 // Change opcode to movn, which takes an inverted operand.
486 imm ^= 0xFFFF;
487 inst &= ~(1 << 30);
488 } else {
489 // Change opcode to movz.
490 inst |= 1 << 30;
491 }
492 }
493 write32le(P: loc, V: inst | ((imm & 0xFFFF) << 5));
494}
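// For illustration: relocating R_AARCH64_MOVW_SABS_G0 with val = -3 reaches
// this helper with imm = 0xfffffffd; bit 16 is set, so the opcode is switched
// to MOVN and the stored immediate becomes (imm ^ 0xFFFF) & 0xFFFF = 2, and
// MOVN Xd, #2 materialises ~2 = -3 as required.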
495
496void AArch64::relocate(uint8_t *loc, const Relocation &rel,
497 uint64_t val) const {
498 switch (rel.type) {
499 case R_AARCH64_ABS16:
500 case R_AARCH64_PREL16:
501 checkIntUInt(ctx, loc, v: val, n: 16, rel);
502 write16(ctx, p: loc, v: val);
503 break;
504 case R_AARCH64_ABS32:
505 case R_AARCH64_PREL32:
506 checkIntUInt(ctx, loc, v: val, n: 32, rel);
507 write32(ctx, p: loc, v: val);
508 break;
509 case R_AARCH64_PLT32:
510 case R_AARCH64_GOTPCREL32:
511 checkInt(ctx, loc, v: val, n: 32, rel);
512 write32(ctx, p: loc, v: val);
513 break;
514 case R_AARCH64_ABS64:
    // AArch64 relocations to tagged symbols have extended semantics, as
    // described here:
    // https://github.com/ARM-software/abi-aa/blob/main/memtagabielf64/memtagabielf64.rst#841extended-semantics-of-r_aarch64_relative.
    // In short: encode the symbol's special addend in the place, as an offset
    // to the point the logical tag is derived from. As a quick hack, if the
    // addend is within the symbol's bounds, there is no need to encode the
    // tag derivation offset.
522 if (rel.sym && rel.sym->isTagged() &&
523 (rel.addend < 0 ||
524 rel.addend >= static_cast<int64_t>(rel.sym->getSize())))
525 write64(ctx, p: loc, v: -rel.addend);
526 else
527 write64(ctx, p: loc, v: val);
528 break;
529 case R_AARCH64_PREL64:
530 write64(ctx, p: loc, v: val);
531 break;
532 case R_AARCH64_AUTH_ABS64:
533 // If val is wider than 32 bits, the relocation must have been moved from
534 // .relr.auth.dyn to .rela.dyn, and the addend write is not needed.
535 //
536 // If val fits in 32 bits, we have two potential scenarios:
537 // * True RELR: Write the 32-bit `val`.
538 // * RELA: Even if the value now fits in 32 bits, it might have been
539 // converted from RELR during an iteration in
540 // finalizeAddressDependentContent(). Writing the value is harmless
541 // because dynamic linking ignores it.
542 if (isInt<32>(x: val))
543 write32(ctx, p: loc, v: val);
544 break;
545 case R_AARCH64_ADD_ABS_LO12_NC:
546 case R_AARCH64_AUTH_GOT_ADD_LO12_NC:
547 write32Imm12(l: loc, imm: val);
548 break;
549 case R_AARCH64_ADR_GOT_PAGE:
550 case R_AARCH64_AUTH_ADR_GOT_PAGE:
551 case R_AARCH64_ADR_PREL_PG_HI21:
552 case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21:
553 case R_AARCH64_TLSDESC_ADR_PAGE21:
554 case R_AARCH64_AUTH_TLSDESC_ADR_PAGE21:
555 checkInt(ctx, loc, v: val, n: 33, rel);
556 [[fallthrough]];
557 case R_AARCH64_ADR_PREL_PG_HI21_NC:
558 write32AArch64Addr(l: loc, imm: val >> 12);
559 break;
560 case R_AARCH64_ADR_PREL_LO21:
561 case R_AARCH64_AUTH_GOT_ADR_PREL_LO21:
562 checkInt(ctx, loc, v: val, n: 21, rel);
563 write32AArch64Addr(l: loc, imm: val);
564 break;
565 case R_AARCH64_JUMP26:
    // Normally we would just write the bits of the immediate field. However,
    // when patching instructions for the CPU errata fix --fix-cortex-a53-843419
    // we want to replace a non-branch instruction with a branch immediate
    // instruction. By writing all the bits of the instruction, including the
    // opcode and the immediate (0 001 | 01 imm26), we can do this
    // transformation by placing an R_AARCH64_JUMP26 relocation at the offset
    // of the instruction we want to patch.
573 write32le(P: loc, V: 0x14000000);
574 [[fallthrough]];
575 case R_AARCH64_CALL26:
576 checkInt(ctx, loc, v: val, n: 28, rel);
577 writeMaskedBits32le(p: loc, v: (val & 0x0FFFFFFC) >> 2, mask: 0x0FFFFFFC >> 2);
578 break;
579 case R_AARCH64_CONDBR19:
580 case R_AARCH64_LD_PREL_LO19:
581 case R_AARCH64_GOT_LD_PREL19:
582 case R_AARCH64_AUTH_GOT_LD_PREL19:
583 checkAlignment(ctx, loc, v: val, n: 4, rel);
584 checkInt(ctx, loc, v: val, n: 21, rel);
585 writeMaskedBits32le(p: loc, v: (val & 0x1FFFFC) << 3, mask: 0x1FFFFC << 3);
586 break;
587 case R_AARCH64_LDST8_ABS_LO12_NC:
588 case R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC:
589 write32Imm12(l: loc, imm: getBits(val, start: 0, end: 11));
590 break;
591 case R_AARCH64_LDST16_ABS_LO12_NC:
592 case R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC:
593 checkAlignment(ctx, loc, v: val, n: 2, rel);
594 write32Imm12(l: loc, imm: getBits(val, start: 1, end: 11));
595 break;
596 case R_AARCH64_LDST32_ABS_LO12_NC:
597 case R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC:
598 checkAlignment(ctx, loc, v: val, n: 4, rel);
599 write32Imm12(l: loc, imm: getBits(val, start: 2, end: 11));
600 break;
601 case R_AARCH64_LDST64_ABS_LO12_NC:
602 case R_AARCH64_LD64_GOT_LO12_NC:
603 case R_AARCH64_AUTH_LD64_GOT_LO12_NC:
604 case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
605 case R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC:
606 case R_AARCH64_TLSDESC_LD64_LO12:
607 case R_AARCH64_AUTH_TLSDESC_LD64_LO12:
608 checkAlignment(ctx, loc, v: val, n: 8, rel);
609 write32Imm12(l: loc, imm: getBits(val, start: 3, end: 11));
610 break;
611 case R_AARCH64_LDST128_ABS_LO12_NC:
612 case R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC:
613 checkAlignment(ctx, loc, v: val, n: 16, rel);
614 write32Imm12(l: loc, imm: getBits(val, start: 4, end: 11));
615 break;
616 case R_AARCH64_LD64_GOTPAGE_LO15:
617 checkAlignment(ctx, loc, v: val, n: 8, rel);
618 write32Imm12(l: loc, imm: getBits(val, start: 3, end: 14));
619 break;
620 case R_AARCH64_MOVW_UABS_G0:
621 checkUInt(ctx, loc, v: val, n: 16, rel);
622 [[fallthrough]];
623 case R_AARCH64_MOVW_UABS_G0_NC:
624 writeMaskedBits32le(p: loc, v: (val & 0xFFFF) << 5, mask: 0xFFFF << 5);
625 break;
626 case R_AARCH64_MOVW_UABS_G1:
627 checkUInt(ctx, loc, v: val, n: 32, rel);
628 [[fallthrough]];
629 case R_AARCH64_MOVW_UABS_G1_NC:
630 writeMaskedBits32le(p: loc, v: (val & 0xFFFF0000) >> 11, mask: 0xFFFF0000 >> 11);
631 break;
632 case R_AARCH64_MOVW_UABS_G2:
633 checkUInt(ctx, loc, v: val, n: 48, rel);
634 [[fallthrough]];
635 case R_AARCH64_MOVW_UABS_G2_NC:
636 writeMaskedBits32le(p: loc, v: (val & 0xFFFF00000000) >> 27,
637 mask: 0xFFFF00000000 >> 27);
638 break;
639 case R_AARCH64_MOVW_UABS_G3:
640 writeMaskedBits32le(p: loc, v: (val & 0xFFFF000000000000) >> 43,
641 mask: 0xFFFF000000000000 >> 43);
642 break;
643 case R_AARCH64_MOVW_PREL_G0:
644 case R_AARCH64_MOVW_SABS_G0:
645 case R_AARCH64_TLSLE_MOVW_TPREL_G0:
646 checkInt(ctx, loc, v: val, n: 17, rel);
647 [[fallthrough]];
648 case R_AARCH64_MOVW_PREL_G0_NC:
649 case R_AARCH64_TLSLE_MOVW_TPREL_G0_NC:
650 writeSMovWImm(loc, imm: val);
651 break;
652 case R_AARCH64_MOVW_PREL_G1:
653 case R_AARCH64_MOVW_SABS_G1:
654 case R_AARCH64_TLSLE_MOVW_TPREL_G1:
655 checkInt(ctx, loc, v: val, n: 33, rel);
656 [[fallthrough]];
657 case R_AARCH64_MOVW_PREL_G1_NC:
658 case R_AARCH64_TLSLE_MOVW_TPREL_G1_NC:
659 writeSMovWImm(loc, imm: val >> 16);
660 break;
661 case R_AARCH64_MOVW_PREL_G2:
662 case R_AARCH64_MOVW_SABS_G2:
663 case R_AARCH64_TLSLE_MOVW_TPREL_G2:
664 checkInt(ctx, loc, v: val, n: 49, rel);
665 [[fallthrough]];
666 case R_AARCH64_MOVW_PREL_G2_NC:
667 writeSMovWImm(loc, imm: val >> 32);
668 break;
669 case R_AARCH64_MOVW_PREL_G3:
670 writeSMovWImm(loc, imm: val >> 48);
671 break;
672 case R_AARCH64_TSTBR14:
673 checkInt(ctx, loc, v: val, n: 16, rel);
674 writeMaskedBits32le(p: loc, v: (val & 0xFFFC) << 3, mask: 0xFFFC << 3);
675 break;
676 case R_AARCH64_TLSLE_ADD_TPREL_HI12:
677 checkUInt(ctx, loc, v: val, n: 24, rel);
678 write32Imm12(l: loc, imm: val >> 12);
679 break;
680 case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
681 case R_AARCH64_TLSDESC_ADD_LO12:
682 case R_AARCH64_AUTH_TLSDESC_ADD_LO12:
683 write32Imm12(l: loc, imm: val);
684 break;
685 case R_AARCH64_TLSDESC:
686 // For R_AARCH64_TLSDESC the addend is stored in the second 64-bit word.
687 write64(ctx, p: loc + 8, v: val);
688 break;
689 default:
690 llvm_unreachable("unknown relocation");
691 }
692}
693
694void AArch64::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel,
695 uint64_t val) const {
  // TLSDESC Global-Dynamic relocations are of the form:
  //   adrp    x0, :tlsdesc:v             [R_AARCH64_TLSDESC_ADR_PAGE21]
  //   ldr     x1, [x0, #:tlsdesc_lo12:v] [R_AARCH64_TLSDESC_LD64_LO12]
  //   add     x0, x0, :tlsdesc_lo12:v    [R_AARCH64_TLSDESC_ADD_LO12]
  //   .tlsdesccall                       [R_AARCH64_TLSDESC_CALL]
  //   blr     x1
  // And it can be optimized to:
  //   movz    x0, #0x0, lsl #16
  //   movk    x0, #0x10
  //   nop
  //   nop
707 checkUInt(ctx, loc, v: val, n: 32, rel);
708
709 switch (rel.type) {
710 case R_AARCH64_TLSDESC_ADD_LO12:
711 case R_AARCH64_TLSDESC_CALL:
712 write32le(P: loc, V: 0xd503201f); // nop
713 return;
714 case R_AARCH64_TLSDESC_ADR_PAGE21:
715 write32le(P: loc, V: 0xd2a00000 | (((val >> 16) & 0xffff) << 5)); // movz
716 return;
717 case R_AARCH64_TLSDESC_LD64_LO12:
718 write32le(P: loc, V: 0xf2800000 | ((val & 0xffff) << 5)); // movk
719 return;
720 default:
721 llvm_unreachable("unsupported relocation for TLS GD to LE relaxation");
722 }
723}
724
725void AArch64::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel,
726 uint64_t val) const {
  // TLSDESC Global-Dynamic relocations are of the form:
  //   adrp    x0, :tlsdesc:v             [R_AARCH64_TLSDESC_ADR_PAGE21]
  //   ldr     x1, [x0, #:tlsdesc_lo12:v] [R_AARCH64_TLSDESC_LD64_LO12]
  //   add     x0, x0, :tlsdesc_lo12:v    [R_AARCH64_TLSDESC_ADD_LO12]
  //   .tlsdesccall                       [R_AARCH64_TLSDESC_CALL]
  //   blr     x1
  // And it can be optimized to:
  //   adrp    x0, :gottprel:v
  //   ldr     x0, [x0, :gottprel_lo12:v]
  //   nop
  //   nop
738
739 switch (rel.type) {
740 case R_AARCH64_TLSDESC_ADD_LO12:
741 case R_AARCH64_TLSDESC_CALL:
742 write32le(P: loc, V: 0xd503201f); // nop
743 break;
744 case R_AARCH64_TLSDESC_ADR_PAGE21:
745 write32le(P: loc, V: 0x90000000); // adrp
746 relocateNoSym(loc, type: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21, val);
747 break;
748 case R_AARCH64_TLSDESC_LD64_LO12:
749 write32le(P: loc, V: 0xf9400000); // ldr
750 relocateNoSym(loc, type: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC, val);
751 break;
752 default:
    llvm_unreachable("unsupported relocation for TLS GD to IE relaxation");
754 }
755}
756
757void AArch64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel,
758 uint64_t val) const {
759 checkUInt(ctx, loc, v: val, n: 32, rel);
760
761 if (rel.type == R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21) {
762 // Generate MOVZ.
763 uint32_t regNo = read32le(P: loc) & 0x1f;
764 write32le(P: loc, V: (0xd2a00000 | regNo) | (((val >> 16) & 0xffff) << 5));
765 return;
766 }
767 if (rel.type == R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC) {
768 // Generate MOVK.
769 uint32_t regNo = read32le(P: loc) & 0x1f;
770 write32le(P: loc, V: (0xf2800000 | regNo) | ((val & 0xffff) << 5));
771 return;
772 }
773 llvm_unreachable("invalid relocation for TLS IE to LE relaxation");
774}
775
776AArch64Relaxer::AArch64Relaxer(Ctx &ctx, ArrayRef<Relocation> relocs)
777 : ctx(ctx) {
778 if (!ctx.arg.relax)
779 return;
780 // Check if R_AARCH64_ADR_GOT_PAGE and R_AARCH64_LD64_GOT_LO12_NC
781 // always appear in pairs.
782 size_t i = 0;
783 const size_t size = relocs.size();
784 for (; i != size; ++i) {
785 if (relocs[i].type == R_AARCH64_ADR_GOT_PAGE) {
786 if (i + 1 < size && relocs[i + 1].type == R_AARCH64_LD64_GOT_LO12_NC) {
787 ++i;
788 continue;
789 }
790 break;
791 } else if (relocs[i].type == R_AARCH64_LD64_GOT_LO12_NC) {
792 break;
793 }
794 }
795 safeToRelaxAdrpLdr = i == size;
796}
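// For illustration: a section whose GOT relocations come strictly as
// (R_AARCH64_ADR_GOT_PAGE, R_AARCH64_LD64_GOT_LO12_NC) pairs keeps
// safeToRelaxAdrpLdr true; a stray LO12 relocation without its preceding
// ADR_GOT_PAGE disables the ADRP+LDR relaxation for the whole section.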
797
798bool AArch64Relaxer::tryRelaxAdrpAdd(const Relocation &adrpRel,
799 const Relocation &addRel, uint64_t secAddr,
800 uint8_t *buf) const {
801 // When the address of sym is within the range of ADR then
802 // we may relax
803 // ADRP xn, sym
804 // ADD xn, xn, :lo12: sym
805 // to
806 // NOP
807 // ADR xn, sym
808 if (!ctx.arg.relax || adrpRel.type != R_AARCH64_ADR_PREL_PG_HI21 ||
809 addRel.type != R_AARCH64_ADD_ABS_LO12_NC)
810 return false;
811 // Check if the relocations apply to consecutive instructions.
812 if (adrpRel.offset + 4 != addRel.offset)
813 return false;
814 if (adrpRel.sym != addRel.sym)
815 return false;
816 if (adrpRel.addend != 0 || addRel.addend != 0)
817 return false;
818
819 uint32_t adrpInstr = read32le(P: buf + adrpRel.offset);
820 uint32_t addInstr = read32le(P: buf + addRel.offset);
821 // Check if the first instruction is ADRP and the second instruction is ADD.
822 if ((adrpInstr & 0x9f000000) != 0x90000000 ||
823 (addInstr & 0xffc00000) != 0x91000000)
824 return false;
825 uint32_t adrpDestReg = adrpInstr & 0x1f;
826 uint32_t addDestReg = addInstr & 0x1f;
827 uint32_t addSrcReg = (addInstr >> 5) & 0x1f;
828 if (adrpDestReg != addDestReg || adrpDestReg != addSrcReg)
829 return false;
830
831 Symbol &sym = *adrpRel.sym;
832 // Check if the address difference is within 1MiB range.
833 int64_t val = sym.getVA(ctx) - (secAddr + addRel.offset);
834 if (val < -1024 * 1024 || val >= 1024 * 1024)
835 return false;
836
  Relocation adrRel = {R_ABS, R_AARCH64_ADR_PREL_LO21, addRel.offset,
                       /*addend=*/0, &sym};
  // nop
  write32le(buf + adrpRel.offset, 0xd503201f);
  // adr x_<dest_reg>
  write32le(buf + adrRel.offset, 0x10000000 | adrpDestReg);
  ctx.target->relocate(buf + adrRel.offset, adrRel, val);
844 return true;
845}
846
847bool AArch64Relaxer::tryRelaxAdrpLdr(const Relocation &adrpRel,
848 const Relocation &ldrRel, uint64_t secAddr,
849 uint8_t *buf) const {
850 if (!safeToRelaxAdrpLdr)
851 return false;
852
  // When the definition of sym is not preemptible then we may
  // be able to relax
  //   ADRP xn, :got: sym
  //   LDR  xn, [ xn :got_lo12: sym]
  // to
  //   ADRP xn, sym
  //   ADD  xn, xn, :lo12: sym
860
861 if (adrpRel.type != R_AARCH64_ADR_GOT_PAGE ||
862 ldrRel.type != R_AARCH64_LD64_GOT_LO12_NC)
863 return false;
864 // Check if the relocations apply to consecutive instructions.
865 if (adrpRel.offset + 4 != ldrRel.offset)
866 return false;
867 // Check if the relocations reference the same symbol and
868 // skip undefined, preemptible and STT_GNU_IFUNC symbols.
869 if (!adrpRel.sym || adrpRel.sym != ldrRel.sym || !adrpRel.sym->isDefined() ||
870 adrpRel.sym->isPreemptible || adrpRel.sym->isGnuIFunc())
871 return false;
  // Check if the addends of both relocations are zero.
873 if (adrpRel.addend != 0 || ldrRel.addend != 0)
874 return false;
875 uint32_t adrpInstr = read32le(P: buf + adrpRel.offset);
876 uint32_t ldrInstr = read32le(P: buf + ldrRel.offset);
877 // Check if the first instruction is ADRP and the second instruction is LDR.
878 if ((adrpInstr & 0x9f000000) != 0x90000000 ||
879 (ldrInstr & 0x3b000000) != 0x39000000)
880 return false;
881 // Check the value of the sf bit.
882 if (!(ldrInstr >> 31))
883 return false;
884 uint32_t adrpDestReg = adrpInstr & 0x1f;
885 uint32_t ldrDestReg = ldrInstr & 0x1f;
886 uint32_t ldrSrcReg = (ldrInstr >> 5) & 0x1f;
  // Check if ADRP and LDR use the same register.
888 if (adrpDestReg != ldrDestReg || adrpDestReg != ldrSrcReg)
889 return false;
890
891 Symbol &sym = *adrpRel.sym;
892 // GOT references to absolute symbols can't be relaxed to use ADRP/ADD in
893 // position-independent code because these instructions produce a relative
894 // address.
895 if (ctx.arg.isPic && !cast<Defined>(Val&: sym).section)
896 return false;
897 // Check if the address difference is within 4GB range.
898 int64_t val =
899 getAArch64Page(expr: sym.getVA(ctx)) - getAArch64Page(expr: secAddr + adrpRel.offset);
900 if (val != llvm::SignExtend64(X: val, B: 33))
901 return false;
902
  Relocation adrpSymRel = {RE_AARCH64_PAGE_PC, R_AARCH64_ADR_PREL_PG_HI21,
                           adrpRel.offset, /*addend=*/0, &sym};
  Relocation addRel = {R_ABS, R_AARCH64_ADD_ABS_LO12_NC, ldrRel.offset,
                       /*addend=*/0, &sym};

  // adrp x_<dest_reg>
  write32le(buf + adrpSymRel.offset, 0x90000000 | adrpDestReg);
  // add x_<dest_reg>, x_<dest_reg>
  write32le(buf + addRel.offset, 0x91000000 | adrpDestReg | (adrpDestReg << 5));

  ctx.target->relocate(
      buf + adrpSymRel.offset, adrpSymRel,
      SignExtend64(getAArch64Page(sym.getVA(ctx)) -
                       getAArch64Page(secAddr + adrpSymRel.offset),
                   64));
  ctx.target->relocate(buf + addRel.offset, addRel,
                       SignExtend64(sym.getVA(ctx), 64));
  tryRelaxAdrpAdd(adrpSymRel, addRel, secAddr, buf);
921 return true;
922}
923
924// Tagged symbols have upper address bits that are added by the dynamic loader,
925// and thus need the full 64-bit GOT entry. Do not relax such symbols.
926static bool needsGotForMemtag(const Relocation &rel) {
927 return rel.sym->isTagged() && needsGot(expr: rel.expr);
928}
929
930void AArch64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
931 uint64_t secAddr = sec.getOutputSection()->addr;
932 if (auto *s = dyn_cast<InputSection>(Val: &sec))
933 secAddr += s->outSecOff;
934 else if (auto *ehIn = dyn_cast<EhInputSection>(Val: &sec))
935 secAddr += ehIn->getParent()->outSecOff;
936 AArch64Relaxer relaxer(ctx, sec.relocs());
937 for (size_t i = 0, size = sec.relocs().size(); i != size; ++i) {
938 const Relocation &rel = sec.relocs()[i];
939 uint8_t *loc = buf + rel.offset;
940 const uint64_t val = sec.getRelocTargetVA(ctx, r: rel, p: secAddr + rel.offset);
941
942 if (needsGotForMemtag(rel)) {
943 relocate(loc, rel, val);
944 continue;
945 }
946
947 switch (rel.expr) {
948 case RE_AARCH64_GOT_PAGE_PC:
949 if (i + 1 < size &&
950 relaxer.tryRelaxAdrpLdr(adrpRel: rel, ldrRel: sec.relocs()[i + 1], secAddr, buf)) {
951 ++i;
952 continue;
953 }
954 break;
955 case RE_AARCH64_PAGE_PC:
956 if (i + 1 < size &&
957 relaxer.tryRelaxAdrpAdd(adrpRel: rel, addRel: sec.relocs()[i + 1], secAddr, buf)) {
958 ++i;
959 continue;
960 }
961 break;
962 case RE_AARCH64_RELAX_TLS_GD_TO_IE_PAGE_PC:
963 case R_RELAX_TLS_GD_TO_IE_ABS:
964 relaxTlsGdToIe(loc, rel, val);
965 continue;
966 case R_RELAX_TLS_GD_TO_LE:
967 relaxTlsGdToLe(loc, rel, val);
968 continue;
969 case R_RELAX_TLS_IE_TO_LE:
970 relaxTlsIeToLe(loc, rel, val);
971 continue;
972 default:
973 break;
974 }
975 relocate(loc, rel, val);
976 }
977}
978
979static std::optional<uint64_t> getControlTransferAddend(InputSection &is,
980 Relocation &r) {
  // Identify a control transfer relocation for the branch-to-branch
  // optimization. A "control transfer relocation" usually means a B or BL
  // target, but it also includes relative vtable relocations, for example.
984 //
985 // We require the relocation type to be JUMP26, CALL26 or PLT32. With a
986 // relocation type of PLT32 the value may be assumed to be used for branching
987 // directly to the symbol and the addend is only used to produce the relocated
988 // value (hence the effective addend is always 0). This is because if a PLT is
989 // needed the addend will be added to the address of the PLT, and it doesn't
990 // make sense to branch into the middle of a PLT. For example, relative vtable
991 // relocations use PLT32 and 0 or a positive value as the addend but still are
992 // used to branch to the symbol.
993 //
994 // With JUMP26 or CALL26 the only reasonable interpretation of a non-zero
995 // addend is that we are branching to symbol+addend so that becomes the
996 // effective addend.
997 if (r.type == R_AARCH64_PLT32)
998 return 0;
999 if (r.type == R_AARCH64_JUMP26 || r.type == R_AARCH64_CALL26)
1000 return r.addend;
1001 return std::nullopt;
1002}
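// For illustration: a BL carrying addend 4 transfers control to sym+4, so 4 is
// the effective addend; with R_AARCH64_PLT32 (used, e.g., by relative vtables)
// any addend only adjusts the stored value, and the branch target is taken to
// be the symbol itself, hence an effective addend of 0.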
1003
1004static std::pair<Relocation *, uint64_t>
1005getBranchInfoAtTarget(InputSection &is, uint64_t offset) {
1006 auto *i = llvm::partition_point(
1007 Range&: is.relocations, P: [&](Relocation &r) { return r.offset < offset; });
1008 if (i != is.relocations.end() && i->offset == offset &&
1009 i->type == R_AARCH64_JUMP26) {
1010 return {i, i->addend};
1011 }
1012 return {nullptr, 0};
1013}
1014
1015static void redirectControlTransferRelocations(Relocation &r1,
1016 const Relocation &r2) {
1017 r1.expr = r2.expr;
1018 r1.sym = r2.sym;
1019 // With PLT32 we must respect the original addend as that affects the value's
1020 // interpretation. With the other relocation types the original addend is
1021 // irrelevant because it referred to an offset within the original target
1022 // section so we overwrite it.
1023 if (r1.type == R_AARCH64_PLT32)
1024 r1.addend += r2.addend;
1025 else
1026 r1.addend = r2.addend;
1027}
1028
1029void AArch64::applyBranchToBranchOpt() const {
1030 applyBranchToBranchOptImpl(ctx, getControlTransferAddend,
1031 getBranchInfoAtTarget,
1032 redirectControlTransferRelocations);
1033}
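// For illustration: if the resolved destination of a BL is itself an
// unconditional B (an R_AARCH64_JUMP26 at that exact offset of the target
// section), the BL's relocation is redirected to the B's own target, saving
// one branch at run time.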
1034
// AArch64 may use security features in variant PLT sequences. These are:
// Pointer Authentication (PAC), introduced in armv8.3-a, and Branch Target
// Indicator (BTI), introduced in armv8.5-a. The additional instructions used
// in the variant PLT sequences are encoded in the HINT space so they can be
// deployed on older architectures, which treat the instructions as a NOP.
// PAC and BTI can be combined leading to the following combinations:
// writePltHeader
// writePltHeaderBti (no PAC Header needed)
// writePlt
// writePltBti (BTI only)
// writePltPac (PAC only)
// writePltBtiPac (BTI and PAC)
//
// When PAC is enabled the dynamic loader signs the address that it places
// in the .got.plt using the pacia1716 instruction, which signs the value in
// x17 using the modifier in x16. The static linker places autia1716 before the
// indirect branch to x17 to authenticate the address in x17 with the modifier
// in x16. This makes it more difficult for an attacker to modify the value in
// the .got.plt.
//
// When BTI is enabled all indirect branches must land on a bti instruction.
// The static linker must place a bti instruction at the start of any PLT entry
// that may be the target of an indirect branch. As the PLT entries call the
// lazy resolver indirectly, the PLT header must have a bti instruction at its
// start. In general a bti instruction is not needed for a PLT entry, as
// indirect calls are resolved to the function address and not the PLT entry
// for the function. There are a small number of cases where the PLT address
// can escape, such as taking the address of a function or ifunc via a
// non-GOT-generating relocation when a shared library refers to that symbol.
//
// We use the bti c variant of the instruction, which permits indirect branches
// (br) via x16/x17 and indirect function calls (blr) via any register. The ABI
// guarantees that all indirect branches from code requiring BTI protection
// will go via x16/x17.
1069
1070namespace {
1071class AArch64BtiPac final : public AArch64 {
1072public:
1073 AArch64BtiPac(Ctx &);
1074 void writePltHeader(uint8_t *buf) const override;
1075 void writePlt(uint8_t *buf, const Symbol &sym,
1076 uint64_t pltEntryAddr) const override;
1077
1078private:
1079 bool btiHeader; // bti instruction needed in PLT Header and Entry
1080 enum {
1081 PEK_NoAuth,
1082 PEK_AuthHint, // use autia1716 instr for authenticated branch in PLT entry
1083 PEK_Auth, // use braa instr for authenticated branch in PLT entry
1084 } pacEntryKind;
1085};
1086} // namespace
1087
1088AArch64BtiPac::AArch64BtiPac(Ctx &ctx) : AArch64(ctx) {
1089 btiHeader = (ctx.arg.andFeatures & GNU_PROPERTY_AARCH64_FEATURE_1_BTI);
  // A BTI (Branch Target Indicator) PLT entry is only required if the
  // address of the PLT entry can be taken by the program, which permits an
  // indirect jump to the PLT entry. This can happen when the address of the
  // PLT entry for a function is canonicalised due to the address of the
  // function in an executable being taken by a shared library, or when a
  // non-preemptible ifunc is referenced by non-GOT-generating,
  // non-PLT-generating relocations.
1097 // The PAC PLT entries require dynamic loader support and this isn't known
1098 // from properties in the objects, so we use the command line flag.
1099 // By default we only use hint-space instructions, but if we detect the
1100 // PAuthABI, which requires v8.3-A, we can use the non-hint space
1101 // instructions.
1102
1103 if (ctx.arg.zPacPlt) {
1104 if (ctx.aarch64PauthAbiCoreInfo && ctx.aarch64PauthAbiCoreInfo->isValid())
1105 pacEntryKind = PEK_Auth;
1106 else
1107 pacEntryKind = PEK_AuthHint;
1108 } else {
1109 pacEntryKind = PEK_NoAuth;
1110 }
1111
1112 if (btiHeader || (pacEntryKind != PEK_NoAuth)) {
1113 pltEntrySize = 24;
1114 ipltEntrySize = 24;
1115 }
1116}
1117
1118void AArch64BtiPac::writePltHeader(uint8_t *buf) const {
1119 const uint8_t btiData[] = { 0x5f, 0x24, 0x03, 0xd5 }; // bti c
1120 const uint8_t pltData[] = {
1121 0xf0, 0x7b, 0xbf, 0xa9, // stp x16, x30, [sp,#-16]!
1122 0x10, 0x00, 0x00, 0x90, // adrp x16, Page(&(.got.plt[2]))
1123 0x11, 0x02, 0x40, 0xf9, // ldr x17, [x16, Offset(&(.got.plt[2]))]
1124 0x10, 0x02, 0x00, 0x91, // add x16, x16, Offset(&(.got.plt[2]))
1125 0x20, 0x02, 0x1f, 0xd6, // br x17
1126 0x1f, 0x20, 0x03, 0xd5, // nop
1127 0x1f, 0x20, 0x03, 0xd5 // nop
1128 };
1129 const uint8_t nopData[] = { 0x1f, 0x20, 0x03, 0xd5 }; // nop
1130
1131 uint64_t got = ctx.in.gotPlt->getVA();
1132 uint64_t plt = ctx.in.plt->getVA();
1133
1134 if (btiHeader) {
1135 // PltHeader is called indirectly by plt[N]. Prefix pltData with a BTI C
1136 // instruction.
1137 memcpy(dest: buf, src: btiData, n: sizeof(btiData));
1138 buf += sizeof(btiData);
1139 plt += sizeof(btiData);
1140 }
1141 memcpy(dest: buf, src: pltData, n: sizeof(pltData));
1142
1143 relocateNoSym(loc: buf + 4, type: R_AARCH64_ADR_PREL_PG_HI21,
1144 val: getAArch64Page(expr: got + 16) - getAArch64Page(expr: plt + 4));
1145 relocateNoSym(loc: buf + 8, type: R_AARCH64_LDST64_ABS_LO12_NC, val: got + 16);
1146 relocateNoSym(loc: buf + 12, type: R_AARCH64_ADD_ABS_LO12_NC, val: got + 16);
1147 if (!btiHeader)
1148 // We didn't add the BTI c instruction so round out size with NOP.
1149 memcpy(dest: buf + sizeof(pltData), src: nopData, n: sizeof(nopData));
1150}
1151
1152void AArch64BtiPac::writePlt(uint8_t *buf, const Symbol &sym,
1153 uint64_t pltEntryAddr) const {
1154 // The PLT entry is of the form:
1155 // [btiData] addrInst (pacBr | stdBr) [nopData]
1156 const uint8_t btiData[] = { 0x5f, 0x24, 0x03, 0xd5 }; // bti c
1157 const uint8_t addrInst[] = {
1158 0x10, 0x00, 0x00, 0x90, // adrp x16, Page(&(.got.plt[n]))
1159 0x11, 0x02, 0x40, 0xf9, // ldr x17, [x16, Offset(&(.got.plt[n]))]
1160 0x10, 0x02, 0x00, 0x91 // add x16, x16, Offset(&(.got.plt[n]))
1161 };
1162 const uint8_t pacHintBr[] = {
1163 0x9f, 0x21, 0x03, 0xd5, // autia1716
1164 0x20, 0x02, 0x1f, 0xd6 // br x17
1165 };
1166 const uint8_t pacBr[] = {
1167 0x30, 0x0a, 0x1f, 0xd7, // braa x17, x16
1168 0x1f, 0x20, 0x03, 0xd5 // nop
1169 };
1170 const uint8_t stdBr[] = {
1171 0x20, 0x02, 0x1f, 0xd6, // br x17
1172 0x1f, 0x20, 0x03, 0xd5 // nop
1173 };
1174 const uint8_t nopData[] = { 0x1f, 0x20, 0x03, 0xd5 }; // nop
1175
  // NEEDS_COPY indicates a non-ifunc canonical PLT entry whose address may
  // escape to shared objects. isInIplt indicates a non-preemptible ifunc,
  // whose address may escape if referenced by a direct relocation. If relative
  // vtables are used and the vtable is in a shared object, the offsets will be
  // to the PLT entry. The condition is conservative.
1181 bool hasBti = btiHeader &&
1182 (sym.hasFlag(bit: NEEDS_COPY) || sym.isInIplt || sym.thunkAccessed);
1183 if (hasBti) {
1184 memcpy(dest: buf, src: btiData, n: sizeof(btiData));
1185 buf += sizeof(btiData);
1186 pltEntryAddr += sizeof(btiData);
1187 }
1188
1189 uint64_t gotPltEntryAddr = sym.getGotPltVA(ctx);
1190 memcpy(dest: buf, src: addrInst, n: sizeof(addrInst));
1191 relocateNoSym(loc: buf, type: R_AARCH64_ADR_PREL_PG_HI21,
1192 val: getAArch64Page(expr: gotPltEntryAddr) - getAArch64Page(expr: pltEntryAddr));
1193 relocateNoSym(loc: buf + 4, type: R_AARCH64_LDST64_ABS_LO12_NC, val: gotPltEntryAddr);
1194 relocateNoSym(loc: buf + 8, type: R_AARCH64_ADD_ABS_LO12_NC, val: gotPltEntryAddr);
1195
1196 if (pacEntryKind != PEK_NoAuth)
1197 memcpy(dest: buf + sizeof(addrInst),
1198 src: pacEntryKind == PEK_AuthHint ? pacHintBr : pacBr,
1199 n: sizeof(pacEntryKind == PEK_AuthHint ? pacHintBr : pacBr));
1200 else
1201 memcpy(dest: buf + sizeof(addrInst), src: stdBr, n: sizeof(stdBr));
1202 if (!hasBti)
1203 // We didn't add the BTI c instruction so round out size with NOP.
1204 memcpy(dest: buf + sizeof(addrInst) + sizeof(stdBr), src: nopData, n: sizeof(nopData));
1205}
1206
1207template <class ELFT>
1208static void
1209addTaggedSymbolReferences(Ctx &ctx, InputSectionBase &sec,
1210 DenseMap<Symbol *, unsigned> &referenceCount) {
1211 assert(sec.type == SHT_AARCH64_MEMTAG_GLOBALS_STATIC);
1212
1213 const RelsOrRelas<ELFT> rels = sec.relsOrRelas<ELFT>();
1214 if (rels.areRelocsRel())
1215 ErrAlways(ctx)
1216 << "non-RELA relocations are not allowed with memtag globals";
1217
1218 for (const typename ELFT::Rela &rel : rels.relas) {
1219 Symbol &sym = sec.file->getRelocTargetSym(rel);
    // Linker-synthesized symbols such as __executable_start may be referenced
    // as tagged in input objfiles, and we don't want them to be tagged. A
    // cheap way to exclude them is the type check, as their type is
    // STT_NOTYPE. In addition, this saves us from checking untaggable symbols,
    // like functions or TLS symbols.
1225 if (sym.type != STT_OBJECT)
1226 continue;
1227 // STB_LOCAL symbols can't be referenced from outside the object file, and
1228 // thus don't need to be checked for references from other object files.
1229 if (sym.binding == STB_LOCAL) {
1230 sym.setIsTagged(true);
1231 continue;
1232 }
1233 ++referenceCount[&sym];
1234 }
1235 sec.markDead();
1236}
1237
1238// A tagged symbol must be denoted as being tagged by all references and the
1239// chosen definition. For simplicity, here, it must also be denoted as tagged
1240// for all definitions. Otherwise:
1241//
1242// 1. A tagged definition can be used by an untagged declaration, in which case
1243// the untagged access may be PC-relative, causing a tag mismatch at
1244// runtime.
1245// 2. An untagged definition can be used by a tagged declaration, where the
1246// compiler has taken advantage of the increased alignment of the tagged
1247// declaration, but the alignment at runtime is wrong, causing a fault.
1248//
1249// Ideally, this isn't a problem, as any TU that imports or exports tagged
1250// symbols should also be built with tagging. But, to handle these cases, we
1251// demote the symbol to be untagged.
1252void elf::createTaggedSymbols(Ctx &ctx) {
1253 assert(hasMemtag(ctx));
1254
1255 // First, collect all symbols that are marked as tagged, and count how many
1256 // times they're marked as tagged.
1257 DenseMap<Symbol *, unsigned> taggedSymbolReferenceCount;
1258 for (InputFile *file : ctx.objectFiles) {
1259 if (file->kind() != InputFile::ObjKind)
1260 continue;
1261 for (InputSectionBase *section : file->getSections()) {
1262 if (!section || section->type != SHT_AARCH64_MEMTAG_GLOBALS_STATIC ||
1263 section == &InputSection::discarded)
1264 continue;
1265 invokeELFT(addTaggedSymbolReferences, ctx, *section,
1266 taggedSymbolReferenceCount);
1267 }
1268 }
1269
  // Now, go through all the symbols. If the number of declarations and
  // definitions of a symbol exceeds the number of times it is marked as
  // tagged, it means we have an objfile that uses the untagged variant of the
  // symbol.
1274 for (InputFile *file : ctx.objectFiles) {
1275 if (file->kind() != InputFile::BinaryKind &&
1276 file->kind() != InputFile::ObjKind)
1277 continue;
1278
1279 for (Symbol *symbol : file->getSymbols()) {
1280 // See `addTaggedSymbolReferences` for more details.
1281 if (symbol->type != STT_OBJECT ||
1282 symbol->binding == STB_LOCAL)
1283 continue;
1284 auto it = taggedSymbolReferenceCount.find(Val: symbol);
1285 if (it == taggedSymbolReferenceCount.end()) continue;
1286 unsigned &remainingAllowedTaggedRefs = it->second;
1287 if (remainingAllowedTaggedRefs == 0) {
1288 taggedSymbolReferenceCount.erase(I: it);
1289 continue;
1290 }
1291 --remainingAllowedTaggedRefs;
1292 }
1293 }
1294
  // `addTaggedSymbolReferences` has already checked that we have RELA
  // relocations; the only other way to get written addends is with
  // --apply-dynamic-relocs.
1298 if (!taggedSymbolReferenceCount.empty() && ctx.arg.writeAddends)
1299 ErrAlways(ctx) << "--apply-dynamic-relocs cannot be used with MTE globals";
1300
  // Now, `taggedSymbolReferenceCount` should only contain symbols that are
  // defined as tagged exactly as many times as they are referenced, meaning
  // all uses are tagged.
1304 for (auto &[symbol, remainingTaggedRefs] : taggedSymbolReferenceCount) {
1305 assert(remainingTaggedRefs == 0 &&
1306 "Symbol is defined as tagged more times than it's used");
1307 symbol->setIsTagged(true);
1308 }
1309}
1310
1311void elf::setAArch64TargetInfo(Ctx &ctx) {
1312 if ((ctx.arg.andFeatures & GNU_PROPERTY_AARCH64_FEATURE_1_BTI) ||
1313 ctx.arg.zPacPlt)
1314 ctx.target.reset(p: new AArch64BtiPac(ctx));
1315 else
1316 ctx.target.reset(p: new AArch64(ctx));
1317}
1318