1 | //===- LoongArch.cpp ------------------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "InputFiles.h" |
10 | #include "OutputSections.h" |
11 | #include "Symbols.h" |
12 | #include "SyntheticSections.h" |
13 | #include "Target.h" |
14 | #include "llvm/BinaryFormat/ELF.h" |
15 | #include "llvm/Support/LEB128.h" |
16 | |
17 | using namespace llvm; |
18 | using namespace llvm::object; |
19 | using namespace llvm::support::endian; |
20 | using namespace llvm::ELF; |
21 | using namespace lld; |
22 | using namespace lld::elf; |
23 | |
24 | namespace { |
25 | class LoongArch final : public TargetInfo { |
26 | public: |
27 | LoongArch(); |
28 | uint32_t calcEFlags() const override; |
29 | int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override; |
30 | void writeGotPlt(uint8_t *buf, const Symbol &s) const override; |
31 | void writeIgotPlt(uint8_t *buf, const Symbol &s) const override; |
32 | void writePltHeader(uint8_t *buf) const override; |
33 | void writePlt(uint8_t *buf, const Symbol &sym, |
34 | uint64_t pltEntryAddr) const override; |
35 | RelType getDynRel(RelType type) const override; |
36 | RelExpr getRelExpr(RelType type, const Symbol &s, |
37 | const uint8_t *loc) const override; |
38 | bool usesOnlyLowPageBits(RelType type) const override; |
39 | void relocate(uint8_t *loc, const Relocation &rel, |
40 | uint64_t val) const override; |
41 | bool relaxOnce(int pass) const override; |
42 | void finalizeRelax(int passes) const override; |
43 | }; |
44 | } // end anonymous namespace |
45 | |
namespace {
// Instruction words that insn() combines with its operand fields when this
// file synthesizes code (PLT header, PLT entries and the trap instruction).
enum Op {
  SUB_W = 0x00110000,
  SUB_D = 0x00118000,
  BREAK = 0x002a0000,
  SRLI_W = 0x00448000,
  SRLI_D = 0x00450000,
  ADDI_W = 0x02800000,
  ADDI_D = 0x02c00000,
  ANDI = 0x03400000,
  PCADDU12I = 0x1c000000,
  LD_W = 0x28800000,
  LD_D = 0x28c00000,
  JIRL = 0x4c000000,
};

// Register numbers passed to insn() as operand fields.
enum Reg {
  R_ZERO = 0,
  R_RA = 1,
  R_TP = 2,
  R_T0 = 12,
  R_T1 = 13,
  R_T2 = 14,
  R_T3 = 15,
};
} // namespace
72 | |
// Return `p` with its lowest 12 bits cleared, i.e. the "page" that `pcalau12i`
// operates on. In sequences such as `pcalau12i + addi.[wd]` or
// `pcalau12i + {ld,st}.*`, the `pcalau12i` produces a PC-relative intermediate
// value whose lowest 12 bits are zero, and the following instruction adds the
// "page offset". (`pcalau12i` stands for something like "PC ALigned Add Upper
// that starts from the 12th bit, Immediate".)
//
// A "page" here is simply the 12-bit range covered by the immediate field of
// the addi/ld/st instructions; it is unrelated to the page size used by the
// system or the kernel. The semantics happen to match the AArch64 `adrp`,
// which is where the term is borrowed from.
static uint64_t getLoongArchPage(uint64_t p) {
  constexpr uint64_t pageOffsetMask = 0xfff;
  return p & ~pageOffsetMask;
}
87 | |
// Return the lowest 12 bits of `val`.
static uint32_t lo12(uint32_t val) {
  constexpr uint32_t low12Mask = (1u << 12) - 1;
  return val & low12Mask;
}
89 | |
90 | // Calculate the adjusted page delta between dest and PC. |
91 | uint64_t elf::getLoongArchPageDelta(uint64_t dest, uint64_t pc, RelType type) { |
92 | // Note that if the sequence being relocated is `pcalau12i + addi.d + lu32i.d |
93 | // + lu52i.d`, they must be adjacent so that we can infer the PC of |
94 | // `pcalau12i` when calculating the page delta for the other two instructions |
95 | // (lu32i.d and lu52i.d). Compensate all the sign-extensions is a bit |
96 | // complicated. Just use psABI recommended algorithm. |
97 | uint64_t pcalau12i_pc; |
98 | switch (type) { |
99 | case R_LARCH_PCALA64_LO20: |
100 | case R_LARCH_GOT64_PC_LO20: |
101 | case R_LARCH_TLS_IE64_PC_LO20: |
102 | case R_LARCH_TLS_DESC64_PC_LO20: |
103 | pcalau12i_pc = pc - 8; |
104 | break; |
105 | case R_LARCH_PCALA64_HI12: |
106 | case R_LARCH_GOT64_PC_HI12: |
107 | case R_LARCH_TLS_IE64_PC_HI12: |
108 | case R_LARCH_TLS_DESC64_PC_HI12: |
109 | pcalau12i_pc = pc - 12; |
110 | break; |
111 | default: |
112 | pcalau12i_pc = pc; |
113 | break; |
114 | } |
115 | uint64_t result = getLoongArchPage(p: dest) - getLoongArchPage(p: pcalau12i_pc); |
116 | if (dest & 0x800) |
117 | result += 0x1000 - 0x1'0000'0000; |
118 | if (result & 0x8000'0000) |
119 | result += 0x1'0000'0000; |
120 | return result; |
121 | } |
122 | |
// Return `val` biased by 0x800 and shifted right by 12, so the result is one
// higher than the plain upper 20 bits whenever bit 11 of `val` is set. The
// addition wraps modulo 2^32.
static uint32_t hi20(uint32_t val) {
  const uint32_t biased = val + 0x800;
  return biased >> 12;
}
124 | |
// Assemble an instruction word: `d` goes in at bit 0, `j` at bit 5 and `k` at
// bit 10, all OR-ed into `op`. Operands are not masked, so callers must pass
// values that fit their fields.
static uint32_t insn(uint32_t op, uint32_t d, uint32_t j, uint32_t k) {
  const uint32_t jField = j << 5;
  const uint32_t kField = k << 10;
  return op | d | jField | kField;
}
128 | |
// Extract bits v[begin:end], where range is inclusive.
//
// `begin` is the most significant bit and `end` the least significant bit of
// the range. The result is truncated to 32 bits. `begin == 63` is handled
// separately because building the mask would otherwise shift by 64.
static uint32_t extractBits(uint64_t v, uint32_t begin, uint32_t end) {
  return begin == 63 ? v >> end : (v & ((1ULL << (begin + 1)) - 1)) >> end;
}

// Place a 21-bit immediate into `insn`: imm[15:0] goes to insn[25:10] and
// imm[20:16] to insn[4:0]. All other bits of `insn` are kept.
static uint32_t setD5k16(uint32_t insn, uint32_t imm) {
  uint32_t immLo = extractBits(imm, 15, 0);
  uint32_t immHi = extractBits(imm, 20, 16);
  return (insn & 0xfc0003e0) | (immLo << 10) | immHi;
}

// Place a 26-bit immediate into `insn`: imm[15:0] goes to insn[25:10] and
// imm[25:16] to insn[9:0]. Only the top 6 bits of `insn` are kept.
static uint32_t setD10k16(uint32_t insn, uint32_t imm) {
  uint32_t immLo = extractBits(imm, 15, 0);
  uint32_t immHi = extractBits(imm, 25, 16);
  return (insn & 0xfc000000) | (immLo << 10) | immHi;
}

// Place imm[19:0] into insn[24:5]; all other bits of `insn` are kept.
static uint32_t setJ20(uint32_t insn, uint32_t imm) {
  return (insn & 0xfe00001f) | (extractBits(imm, 19, 0) << 5);
}

// Place imm[11:0] into insn[21:10]; all other bits of `insn` are kept.
static uint32_t setK12(uint32_t insn, uint32_t imm) {
  return (insn & 0xffc003ff) | (extractBits(imm, 11, 0) << 10);
}

// Place imm[15:0] into insn[25:10]; all other bits of `insn` are kept.
static uint32_t setK16(uint32_t insn, uint32_t imm) {
  return (insn & 0xfc0003ff) | (extractBits(imm, 15, 0) << 10);
}
157 | |
158 | static bool isJirl(uint32_t insn) { |
159 | return (insn & 0xfc000000) == JIRL; |
160 | } |
161 | |
162 | static void handleUleb128(uint8_t *loc, uint64_t val) { |
163 | const uint32_t maxcount = 1 + 64 / 7; |
164 | uint32_t count; |
165 | const char *error = nullptr; |
166 | uint64_t orig = decodeULEB128(p: loc, n: &count, end: nullptr, error: &error); |
167 | if (count > maxcount || (count == maxcount && error)) |
168 | errorOrWarn(msg: getErrorLocation(loc) + "extra space for uleb128" ); |
169 | uint64_t mask = count < maxcount ? (1ULL << 7 * count) - 1 : -1ULL; |
170 | encodeULEB128(Value: (orig + val) & mask, p: loc, PadTo: count); |
171 | } |
172 | |
173 | LoongArch::LoongArch() { |
174 | // The LoongArch ISA itself does not have a limit on page sizes. According to |
175 | // the ISA manual, the PS (page size) field in MTLB entries and CSR.STLBPS is |
176 | // 6 bits wide, meaning the maximum page size is 2^63 which is equivalent to |
177 | // "unlimited". |
178 | // However, practically the maximum usable page size is constrained by the |
179 | // kernel implementation, and 64KiB is the biggest non-huge page size |
180 | // supported by Linux as of v6.4. The most widespread page size in use, |
181 | // though, is 16KiB. |
182 | defaultCommonPageSize = 16384; |
183 | defaultMaxPageSize = 65536; |
184 | write32le(P: trapInstr.data(), V: BREAK); // break 0 |
185 | |
186 | copyRel = R_LARCH_COPY; |
187 | pltRel = R_LARCH_JUMP_SLOT; |
188 | relativeRel = R_LARCH_RELATIVE; |
189 | iRelativeRel = R_LARCH_IRELATIVE; |
190 | |
191 | if (config->is64) { |
192 | symbolicRel = R_LARCH_64; |
193 | tlsModuleIndexRel = R_LARCH_TLS_DTPMOD64; |
194 | tlsOffsetRel = R_LARCH_TLS_DTPREL64; |
195 | tlsGotRel = R_LARCH_TLS_TPREL64; |
196 | tlsDescRel = R_LARCH_TLS_DESC64; |
197 | } else { |
198 | symbolicRel = R_LARCH_32; |
199 | tlsModuleIndexRel = R_LARCH_TLS_DTPMOD32; |
200 | tlsOffsetRel = R_LARCH_TLS_DTPREL32; |
201 | tlsGotRel = R_LARCH_TLS_TPREL32; |
202 | tlsDescRel = R_LARCH_TLS_DESC32; |
203 | } |
204 | |
205 | gotRel = symbolicRel; |
206 | |
207 | // .got.plt[0] = _dl_runtime_resolve, .got.plt[1] = link_map |
208 | gotPltHeaderEntriesNum = 2; |
209 | |
210 | pltHeaderSize = 32; |
211 | pltEntrySize = 16; |
212 | ipltEntrySize = 16; |
213 | } |
214 | |
215 | static uint32_t getEFlags(const InputFile *f) { |
216 | if (config->is64) |
217 | return cast<ObjFile<ELF64LE>>(Val: f)->getObj().getHeader().e_flags; |
218 | return cast<ObjFile<ELF32LE>>(Val: f)->getObj().getHeader().e_flags; |
219 | } |
220 | |
221 | static bool inputFileHasCode(const InputFile *f) { |
222 | for (const auto *sec : f->getSections()) |
223 | if (sec && sec->flags & SHF_EXECINSTR) |
224 | return true; |
225 | |
226 | return false; |
227 | } |
228 | |
229 | uint32_t LoongArch::calcEFlags() const { |
230 | // If there are only binary input files (from -b binary), use a |
231 | // value of 0 for the ELF header flags. |
232 | if (ctx.objectFiles.empty()) |
233 | return 0; |
234 | |
235 | uint32_t target = 0; |
236 | const InputFile *targetFile; |
237 | for (const InputFile *f : ctx.objectFiles) { |
238 | // Do not enforce ABI compatibility if the input file does not contain code. |
239 | // This is useful for allowing linkage with data-only object files produced |
240 | // with tools like objcopy, that have zero e_flags. |
241 | if (!inputFileHasCode(f)) |
242 | continue; |
243 | |
244 | // Take the first non-zero e_flags as the reference. |
245 | uint32_t flags = getEFlags(f); |
246 | if (target == 0 && flags != 0) { |
247 | target = flags; |
248 | targetFile = f; |
249 | } |
250 | |
251 | if ((flags & EF_LOONGARCH_ABI_MODIFIER_MASK) != |
252 | (target & EF_LOONGARCH_ABI_MODIFIER_MASK)) |
253 | error(msg: toString(f) + |
254 | ": cannot link object files with different ABI from " + |
255 | toString(f: targetFile)); |
256 | |
257 | // We cannot process psABI v1.x / object ABI v0 files (containing stack |
258 | // relocations), unlike ld.bfd. |
259 | // |
260 | // Instead of blindly accepting every v0 object and only failing at |
261 | // relocation processing time, just disallow interlink altogether. We |
262 | // don't expect significant usage of object ABI v0 in the wild (the old |
263 | // world may continue using object ABI v0 for a while, but as it's not |
264 | // binary-compatible with the upstream i.e. new-world ecosystem, it's not |
265 | // being considered here). |
266 | // |
267 | // There are briefly some new-world systems with object ABI v0 binaries too. |
268 | // It is because these systems were built before the new ABI was finalized. |
269 | // These are not supported either due to the extremely small number of them, |
270 | // and the few impacted users are advised to simply rebuild world or |
271 | // reinstall a recent system. |
272 | if ((flags & EF_LOONGARCH_OBJABI_MASK) != EF_LOONGARCH_OBJABI_V1) |
273 | error(msg: toString(f) + ": unsupported object file ABI version" ); |
274 | } |
275 | |
276 | return target; |
277 | } |
278 | |
279 | int64_t LoongArch::getImplicitAddend(const uint8_t *buf, RelType type) const { |
280 | switch (type) { |
281 | default: |
282 | internalLinkerError(loc: getErrorLocation(loc: buf), |
283 | msg: "cannot read addend for relocation " + toString(type)); |
284 | return 0; |
285 | case R_LARCH_32: |
286 | case R_LARCH_TLS_DTPMOD32: |
287 | case R_LARCH_TLS_DTPREL32: |
288 | case R_LARCH_TLS_TPREL32: |
289 | return SignExtend64<32>(x: read32le(P: buf)); |
290 | case R_LARCH_64: |
291 | case R_LARCH_TLS_DTPMOD64: |
292 | case R_LARCH_TLS_DTPREL64: |
293 | case R_LARCH_TLS_TPREL64: |
294 | return read64le(P: buf); |
295 | case R_LARCH_RELATIVE: |
296 | case R_LARCH_IRELATIVE: |
297 | return config->is64 ? read64le(P: buf) : read32le(P: buf); |
298 | case R_LARCH_NONE: |
299 | case R_LARCH_JUMP_SLOT: |
300 | // These relocations are defined as not having an implicit addend. |
301 | return 0; |
302 | case R_LARCH_TLS_DESC32: |
303 | return read32le(P: buf + 4); |
304 | case R_LARCH_TLS_DESC64: |
305 | return read64le(P: buf + 8); |
306 | } |
307 | } |
308 | |
309 | void LoongArch::writeGotPlt(uint8_t *buf, const Symbol &s) const { |
310 | if (config->is64) |
311 | write64le(P: buf, V: in.plt->getVA()); |
312 | else |
313 | write32le(P: buf, V: in.plt->getVA()); |
314 | } |
315 | |
316 | void LoongArch::writeIgotPlt(uint8_t *buf, const Symbol &s) const { |
317 | if (config->writeAddends) { |
318 | if (config->is64) |
319 | write64le(P: buf, V: s.getVA()); |
320 | else |
321 | write32le(P: buf, V: s.getVA()); |
322 | } |
323 | } |
324 | |
325 | void LoongArch::(uint8_t *buf) const { |
326 | // The LoongArch PLT is currently structured just like that of RISCV. |
327 | // Annoyingly, this means the PLT is still using `pcaddu12i` to perform |
328 | // PC-relative addressing (because `pcaddu12i` is the same as RISCV `auipc`), |
329 | // in contrast to the AArch64-like page-offset scheme with `pcalau12i` that |
330 | // is used everywhere else involving PC-relative operations in the LoongArch |
331 | // ELF psABI v2.00. |
332 | // |
333 | // The `pcrel_{hi20,lo12}` operators are illustrative only and not really |
334 | // supported by LoongArch assemblers. |
335 | // |
336 | // pcaddu12i $t2, %pcrel_hi20(.got.plt) |
337 | // sub.[wd] $t1, $t1, $t3 |
338 | // ld.[wd] $t3, $t2, %pcrel_lo12(.got.plt) ; t3 = _dl_runtime_resolve |
339 | // addi.[wd] $t1, $t1, -pltHeaderSize-12 ; t1 = &.plt[i] - &.plt[0] |
340 | // addi.[wd] $t0, $t2, %pcrel_lo12(.got.plt) |
341 | // srli.[wd] $t1, $t1, (is64?1:2) ; t1 = &.got.plt[i] - &.got.plt[0] |
342 | // ld.[wd] $t0, $t0, Wordsize ; t0 = link_map |
343 | // jr $t3 |
344 | uint32_t offset = in.gotPlt->getVA() - in.plt->getVA(); |
345 | uint32_t sub = config->is64 ? SUB_D : SUB_W; |
346 | uint32_t ld = config->is64 ? LD_D : LD_W; |
347 | uint32_t addi = config->is64 ? ADDI_D : ADDI_W; |
348 | uint32_t srli = config->is64 ? SRLI_D : SRLI_W; |
349 | write32le(P: buf + 0, V: insn(op: PCADDU12I, d: R_T2, j: hi20(val: offset), k: 0)); |
350 | write32le(P: buf + 4, V: insn(op: sub, d: R_T1, j: R_T1, k: R_T3)); |
351 | write32le(P: buf + 8, V: insn(op: ld, d: R_T3, j: R_T2, k: lo12(val: offset))); |
352 | write32le(P: buf + 12, V: insn(op: addi, d: R_T1, j: R_T1, k: lo12(val: -target->pltHeaderSize - 12))); |
353 | write32le(P: buf + 16, V: insn(op: addi, d: R_T0, j: R_T2, k: lo12(val: offset))); |
354 | write32le(P: buf + 20, V: insn(op: srli, d: R_T1, j: R_T1, k: config->is64 ? 1 : 2)); |
355 | write32le(P: buf + 24, V: insn(op: ld, d: R_T0, j: R_T0, k: config->wordsize)); |
356 | write32le(P: buf + 28, V: insn(op: JIRL, d: R_ZERO, j: R_T3, k: 0)); |
357 | } |
358 | |
359 | void LoongArch::writePlt(uint8_t *buf, const Symbol &sym, |
360 | uint64_t pltEntryAddr) const { |
361 | // See the comment in writePltHeader for reason why pcaddu12i is used instead |
362 | // of the pcalau12i that's more commonly seen in the ELF psABI v2.0 days. |
363 | // |
364 | // pcaddu12i $t3, %pcrel_hi20(f@.got.plt) |
365 | // ld.[wd] $t3, $t3, %pcrel_lo12(f@.got.plt) |
366 | // jirl $t1, $t3, 0 |
367 | // nop |
368 | uint32_t offset = sym.getGotPltVA() - pltEntryAddr; |
369 | write32le(P: buf + 0, V: insn(op: PCADDU12I, d: R_T3, j: hi20(val: offset), k: 0)); |
370 | write32le(P: buf + 4, |
371 | V: insn(op: config->is64 ? LD_D : LD_W, d: R_T3, j: R_T3, k: lo12(val: offset))); |
372 | write32le(P: buf + 8, V: insn(op: JIRL, d: R_T1, j: R_T3, k: 0)); |
373 | write32le(P: buf + 12, V: insn(op: ANDI, d: R_ZERO, j: R_ZERO, k: 0)); |
374 | } |
375 | |
376 | RelType LoongArch::getDynRel(RelType type) const { |
377 | return type == target->symbolicRel ? type |
378 | : static_cast<RelType>(R_LARCH_NONE); |
379 | } |
380 | |
381 | RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s, |
382 | const uint8_t *loc) const { |
383 | switch (type) { |
384 | case R_LARCH_NONE: |
385 | case R_LARCH_MARK_LA: |
386 | case R_LARCH_MARK_PCREL: |
387 | return R_NONE; |
388 | case R_LARCH_32: |
389 | case R_LARCH_64: |
390 | case R_LARCH_ABS_HI20: |
391 | case R_LARCH_ABS_LO12: |
392 | case R_LARCH_ABS64_LO20: |
393 | case R_LARCH_ABS64_HI12: |
394 | return R_ABS; |
395 | case R_LARCH_PCALA_LO12: |
396 | // We could just R_ABS, but the JIRL instruction reuses the relocation type |
397 | // for a different purpose. The questionable usage is part of glibc 2.37 |
398 | // libc_nonshared.a [1], which is linked into user programs, so we have to |
399 | // work around it for a while, even if a new relocation type may be |
400 | // introduced in the future [2]. |
401 | // |
402 | // [1]: https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=9f482b73f41a9a1bbfb173aad0733d1c824c788a |
403 | // [2]: https://github.com/loongson/la-abi-specs/pull/3 |
404 | return isJirl(insn: read32le(P: loc)) ? R_PLT : R_ABS; |
405 | case R_LARCH_TLS_DTPREL32: |
406 | case R_LARCH_TLS_DTPREL64: |
407 | return R_DTPREL; |
408 | case R_LARCH_TLS_TPREL32: |
409 | case R_LARCH_TLS_TPREL64: |
410 | case R_LARCH_TLS_LE_HI20: |
411 | case R_LARCH_TLS_LE_HI20_R: |
412 | case R_LARCH_TLS_LE_LO12: |
413 | case R_LARCH_TLS_LE_LO12_R: |
414 | case R_LARCH_TLS_LE64_LO20: |
415 | case R_LARCH_TLS_LE64_HI12: |
416 | return R_TPREL; |
417 | case R_LARCH_ADD6: |
418 | case R_LARCH_ADD8: |
419 | case R_LARCH_ADD16: |
420 | case R_LARCH_ADD32: |
421 | case R_LARCH_ADD64: |
422 | case R_LARCH_ADD_ULEB128: |
423 | case R_LARCH_SUB6: |
424 | case R_LARCH_SUB8: |
425 | case R_LARCH_SUB16: |
426 | case R_LARCH_SUB32: |
427 | case R_LARCH_SUB64: |
428 | case R_LARCH_SUB_ULEB128: |
429 | // The LoongArch add/sub relocs behave like the RISCV counterparts; reuse |
430 | // the RelExpr to avoid code duplication. |
431 | return R_RISCV_ADD; |
432 | case R_LARCH_32_PCREL: |
433 | case R_LARCH_64_PCREL: |
434 | case R_LARCH_PCREL20_S2: |
435 | return R_PC; |
436 | case R_LARCH_B16: |
437 | case R_LARCH_B21: |
438 | case R_LARCH_B26: |
439 | case R_LARCH_CALL36: |
440 | return R_PLT_PC; |
441 | case R_LARCH_GOT_PC_HI20: |
442 | case R_LARCH_GOT64_PC_LO20: |
443 | case R_LARCH_GOT64_PC_HI12: |
444 | case R_LARCH_TLS_IE_PC_HI20: |
445 | case R_LARCH_TLS_IE64_PC_LO20: |
446 | case R_LARCH_TLS_IE64_PC_HI12: |
447 | return R_LOONGARCH_GOT_PAGE_PC; |
448 | case R_LARCH_GOT_PC_LO12: |
449 | case R_LARCH_TLS_IE_PC_LO12: |
450 | return R_LOONGARCH_GOT; |
451 | case R_LARCH_TLS_LD_PC_HI20: |
452 | case R_LARCH_TLS_GD_PC_HI20: |
453 | return R_LOONGARCH_TLSGD_PAGE_PC; |
454 | case R_LARCH_PCALA_HI20: |
455 | // Why not R_LOONGARCH_PAGE_PC, majority of references don't go through PLT |
456 | // anyway so why waste time checking only to get everything relaxed back to |
457 | // it? |
458 | // |
459 | // This is again due to the R_LARCH_PCALA_LO12 on JIRL case, where we want |
460 | // both the HI20 and LO12 to potentially refer to the PLT. But in reality |
461 | // the HI20 reloc appears earlier, and the relocs don't contain enough |
462 | // information to let us properly resolve semantics per symbol. |
463 | // Unlike RISCV, our LO12 relocs *do not* point to their corresponding HI20 |
464 | // relocs, hence it is nearly impossible to 100% accurately determine each |
465 | // HI20's "flavor" without taking big performance hits, in the presence of |
466 | // edge cases (e.g. HI20 without pairing LO12; paired LO12 placed so far |
467 | // apart that relationship is not certain anymore), and programmer mistakes |
468 | // (e.g. as outlined in https://github.com/loongson/la-abi-specs/pull/3). |
469 | // |
470 | // Ideally we would scan in an extra pass for all LO12s on JIRL, then mark |
471 | // every HI20 reloc referring to the same symbol differently; this is not |
472 | // feasible with the current function signature of getRelExpr that doesn't |
473 | // allow for such inter-pass state. |
474 | // |
475 | // So, unfortunately we have to again workaround this quirk the same way as |
476 | // BFD: assuming every R_LARCH_PCALA_HI20 is potentially PLT-needing, only |
477 | // relaxing back to R_LOONGARCH_PAGE_PC if it's known not so at a later |
478 | // stage. |
479 | return R_LOONGARCH_PLT_PAGE_PC; |
480 | case R_LARCH_PCALA64_LO20: |
481 | case R_LARCH_PCALA64_HI12: |
482 | return R_LOONGARCH_PAGE_PC; |
483 | case R_LARCH_GOT_HI20: |
484 | case R_LARCH_GOT_LO12: |
485 | case R_LARCH_GOT64_LO20: |
486 | case R_LARCH_GOT64_HI12: |
487 | case R_LARCH_TLS_IE_HI20: |
488 | case R_LARCH_TLS_IE_LO12: |
489 | case R_LARCH_TLS_IE64_LO20: |
490 | case R_LARCH_TLS_IE64_HI12: |
491 | return R_GOT; |
492 | case R_LARCH_TLS_LD_HI20: |
493 | return R_TLSLD_GOT; |
494 | case R_LARCH_TLS_GD_HI20: |
495 | return R_TLSGD_GOT; |
496 | case R_LARCH_TLS_LE_ADD_R: |
497 | case R_LARCH_RELAX: |
498 | return config->relax ? R_RELAX_HINT : R_NONE; |
499 | case R_LARCH_ALIGN: |
500 | return R_RELAX_HINT; |
501 | case R_LARCH_TLS_DESC_PC_HI20: |
502 | case R_LARCH_TLS_DESC64_PC_LO20: |
503 | case R_LARCH_TLS_DESC64_PC_HI12: |
504 | return R_LOONGARCH_TLSDESC_PAGE_PC; |
505 | case R_LARCH_TLS_DESC_PC_LO12: |
506 | case R_LARCH_TLS_DESC_LD: |
507 | case R_LARCH_TLS_DESC_HI20: |
508 | case R_LARCH_TLS_DESC_LO12: |
509 | case R_LARCH_TLS_DESC64_LO20: |
510 | case R_LARCH_TLS_DESC64_HI12: |
511 | return R_TLSDESC; |
512 | case R_LARCH_TLS_DESC_CALL: |
513 | return R_TLSDESC_CALL; |
514 | case R_LARCH_TLS_LD_PCREL20_S2: |
515 | return R_TLSLD_PC; |
516 | case R_LARCH_TLS_GD_PCREL20_S2: |
517 | return R_TLSGD_PC; |
518 | case R_LARCH_TLS_DESC_PCREL20_S2: |
519 | return R_TLSDESC_PC; |
520 | |
521 | // Other known relocs that are explicitly unimplemented: |
522 | // |
523 | // - psABI v1 relocs that need a stateful stack machine to work, and not |
524 | // required when implementing psABI v2; |
525 | // - relocs that are not used anywhere (R_LARCH_{ADD,SUB}_24 [1], and the |
526 | // two GNU vtable-related relocs). |
527 | // |
528 | // [1]: https://web.archive.org/web/20230709064026/https://github.com/loongson/LoongArch-Documentation/issues/51 |
529 | default: |
530 | error(msg: getErrorLocation(loc) + "unknown relocation (" + Twine(type) + |
531 | ") against symbol " + toString(s)); |
532 | return R_NONE; |
533 | } |
534 | } |
535 | |
536 | bool LoongArch::usesOnlyLowPageBits(RelType type) const { |
537 | switch (type) { |
538 | default: |
539 | return false; |
540 | case R_LARCH_PCALA_LO12: |
541 | case R_LARCH_GOT_LO12: |
542 | case R_LARCH_GOT_PC_LO12: |
543 | case R_LARCH_TLS_IE_PC_LO12: |
544 | case R_LARCH_TLS_DESC_LO12: |
545 | case R_LARCH_TLS_DESC_PC_LO12: |
546 | return true; |
547 | } |
548 | } |
549 | |
550 | void LoongArch::relocate(uint8_t *loc, const Relocation &rel, |
551 | uint64_t val) const { |
552 | switch (rel.type) { |
553 | case R_LARCH_32_PCREL: |
554 | checkInt(loc, v: val, n: 32, rel); |
555 | [[fallthrough]]; |
556 | case R_LARCH_32: |
557 | case R_LARCH_TLS_DTPREL32: |
558 | write32le(P: loc, V: val); |
559 | return; |
560 | case R_LARCH_64: |
561 | case R_LARCH_TLS_DTPREL64: |
562 | case R_LARCH_64_PCREL: |
563 | write64le(P: loc, V: val); |
564 | return; |
565 | |
566 | // Relocs intended for `pcaddi`. |
567 | case R_LARCH_PCREL20_S2: |
568 | case R_LARCH_TLS_LD_PCREL20_S2: |
569 | case R_LARCH_TLS_GD_PCREL20_S2: |
570 | case R_LARCH_TLS_DESC_PCREL20_S2: |
571 | checkInt(loc, v: val, n: 22, rel); |
572 | checkAlignment(loc, v: val, n: 4, rel); |
573 | write32le(P: loc, V: setJ20(insn: read32le(P: loc), imm: val >> 2)); |
574 | return; |
575 | |
576 | case R_LARCH_B16: |
577 | checkInt(loc, v: val, n: 18, rel); |
578 | checkAlignment(loc, v: val, n: 4, rel); |
579 | write32le(P: loc, V: setK16(insn: read32le(P: loc), imm: val >> 2)); |
580 | return; |
581 | |
582 | case R_LARCH_B21: |
583 | checkInt(loc, v: val, n: 23, rel); |
584 | checkAlignment(loc, v: val, n: 4, rel); |
585 | write32le(P: loc, V: setD5k16(insn: read32le(P: loc), imm: val >> 2)); |
586 | return; |
587 | |
588 | case R_LARCH_B26: |
589 | checkInt(loc, v: val, n: 28, rel); |
590 | checkAlignment(loc, v: val, n: 4, rel); |
591 | write32le(P: loc, V: setD10k16(insn: read32le(P: loc), imm: val >> 2)); |
592 | return; |
593 | |
594 | case R_LARCH_CALL36: { |
595 | // This relocation is designed for adjacent pcaddu18i+jirl pairs that |
596 | // are patched in one time. Because of sign extension of these insns' |
597 | // immediate fields, the relocation range is [-128G - 0x20000, +128G - |
598 | // 0x20000) (of course must be 4-byte aligned). |
599 | if (((int64_t)val + 0x20000) != llvm::SignExtend64(X: val + 0x20000, B: 38)) |
600 | reportRangeError(loc, rel, v: Twine(val), min: llvm::minIntN(N: 38) - 0x20000, |
601 | max: llvm::maxIntN(N: 38) - 0x20000); |
602 | checkAlignment(loc, v: val, n: 4, rel); |
603 | // Since jirl performs sign extension on the offset immediate, adds (1<<17) |
604 | // to original val to get the correct hi20. |
605 | uint32_t hi20 = extractBits(v: val + (1 << 17), begin: 37, end: 18); |
606 | // Despite the name, the lower part is actually 18 bits with 4-byte aligned. |
607 | uint32_t lo16 = extractBits(v: val, begin: 17, end: 2); |
608 | write32le(P: loc, V: setJ20(insn: read32le(P: loc), imm: hi20)); |
609 | write32le(P: loc + 4, V: setK16(insn: read32le(P: loc + 4), imm: lo16)); |
610 | return; |
611 | } |
612 | |
613 | // Relocs intended for `addi`, `ld` or `st`. |
614 | case R_LARCH_PCALA_LO12: |
615 | // We have to again inspect the insn word to handle the R_LARCH_PCALA_LO12 |
616 | // on JIRL case: firstly JIRL wants its immediate's 2 lowest zeroes |
617 | // removed by us (in contrast to regular R_LARCH_PCALA_LO12), secondly |
618 | // its immediate slot width is different too (16, not 12). |
619 | // In this case, process like an R_LARCH_B16, but without overflow checking |
620 | // and only taking the value's lowest 12 bits. |
621 | if (isJirl(insn: read32le(P: loc))) { |
622 | checkAlignment(loc, v: val, n: 4, rel); |
623 | val = SignExtend64<12>(x: val); |
624 | write32le(P: loc, V: setK16(insn: read32le(P: loc), imm: val >> 2)); |
625 | return; |
626 | } |
627 | [[fallthrough]]; |
628 | case R_LARCH_ABS_LO12: |
629 | case R_LARCH_GOT_PC_LO12: |
630 | case R_LARCH_GOT_LO12: |
631 | case R_LARCH_TLS_LE_LO12: |
632 | case R_LARCH_TLS_IE_PC_LO12: |
633 | case R_LARCH_TLS_IE_LO12: |
634 | case R_LARCH_TLS_LE_LO12_R: |
635 | case R_LARCH_TLS_DESC_PC_LO12: |
636 | case R_LARCH_TLS_DESC_LO12: |
637 | write32le(P: loc, V: setK12(insn: read32le(P: loc), imm: extractBits(v: val, begin: 11, end: 0))); |
638 | return; |
639 | |
640 | // Relocs intended for `lu12i.w` or `pcalau12i`. |
641 | case R_LARCH_ABS_HI20: |
642 | case R_LARCH_PCALA_HI20: |
643 | case R_LARCH_GOT_PC_HI20: |
644 | case R_LARCH_GOT_HI20: |
645 | case R_LARCH_TLS_LE_HI20: |
646 | case R_LARCH_TLS_IE_PC_HI20: |
647 | case R_LARCH_TLS_IE_HI20: |
648 | case R_LARCH_TLS_LD_PC_HI20: |
649 | case R_LARCH_TLS_LD_HI20: |
650 | case R_LARCH_TLS_GD_PC_HI20: |
651 | case R_LARCH_TLS_GD_HI20: |
652 | case R_LARCH_TLS_DESC_PC_HI20: |
653 | case R_LARCH_TLS_DESC_HI20: |
654 | write32le(P: loc, V: setJ20(insn: read32le(P: loc), imm: extractBits(v: val, begin: 31, end: 12))); |
655 | return; |
656 | case R_LARCH_TLS_LE_HI20_R: |
657 | write32le(P: loc, V: setJ20(insn: read32le(P: loc), imm: extractBits(v: val + 0x800, begin: 31, end: 12))); |
658 | return; |
659 | |
660 | // Relocs intended for `lu32i.d`. |
661 | case R_LARCH_ABS64_LO20: |
662 | case R_LARCH_PCALA64_LO20: |
663 | case R_LARCH_GOT64_PC_LO20: |
664 | case R_LARCH_GOT64_LO20: |
665 | case R_LARCH_TLS_LE64_LO20: |
666 | case R_LARCH_TLS_IE64_PC_LO20: |
667 | case R_LARCH_TLS_IE64_LO20: |
668 | case R_LARCH_TLS_DESC64_PC_LO20: |
669 | case R_LARCH_TLS_DESC64_LO20: |
670 | write32le(P: loc, V: setJ20(insn: read32le(P: loc), imm: extractBits(v: val, begin: 51, end: 32))); |
671 | return; |
672 | |
673 | // Relocs intended for `lu52i.d`. |
674 | case R_LARCH_ABS64_HI12: |
675 | case R_LARCH_PCALA64_HI12: |
676 | case R_LARCH_GOT64_PC_HI12: |
677 | case R_LARCH_GOT64_HI12: |
678 | case R_LARCH_TLS_LE64_HI12: |
679 | case R_LARCH_TLS_IE64_PC_HI12: |
680 | case R_LARCH_TLS_IE64_HI12: |
681 | case R_LARCH_TLS_DESC64_PC_HI12: |
682 | case R_LARCH_TLS_DESC64_HI12: |
683 | write32le(P: loc, V: setK12(insn: read32le(P: loc), imm: extractBits(v: val, begin: 63, end: 52))); |
684 | return; |
685 | |
686 | case R_LARCH_ADD6: |
687 | *loc = (*loc & 0xc0) | ((*loc + val) & 0x3f); |
688 | return; |
689 | case R_LARCH_ADD8: |
690 | *loc += val; |
691 | return; |
692 | case R_LARCH_ADD16: |
693 | write16le(P: loc, V: read16le(P: loc) + val); |
694 | return; |
695 | case R_LARCH_ADD32: |
696 | write32le(P: loc, V: read32le(P: loc) + val); |
697 | return; |
698 | case R_LARCH_ADD64: |
699 | write64le(P: loc, V: read64le(P: loc) + val); |
700 | return; |
701 | case R_LARCH_ADD_ULEB128: |
702 | handleUleb128(loc, val); |
703 | return; |
704 | case R_LARCH_SUB6: |
705 | *loc = (*loc & 0xc0) | ((*loc - val) & 0x3f); |
706 | return; |
707 | case R_LARCH_SUB8: |
708 | *loc -= val; |
709 | return; |
710 | case R_LARCH_SUB16: |
711 | write16le(P: loc, V: read16le(P: loc) - val); |
712 | return; |
713 | case R_LARCH_SUB32: |
714 | write32le(P: loc, V: read32le(P: loc) - val); |
715 | return; |
716 | case R_LARCH_SUB64: |
717 | write64le(P: loc, V: read64le(P: loc) - val); |
718 | return; |
719 | case R_LARCH_SUB_ULEB128: |
720 | handleUleb128(loc, val: -val); |
721 | return; |
722 | |
723 | case R_LARCH_MARK_LA: |
724 | case R_LARCH_MARK_PCREL: |
725 | // no-op |
726 | return; |
727 | |
728 | case R_LARCH_TLS_LE_ADD_R: |
729 | case R_LARCH_RELAX: |
730 | return; // Ignored (for now) |
731 | |
732 | case R_LARCH_TLS_DESC_LD: |
733 | return; // nothing to do. |
734 | case R_LARCH_TLS_DESC32: |
735 | write32le(P: loc + 4, V: val); |
736 | return; |
737 | case R_LARCH_TLS_DESC64: |
738 | write64le(P: loc + 8, V: val); |
739 | return; |
740 | |
741 | default: |
742 | llvm_unreachable("unknown relocation" ); |
743 | } |
744 | } |
745 | |
746 | static bool relax(InputSection &sec) { |
747 | const uint64_t secAddr = sec.getVA(); |
748 | const MutableArrayRef<Relocation> relocs = sec.relocs(); |
749 | auto &aux = *sec.relaxAux; |
750 | bool changed = false; |
751 | ArrayRef<SymbolAnchor> sa = ArrayRef(aux.anchors); |
752 | uint64_t delta = 0; |
753 | |
754 | std::fill_n(first: aux.relocTypes.get(), n: relocs.size(), value: R_LARCH_NONE); |
755 | aux.writes.clear(); |
756 | for (auto [i, r] : llvm::enumerate(First: relocs)) { |
757 | const uint64_t loc = secAddr + r.offset - delta; |
758 | uint32_t &cur = aux.relocDeltas[i], remove = 0; |
759 | switch (r.type) { |
760 | case R_LARCH_ALIGN: { |
761 | const uint64_t addend = |
762 | r.sym->isUndefined() ? Log2_64(Value: r.addend) + 1 : r.addend; |
763 | const uint64_t allBytes = (1ULL << (addend & 0xff)) - 4; |
764 | const uint64_t align = 1ULL << (addend & 0xff); |
765 | const uint64_t maxBytes = addend >> 8; |
766 | const uint64_t off = loc & (align - 1); |
767 | const uint64_t curBytes = off == 0 ? 0 : align - off; |
768 | // All bytes beyond the alignment boundary should be removed. |
769 | // If emit bytes more than max bytes to emit, remove all. |
770 | if (maxBytes != 0 && curBytes > maxBytes) |
771 | remove = allBytes; |
772 | else |
773 | remove = allBytes - curBytes; |
774 | // If we can't satisfy this alignment, we've found a bad input. |
775 | if (LLVM_UNLIKELY(static_cast<int32_t>(remove) < 0)) { |
776 | errorOrWarn(msg: getErrorLocation(loc: (const uint8_t *)loc) + |
777 | "insufficient padding bytes for " + lld::toString(type: r.type) + |
778 | ": " + Twine(allBytes) + " bytes available for " + |
779 | "requested alignment of " + Twine(align) + " bytes" ); |
780 | remove = 0; |
781 | } |
782 | break; |
783 | } |
784 | } |
785 | |
786 | // For all anchors whose offsets are <= r.offset, they are preceded by |
787 | // the previous relocation whose `relocDeltas` value equals `delta`. |
788 | // Decrease their st_value and update their st_size. |
789 | for (; sa.size() && sa[0].offset <= r.offset; sa = sa.slice(N: 1)) { |
790 | if (sa[0].end) |
791 | sa[0].d->size = sa[0].offset - delta - sa[0].d->value; |
792 | else |
793 | sa[0].d->value = sa[0].offset - delta; |
794 | } |
795 | delta += remove; |
796 | if (delta != cur) { |
797 | cur = delta; |
798 | changed = true; |
799 | } |
800 | } |
801 | |
802 | for (const SymbolAnchor &a : sa) { |
803 | if (a.end) |
804 | a.d->size = a.offset - delta - a.d->value; |
805 | else |
806 | a.d->value = a.offset - delta; |
807 | } |
808 | // Inform assignAddresses that the size has changed. |
809 | if (!isUInt<32>(x: delta)) |
810 | fatal(msg: "section size decrease is too large: " + Twine(delta)); |
811 | sec.bytesDropped = delta; |
812 | return changed; |
813 | } |
814 | |
815 | // When relaxing just R_LARCH_ALIGN, relocDeltas is usually changed only once in |
816 | // the absence of a linker script. For call and load/store R_LARCH_RELAX, code |
817 | // shrinkage may reduce displacement and make more relocations eligible for |
818 | // relaxation. Code shrinkage may increase displacement to a call/load/store |
819 | // target at a higher fixed address, invalidating an earlier relaxation. Any |
820 | // change in section sizes can have cascading effect and require another |
821 | // relaxation pass. |
822 | bool LoongArch::relaxOnce(int pass) const { |
823 | if (config->relocatable) |
824 | return false; |
825 | |
826 | if (pass == 0) |
827 | initSymbolAnchors(); |
828 | |
829 | SmallVector<InputSection *, 0> storage; |
830 | bool changed = false; |
831 | for (OutputSection *osec : outputSections) { |
832 | if (!(osec->flags & SHF_EXECINSTR)) |
833 | continue; |
834 | for (InputSection *sec : getInputSections(os: *osec, storage)) |
835 | changed |= relax(sec&: *sec); |
836 | } |
837 | return changed; |
838 | } |
839 | |
840 | void LoongArch::finalizeRelax(int passes) const { |
841 | log(msg: "relaxation passes: " + Twine(passes)); |
842 | SmallVector<InputSection *, 0> storage; |
843 | for (OutputSection *osec : outputSections) { |
844 | if (!(osec->flags & SHF_EXECINSTR)) |
845 | continue; |
846 | for (InputSection *sec : getInputSections(os: *osec, storage)) { |
847 | RelaxAux &aux = *sec->relaxAux; |
848 | if (!aux.relocDeltas) |
849 | continue; |
850 | |
851 | MutableArrayRef<Relocation> rels = sec->relocs(); |
852 | ArrayRef<uint8_t> old = sec->content(); |
853 | size_t newSize = old.size() - aux.relocDeltas[rels.size() - 1]; |
854 | uint8_t *p = context().bAlloc.Allocate<uint8_t>(Num: newSize); |
855 | uint64_t offset = 0; |
856 | int64_t delta = 0; |
857 | sec->content_ = p; |
858 | sec->size = newSize; |
859 | sec->bytesDropped = 0; |
860 | |
861 | // Update section content: remove NOPs for R_LARCH_ALIGN and rewrite |
862 | // instructions for relaxed relocations. |
863 | for (size_t i = 0, e = rels.size(); i != e; ++i) { |
864 | uint32_t remove = aux.relocDeltas[i] - delta; |
865 | delta = aux.relocDeltas[i]; |
866 | if (remove == 0 && aux.relocTypes[i] == R_LARCH_NONE) |
867 | continue; |
868 | |
869 | // Copy from last location to the current relocated location. |
870 | const Relocation &r = rels[i]; |
871 | uint64_t size = r.offset - offset; |
872 | memcpy(dest: p, src: old.data() + offset, n: size); |
873 | p += size; |
874 | offset = r.offset + remove; |
875 | } |
876 | memcpy(dest: p, src: old.data() + offset, n: old.size() - offset); |
877 | |
878 | // Subtract the previous relocDeltas value from the relocation offset. |
879 | // For a pair of R_LARCH_XXX/R_LARCH_RELAX with the same offset, decrease |
880 | // their r_offset by the same delta. |
881 | delta = 0; |
882 | for (size_t i = 0, e = rels.size(); i != e;) { |
883 | uint64_t cur = rels[i].offset; |
884 | do { |
885 | rels[i].offset -= delta; |
886 | if (aux.relocTypes[i] != R_LARCH_NONE) |
887 | rels[i].type = aux.relocTypes[i]; |
888 | } while (++i != e && rels[i].offset == cur); |
889 | delta = aux.relocDeltas[i - 1]; |
890 | } |
891 | } |
892 | } |
893 | } |
894 | |
895 | TargetInfo *elf::getLoongArchTargetInfo() { |
896 | static LoongArch target; |
897 | return ⌖ |
898 | } |
899 | |