//===- LoongArch.cpp ------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "InputFiles.h"
#include "OutputSections.h"
#include "Symbols.h"
#include "SyntheticSections.h"
#include "Target.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Support/LEB128.h"

using namespace llvm;
using namespace llvm::object;
using namespace llvm::support::endian;
using namespace llvm::ELF;
using namespace lld;
using namespace lld::elf;

namespace {
class LoongArch final : public TargetInfo {
public:
  LoongArch(Ctx &);
  uint32_t calcEFlags() const override;
  int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override;
  void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
  void writeIgotPlt(uint8_t *buf, const Symbol &s) const override;
  void writePltHeader(uint8_t *buf) const override;
  void writePlt(uint8_t *buf, const Symbol &sym,
                uint64_t pltEntryAddr) const override;
  RelType getDynRel(RelType type) const override;
  RelExpr getRelExpr(RelType type, const Symbol &s,
                     const uint8_t *loc) const override;
  bool usesOnlyLowPageBits(RelType type) const override;
  void relocate(uint8_t *loc, const Relocation &rel,
                uint64_t val) const override;
  bool relaxOnce(int pass) const override;
  RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override;
  void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override;
  void finalizeRelax(int passes) const override;

private:
  void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
  void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
};
} // end anonymous namespace

namespace {
enum Op {
  SUB_W = 0x00110000,
  SUB_D = 0x00118000,
  BREAK = 0x002a0000,
  SRLI_W = 0x00448000,
  SRLI_D = 0x00450000,
  ADDI_W = 0x02800000,
  ADDI_D = 0x02c00000,
  ANDI = 0x03400000,
  ORI = 0x03800000,
  LU12I_W = 0x14000000,
  PCADDI = 0x18000000,
  PCADDU12I = 0x1c000000,
  PCALAU12I = 0x1a000000,
  LD_W = 0x28800000,
  LD_D = 0x28c00000,
  JIRL = 0x4c000000,
  B = 0x50000000,
  BL = 0x54000000,
};

enum Reg {
  R_ZERO = 0,
  R_RA = 1,
  R_TP = 2,
  R_A0 = 4,
  R_T0 = 12,
  R_T1 = 13,
  R_T2 = 14,
  R_T3 = 15,
};
} // namespace

// Mask out the input's lowest 12 bits for use with `pcalau12i`, in sequences
// like `pcalau12i + addi.[wd]` or `pcalau12i + {ld,st}.*` where the `pcalau12i`
// produces a PC-relative intermediate value with the lowest 12 bits zeroed (the
// "page") for the next instruction to add in the "page offset". (`pcalau12i`
// stands for something like "PC ALigned Add Upper that starts from the 12th
// bit, Immediate".)
//
// Here a "page" is in fact just another way to refer to the 12-bit range
// allowed by the immediate field of the addi/ld/st instructions, and not
// related to the system or the kernel's actual page size. The semantics happen
// to match the AArch64 `adrp`, so the concept of "page" is borrowed here.
static uint64_t getLoongArchPage(uint64_t p) {
  return p & ~static_cast<uint64_t>(0xfff);
}

static uint32_t lo12(uint32_t val) { return val & 0xfff; }
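
// For instance (an illustrative value, not from the original code): with
// dest = 0x12345a7c, getLoongArchPage(dest) == 0x12345000 and
// lo12(dest) == 0xa7c, so the `pcalau12i` result plus the 12-bit "page
// offset" rebuilds dest.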

// Calculate the adjusted page delta between dest and PC.
uint64_t elf::getLoongArchPageDelta(uint64_t dest, uint64_t pc, RelType type) {
  // Note that if the sequence being relocated is `pcalau12i + addi.d + lu32i.d
  // + lu52i.d`, they must be adjacent so that we can infer the PC of
  // `pcalau12i` when calculating the page delta for the other two instructions
  // (lu32i.d and lu52i.d). Compensating for all the sign extensions is a bit
  // complicated; just use the psABI-recommended algorithm.
  uint64_t pcalau12i_pc;
  switch (type) {
  case R_LARCH_PCALA64_LO20:
  case R_LARCH_GOT64_PC_LO20:
  case R_LARCH_TLS_IE64_PC_LO20:
  case R_LARCH_TLS_DESC64_PC_LO20:
    pcalau12i_pc = pc - 8;
    break;
  case R_LARCH_PCALA64_HI12:
  case R_LARCH_GOT64_PC_HI12:
  case R_LARCH_TLS_IE64_PC_HI12:
  case R_LARCH_TLS_DESC64_PC_HI12:
    pcalau12i_pc = pc - 12;
    break;
  default:
    pcalau12i_pc = pc;
    break;
  }
  uint64_t result = getLoongArchPage(dest) - getLoongArchPage(pcalau12i_pc);
  if (dest & 0x800)
    result += 0x1000 - 0x1'0000'0000;
  if (result & 0x8000'0000)
    result += 0x1'0000'0000;
  return result;
}
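
// A worked example of the adjustment above (addresses are arbitrary): for
// pc = 0x10000 and dest = 0x1288c, the raw page delta is 0x2000. Since bit 11
// of dest is set, result becomes 0x2000 + 0x1000 - 0x1'0000'0000 =
// 0xffff'ffff'0000'3000. Its hi20 is 3, so `pcalau12i` yields 0x13000, and
// the consuming instruction's sign-extended lo12 (0x88c, i.e. -0x774) brings
// the result back to exactly 0x1288c.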

static uint32_t hi20(uint32_t val) { return (val + 0x800) >> 12; }
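
// A small sanity check of the rounding above (illustrative): for val =
// 0x1800, hi20(val) == (0x1800 + 0x800) >> 12 == 2, while lo12(val) == 0x800
// sign-extends to -0x800 in the consuming instruction; (2 << 12) - 0x800
// reconstructs 0x1800.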

static uint32_t insn(uint32_t op, uint32_t d, uint32_t j, uint32_t k) {
  return op | d | (j << 5) | (k << 10);
}

// Extract bits v[begin:end], where range is inclusive.
static uint32_t extractBits(uint64_t v, uint32_t begin, uint32_t end) {
  return begin == 63 ? v >> end : (v & ((1ULL << (begin + 1)) - 1)) >> end;
}
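
// E.g. extractBits(0xabcd, 15, 8) == 0xab. The begin == 63 special case
// avoids the undefined behavior that a 64-bit shift amount in
// (1ULL << (begin + 1)) would otherwise incur.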

static uint32_t getD5(uint64_t v) { return extractBits(v, 4, 0); }

static uint32_t getJ5(uint64_t v) { return extractBits(v, 9, 5); }

static uint32_t setD5k16(uint32_t insn, uint32_t imm) {
  uint32_t immLo = extractBits(imm, 15, 0);
  uint32_t immHi = extractBits(imm, 20, 16);
  return (insn & 0xfc0003e0) | (immLo << 10) | immHi;
}

static uint32_t setD10k16(uint32_t insn, uint32_t imm) {
  uint32_t immLo = extractBits(imm, 15, 0);
  uint32_t immHi = extractBits(imm, 25, 16);
  return (insn & 0xfc000000) | (immLo << 10) | immHi;
}

static uint32_t setJ20(uint32_t insn, uint32_t imm) {
  return (insn & 0xfe00001f) | (extractBits(imm, 19, 0) << 5);
}

static uint32_t setJ5(uint32_t insn, uint32_t imm) {
  return (insn & 0xfffffc1f) | (extractBits(imm, 4, 0) << 5);
}

static uint32_t setK12(uint32_t insn, uint32_t imm) {
  return (insn & 0xffc003ff) | (extractBits(imm, 11, 0) << 10);
}

static uint32_t setK16(uint32_t insn, uint32_t imm) {
  return (insn & 0xfc0003ff) | (extractBits(imm, 15, 0) << 10);
}

static bool isJirl(uint32_t insn) {
  return (insn & 0xfc000000) == JIRL;
}

static void handleUleb128(Ctx &ctx, uint8_t *loc, uint64_t val) {
  const uint32_t maxcount = 1 + 64 / 7;
  uint32_t count;
  const char *error = nullptr;
  uint64_t orig = decodeULEB128(loc, &count, nullptr, &error);
  if (count > maxcount || (count == maxcount && error))
    Err(ctx) << getErrorLoc(ctx, loc) << "extra space for uleb128";
  uint64_t mask = count < maxcount ? (1ULL << 7 * count) - 1 : -1ULL;
  encodeULEB128((orig + val) & mask, loc, count);
}
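
// Illustrative example (arbitrary bytes): a 2-byte ULEB128 slot holding
// 0x80 0x01 decodes to 128; patching with val = 1 re-encodes 129 as
// 0x81 0x01, keeping the original width because encodeULEB128 pads to the
// decoded byte count.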

LoongArch::LoongArch(Ctx &ctx) : TargetInfo(ctx) {
  // The LoongArch ISA itself does not have a limit on page sizes. According to
  // the ISA manual, the PS (page size) field in MTLB entries and CSR.STLBPS is
  // 6 bits wide, meaning the maximum page size is 2^63 which is equivalent to
  // "unlimited".
  // However, practically the maximum usable page size is constrained by the
  // kernel implementation, and 64KiB is the biggest non-huge page size
  // supported by Linux as of v6.4. The most widespread page size in use,
  // though, is 16KiB.
  defaultCommonPageSize = 16384;
  defaultMaxPageSize = 65536;
  write32le(trapInstr.data(), BREAK); // break 0

  copyRel = R_LARCH_COPY;
  pltRel = R_LARCH_JUMP_SLOT;
  relativeRel = R_LARCH_RELATIVE;
  iRelativeRel = R_LARCH_IRELATIVE;

  if (ctx.arg.is64) {
    symbolicRel = R_LARCH_64;
    tlsModuleIndexRel = R_LARCH_TLS_DTPMOD64;
    tlsOffsetRel = R_LARCH_TLS_DTPREL64;
    tlsGotRel = R_LARCH_TLS_TPREL64;
    tlsDescRel = R_LARCH_TLS_DESC64;
  } else {
    symbolicRel = R_LARCH_32;
    tlsModuleIndexRel = R_LARCH_TLS_DTPMOD32;
    tlsOffsetRel = R_LARCH_TLS_DTPREL32;
    tlsGotRel = R_LARCH_TLS_TPREL32;
    tlsDescRel = R_LARCH_TLS_DESC32;
  }

  gotRel = symbolicRel;

  // .got.plt[0] = _dl_runtime_resolve, .got.plt[1] = link_map
  gotPltHeaderEntriesNum = 2;

  pltHeaderSize = 32;
  pltEntrySize = 16;
  ipltEntrySize = 16;
}

static uint32_t getEFlags(Ctx &ctx, const InputFile *f) {
  if (ctx.arg.is64)
    return cast<ObjFile<ELF64LE>>(f)->getObj().getHeader().e_flags;
  return cast<ObjFile<ELF32LE>>(f)->getObj().getHeader().e_flags;
}

static bool inputFileHasCode(const InputFile *f) {
  for (const auto *sec : f->getSections())
    if (sec && sec->flags & SHF_EXECINSTR)
      return true;

  return false;
}

uint32_t LoongArch::calcEFlags() const {
  // If there are only binary input files (from -b binary), use a
  // value of 0 for the ELF header flags.
  if (ctx.objectFiles.empty())
    return 0;

  uint32_t target = 0;
  const InputFile *targetFile;
  for (const InputFile *f : ctx.objectFiles) {
    // Do not enforce ABI compatibility if the input file does not contain
    // code. This is useful for allowing linkage with data-only object files
    // produced with tools like objcopy, which have zero e_flags.
    if (!inputFileHasCode(f))
      continue;

    // Take the first non-zero e_flags as the reference.
    uint32_t flags = getEFlags(ctx, f);
    if (target == 0 && flags != 0) {
      target = flags;
      targetFile = f;
    }

    if ((flags & EF_LOONGARCH_ABI_MODIFIER_MASK) !=
        (target & EF_LOONGARCH_ABI_MODIFIER_MASK))
      ErrAlways(ctx) << f
                     << ": cannot link object files with different ABI from "
                     << targetFile;

    // We cannot process psABI v1.x / object ABI v0 files (containing stack
    // relocations), unlike ld.bfd.
    //
    // Instead of blindly accepting every v0 object and only failing at
    // relocation processing time, just disallow interlink altogether. We
    // don't expect significant usage of object ABI v0 in the wild (the old
    // world may continue using object ABI v0 for a while, but as it's not
    // binary-compatible with the upstream i.e. new-world ecosystem, it's not
    // being considered here).
    //
    // There are briefly some new-world systems with object ABI v0 binaries
    // too. It is because these systems were built before the new ABI was
    // finalized. These are not supported either due to the extremely small
    // number of them, and the few impacted users are advised to simply
    // rebuild world or reinstall a recent system.
    if ((flags & EF_LOONGARCH_OBJABI_MASK) != EF_LOONGARCH_OBJABI_V1)
      ErrAlways(ctx) << f << ": unsupported object file ABI version";
  }

  return target;
}

int64_t LoongArch::getImplicitAddend(const uint8_t *buf, RelType type) const {
  switch (type) {
  default:
    InternalErr(ctx, buf) << "cannot read addend for relocation " << type;
    return 0;
  case R_LARCH_32:
  case R_LARCH_TLS_DTPMOD32:
  case R_LARCH_TLS_DTPREL32:
  case R_LARCH_TLS_TPREL32:
    return SignExtend64<32>(read32le(buf));
  case R_LARCH_64:
  case R_LARCH_TLS_DTPMOD64:
  case R_LARCH_TLS_DTPREL64:
  case R_LARCH_TLS_TPREL64:
    return read64le(buf);
  case R_LARCH_RELATIVE:
  case R_LARCH_IRELATIVE:
    return ctx.arg.is64 ? read64le(buf) : read32le(buf);
  case R_LARCH_NONE:
  case R_LARCH_JUMP_SLOT:
    // These relocations are defined as not having an implicit addend.
    return 0;
  case R_LARCH_TLS_DESC32:
    return read32le(buf + 4);
  case R_LARCH_TLS_DESC64:
    return read64le(buf + 8);
  }
}

void LoongArch::writeGotPlt(uint8_t *buf, const Symbol &s) const {
  if (ctx.arg.is64)
    write64le(buf, ctx.in.plt->getVA());
  else
    write32le(buf, ctx.in.plt->getVA());
}

void LoongArch::writeIgotPlt(uint8_t *buf, const Symbol &s) const {
  if (ctx.arg.writeAddends) {
    if (ctx.arg.is64)
      write64le(buf, s.getVA(ctx));
    else
      write32le(buf, s.getVA(ctx));
  }
}

void LoongArch::writePltHeader(uint8_t *buf) const {
  // The LoongArch PLT is currently structured just like that of RISCV.
  // Annoyingly, this means the PLT is still using `pcaddu12i` to perform
  // PC-relative addressing (because `pcaddu12i` is the same as RISCV `auipc`),
  // in contrast to the AArch64-like page-offset scheme with `pcalau12i` that
  // is used everywhere else involving PC-relative operations in the LoongArch
  // ELF psABI v2.00.
  //
  // The `pcrel_{hi20,lo12}` operators are illustrative only and not really
  // supported by LoongArch assemblers.
  //
  //   pcaddu12i $t2, %pcrel_hi20(.got.plt)
  //   sub.[wd]  $t1, $t1, $t3
  //   ld.[wd]   $t3, $t2, %pcrel_lo12(.got.plt) ; t3 = _dl_runtime_resolve
  //   addi.[wd] $t1, $t1, -pltHeaderSize-12     ; t1 = &.plt[i] - &.plt[0]
  //   addi.[wd] $t0, $t2, %pcrel_lo12(.got.plt)
  //   srli.[wd] $t1, $t1, (is64?1:2)            ; t1 = &.got.plt[i] - &.got.plt[0]
  //   ld.[wd]   $t0, $t0, Wordsize              ; t0 = link_map
  //   jr        $t3
  uint32_t offset = ctx.in.gotPlt->getVA() - ctx.in.plt->getVA();
  uint32_t sub = ctx.arg.is64 ? SUB_D : SUB_W;
  uint32_t ld = ctx.arg.is64 ? LD_D : LD_W;
  uint32_t addi = ctx.arg.is64 ? ADDI_D : ADDI_W;
  uint32_t srli = ctx.arg.is64 ? SRLI_D : SRLI_W;
  write32le(buf + 0, insn(PCADDU12I, R_T2, hi20(offset), 0));
  write32le(buf + 4, insn(sub, R_T1, R_T1, R_T3));
  write32le(buf + 8, insn(ld, R_T3, R_T2, lo12(offset)));
  write32le(buf + 12,
            insn(addi, R_T1, R_T1, lo12(-ctx.target->pltHeaderSize - 12)));
  write32le(buf + 16, insn(addi, R_T0, R_T2, lo12(offset)));
  write32le(buf + 20, insn(srli, R_T1, R_T1, ctx.arg.is64 ? 1 : 2));
  write32le(buf + 24, insn(ld, R_T0, R_T0, ctx.arg.wordsize));
  write32le(buf + 28, insn(JIRL, R_ZERO, R_T3, 0));
}
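
// A walkthrough of the header arithmetic above (illustrative, 64-bit case):
// a lazily-bound call enters PLT entry i, whose `jirl $t1, $t3, 0` leaves
// t1 = &entry_i + 12 while t3 holds the initial .got.plt[i] value, i.e. the
// PLT header address. With &entry_i = &header + 32 + 16 * i, the sub and
// addi above yield t1 = 16 * i, and `srli.d $t1, $t1, 1` halves that to
// 8 * i: the byte offset of the corresponding 8-byte .got.plt slot.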

void LoongArch::writePlt(uint8_t *buf, const Symbol &sym,
                         uint64_t pltEntryAddr) const {
  // See the comment in writePltHeader for the reason why pcaddu12i is used
  // instead of the pcalau12i that's more commonly seen in the ELF psABI v2.0
  // days.
  //
  //   pcaddu12i $t3, %pcrel_hi20(f@.got.plt)
  //   ld.[wd]   $t3, $t3, %pcrel_lo12(f@.got.plt)
  //   jirl      $t1, $t3, 0
  //   nop
  uint32_t offset = sym.getGotPltVA(ctx) - pltEntryAddr;
  write32le(buf + 0, insn(PCADDU12I, R_T3, hi20(offset), 0));
  write32le(buf + 4,
            insn(ctx.arg.is64 ? LD_D : LD_W, R_T3, R_T3, lo12(offset)));
  write32le(buf + 8, insn(JIRL, R_T1, R_T3, 0));
  write32le(buf + 12, insn(ANDI, R_ZERO, R_ZERO, 0));
}

RelType LoongArch::getDynRel(RelType type) const {
  return type == ctx.target->symbolicRel ? type
                                         : static_cast<RelType>(R_LARCH_NONE);
}

RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s,
                              const uint8_t *loc) const {
  switch (type) {
  case R_LARCH_NONE:
  case R_LARCH_MARK_LA:
  case R_LARCH_MARK_PCREL:
    return R_NONE;
  case R_LARCH_32:
  case R_LARCH_64:
  case R_LARCH_ABS_HI20:
  case R_LARCH_ABS_LO12:
  case R_LARCH_ABS64_LO20:
  case R_LARCH_ABS64_HI12:
    return R_ABS;
  case R_LARCH_PCALA_LO12:
    // We could just return R_ABS, but the JIRL instruction reuses the
    // relocation type for a different purpose. The questionable usage is part
    // of glibc 2.37 libc_nonshared.a [1], which is linked into user programs,
    // so we have to work around it for a while, even if a new relocation type
    // may be introduced in the future [2].
    //
    // [1]: https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=9f482b73f41a9a1bbfb173aad0733d1c824c788a
    // [2]: https://github.com/loongson/la-abi-specs/pull/3
    return isJirl(read32le(loc)) ? R_PLT : R_ABS;
  case R_LARCH_TLS_DTPREL32:
  case R_LARCH_TLS_DTPREL64:
    return R_DTPREL;
  case R_LARCH_TLS_TPREL32:
  case R_LARCH_TLS_TPREL64:
  case R_LARCH_TLS_LE_HI20:
  case R_LARCH_TLS_LE_HI20_R:
  case R_LARCH_TLS_LE_LO12:
  case R_LARCH_TLS_LE_LO12_R:
  case R_LARCH_TLS_LE64_LO20:
  case R_LARCH_TLS_LE64_HI12:
    return R_TPREL;
  case R_LARCH_ADD6:
  case R_LARCH_ADD8:
  case R_LARCH_ADD16:
  case R_LARCH_ADD32:
  case R_LARCH_ADD64:
  case R_LARCH_ADD_ULEB128:
  case R_LARCH_SUB6:
  case R_LARCH_SUB8:
  case R_LARCH_SUB16:
  case R_LARCH_SUB32:
  case R_LARCH_SUB64:
  case R_LARCH_SUB_ULEB128:
    // The LoongArch add/sub relocs behave like the RISCV counterparts; reuse
    // the RelExpr to avoid code duplication.
    return RE_RISCV_ADD;
  case R_LARCH_32_PCREL:
  case R_LARCH_64_PCREL:
  case R_LARCH_PCREL20_S2:
    return R_PC;
  case R_LARCH_B16:
  case R_LARCH_B21:
  case R_LARCH_B26:
  case R_LARCH_CALL36:
    return R_PLT_PC;
  case R_LARCH_GOT_PC_HI20:
  case R_LARCH_GOT64_PC_LO20:
  case R_LARCH_GOT64_PC_HI12:
  case R_LARCH_TLS_IE_PC_HI20:
  case R_LARCH_TLS_IE64_PC_LO20:
  case R_LARCH_TLS_IE64_PC_HI12:
    return RE_LOONGARCH_GOT_PAGE_PC;
  case R_LARCH_GOT_PC_LO12:
  case R_LARCH_TLS_IE_PC_LO12:
    return RE_LOONGARCH_GOT;
  case R_LARCH_TLS_LD_PC_HI20:
  case R_LARCH_TLS_GD_PC_HI20:
    return RE_LOONGARCH_TLSGD_PAGE_PC;
  case R_LARCH_PCALA_HI20:
    // Why not RE_LOONGARCH_PAGE_PC? The majority of references don't go
    // through the PLT anyway, so why waste time checking only to get
    // everything relaxed back to it?
    //
    // This is again due to the R_LARCH_PCALA_LO12 on JIRL case, where we want
    // both the HI20 and LO12 to potentially refer to the PLT. But in reality
    // the HI20 reloc appears earlier, and the relocs don't contain enough
    // information to let us properly resolve semantics per symbol.
    // Unlike RISCV, our LO12 relocs *do not* point to their corresponding
    // HI20 relocs, hence it is nearly impossible to 100% accurately determine
    // each HI20's "flavor" without taking big performance hits, in the
    // presence of edge cases (e.g. HI20 without pairing LO12; paired LO12
    // placed so far apart that the relationship is not certain anymore), and
    // programmer mistakes (e.g. as outlined in
    // https://github.com/loongson/la-abi-specs/pull/3).
    //
    // Ideally we would scan in an extra pass for all LO12s on JIRL, then mark
    // every HI20 reloc referring to the same symbol differently; this is not
    // feasible with the current function signature of getRelExpr that doesn't
    // allow for such inter-pass state.
    //
    // So, unfortunately we have to again work around this quirk the same way
    // as BFD: assuming every R_LARCH_PCALA_HI20 is potentially PLT-needing,
    // only relaxing back to RE_LOONGARCH_PAGE_PC if it's known not so at a
    // later stage.
    return RE_LOONGARCH_PLT_PAGE_PC;
  case R_LARCH_PCALA64_LO20:
  case R_LARCH_PCALA64_HI12:
    return RE_LOONGARCH_PAGE_PC;
  case R_LARCH_GOT_HI20:
  case R_LARCH_GOT_LO12:
  case R_LARCH_GOT64_LO20:
  case R_LARCH_GOT64_HI12:
  case R_LARCH_TLS_IE_HI20:
  case R_LARCH_TLS_IE_LO12:
  case R_LARCH_TLS_IE64_LO20:
  case R_LARCH_TLS_IE64_HI12:
    return R_GOT;
  case R_LARCH_TLS_LD_HI20:
    return R_TLSLD_GOT;
  case R_LARCH_TLS_GD_HI20:
    return R_TLSGD_GOT;
  case R_LARCH_TLS_LE_ADD_R:
  case R_LARCH_RELAX:
    return ctx.arg.relax ? R_RELAX_HINT : R_NONE;
  case R_LARCH_ALIGN:
    return R_RELAX_HINT;
  case R_LARCH_TLS_DESC_PC_HI20:
  case R_LARCH_TLS_DESC64_PC_LO20:
  case R_LARCH_TLS_DESC64_PC_HI12:
    return RE_LOONGARCH_TLSDESC_PAGE_PC;
  case R_LARCH_TLS_DESC_PC_LO12:
  case R_LARCH_TLS_DESC_LD:
  case R_LARCH_TLS_DESC_HI20:
  case R_LARCH_TLS_DESC_LO12:
  case R_LARCH_TLS_DESC64_LO20:
  case R_LARCH_TLS_DESC64_HI12:
    return R_TLSDESC;
  case R_LARCH_TLS_DESC_CALL:
    return R_TLSDESC_CALL;
  case R_LARCH_TLS_LD_PCREL20_S2:
    return R_TLSLD_PC;
  case R_LARCH_TLS_GD_PCREL20_S2:
    return R_TLSGD_PC;
  case R_LARCH_TLS_DESC_PCREL20_S2:
    return R_TLSDESC_PC;

  // Other known relocs that are explicitly unimplemented:
  //
  // - psABI v1 relocs that need a stateful stack machine to work, and not
  //   required when implementing psABI v2;
  // - relocs that are not used anywhere (R_LARCH_{ADD,SUB}_24 [1], and the
  //   two GNU vtable-related relocs).
  //
  // [1]: https://web.archive.org/web/20230709064026/https://github.com/loongson/LoongArch-Documentation/issues/51
  default:
    Err(ctx) << getErrorLoc(ctx, loc) << "unknown relocation (" << type.v
             << ") against symbol " << &s;
    return R_NONE;
  }
}

bool LoongArch::usesOnlyLowPageBits(RelType type) const {
  switch (type) {
  default:
    return false;
  case R_LARCH_PCALA_LO12:
  case R_LARCH_GOT_LO12:
  case R_LARCH_GOT_PC_LO12:
  case R_LARCH_TLS_IE_PC_LO12:
  case R_LARCH_TLS_DESC_LO12:
  case R_LARCH_TLS_DESC_PC_LO12:
    return true;
  }
}

void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
                         uint64_t val) const {
  switch (rel.type) {
  case R_LARCH_32_PCREL:
    checkInt(ctx, loc, val, 32, rel);
    [[fallthrough]];
  case R_LARCH_32:
  case R_LARCH_TLS_DTPREL32:
    write32le(loc, val);
    return;
  case R_LARCH_64:
  case R_LARCH_TLS_DTPREL64:
  case R_LARCH_64_PCREL:
    write64le(loc, val);
    return;

  // Relocs intended for `pcaddi`.
  case R_LARCH_PCREL20_S2:
  case R_LARCH_TLS_LD_PCREL20_S2:
  case R_LARCH_TLS_GD_PCREL20_S2:
  case R_LARCH_TLS_DESC_PCREL20_S2:
    checkInt(ctx, loc, val, 22, rel);
    checkAlignment(ctx, loc, val, 4, rel);
    write32le(loc, setJ20(read32le(loc), val >> 2));
    return;

  case R_LARCH_B16:
    checkInt(ctx, loc, val, 18, rel);
    checkAlignment(ctx, loc, val, 4, rel);
    write32le(loc, setK16(read32le(loc), val >> 2));
    return;

  case R_LARCH_B21:
    checkInt(ctx, loc, val, 23, rel);
    checkAlignment(ctx, loc, val, 4, rel);
    write32le(loc, setD5k16(read32le(loc), val >> 2));
    return;

  case R_LARCH_B26:
    checkInt(ctx, loc, val, 28, rel);
    checkAlignment(ctx, loc, val, 4, rel);
    write32le(loc, setD10k16(read32le(loc), val >> 2));
    return;

  case R_LARCH_CALL36: {
    // This relocation is designed for adjacent pcaddu18i+jirl pairs that are
    // patched together. Because of the sign extension of these insns'
    // immediate fields, the relocation range is [-128G - 0x20000, +128G -
    // 0x20000) (and the target must of course be 4-byte aligned).
    if (((int64_t)val + 0x20000) != llvm::SignExtend64(val + 0x20000, 38))
      reportRangeError(ctx, loc, rel, Twine(val), llvm::minIntN(38) - 0x20000,
                       llvm::maxIntN(38) - 0x20000);
    checkAlignment(ctx, loc, val, 4, rel);
    // Since jirl sign-extends its offset immediate, add (1 << 17) to the
    // original val to get the correct hi20.
    uint32_t hi20 = extractBits(val + (1 << 17), 37, 18);
    // Despite the name, the lower part is actually 18 bits; because it is
    // 4-byte aligned, only 16 bits are stored.
    uint32_t lo16 = extractBits(val, 17, 2);
    write32le(loc, setJ20(read32le(loc), hi20));
    write32le(loc + 4, setK16(read32le(loc + 4), lo16));
    return;
  }
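
  // A worked CALL36 example (illustrative): val = 0x20000 gives lo16 =
  // 0x8000, which jirl sign-extends to a byte offset of -0x20000; hi20 =
  // extractBits(0x20000 + 0x20000, 37, 18) = 1, so pcaddu18i contributes
  // 1 << 18 = 0x40000, and 0x40000 - 0x20000 reconstructs val.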

  // Relocs intended for `addi`, `ld` or `st`.
  case R_LARCH_PCALA_LO12:
    // We have to again inspect the insn word to handle the R_LARCH_PCALA_LO12
    // on JIRL case: first, JIRL wants the 2 lowest (zero) bits of its
    // immediate removed by us (in contrast to regular R_LARCH_PCALA_LO12);
    // second, its immediate slot width is different too (16, not 12).
    // In this case, process like an R_LARCH_B16, but without overflow
    // checking and only taking the value's lowest 12 bits.
    if (isJirl(read32le(loc))) {
      checkAlignment(ctx, loc, val, 4, rel);
      val = SignExtend64<12>(val);
      write32le(loc, setK16(read32le(loc), val >> 2));
      return;
    }
    [[fallthrough]];
  case R_LARCH_ABS_LO12:
  case R_LARCH_GOT_PC_LO12:
  case R_LARCH_GOT_LO12:
  case R_LARCH_TLS_LE_LO12:
  case R_LARCH_TLS_IE_PC_LO12:
  case R_LARCH_TLS_IE_LO12:
  case R_LARCH_TLS_LE_LO12_R:
  case R_LARCH_TLS_DESC_PC_LO12:
  case R_LARCH_TLS_DESC_LO12:
    write32le(loc, setK12(read32le(loc), extractBits(val, 11, 0)));
    return;

  // Relocs intended for `lu12i.w` or `pcalau12i`.
  case R_LARCH_ABS_HI20:
  case R_LARCH_PCALA_HI20:
  case R_LARCH_GOT_PC_HI20:
  case R_LARCH_GOT_HI20:
  case R_LARCH_TLS_LE_HI20:
  case R_LARCH_TLS_IE_PC_HI20:
  case R_LARCH_TLS_IE_HI20:
  case R_LARCH_TLS_LD_PC_HI20:
  case R_LARCH_TLS_LD_HI20:
  case R_LARCH_TLS_GD_PC_HI20:
  case R_LARCH_TLS_GD_HI20:
  case R_LARCH_TLS_DESC_PC_HI20:
  case R_LARCH_TLS_DESC_HI20:
    write32le(loc, setJ20(read32le(loc), extractBits(val, 31, 12)));
    return;
  case R_LARCH_TLS_LE_HI20_R:
    write32le(loc, setJ20(read32le(loc), extractBits(val + 0x800, 31, 12)));
    return;

  // Relocs intended for `lu32i.d`.
  case R_LARCH_ABS64_LO20:
  case R_LARCH_PCALA64_LO20:
  case R_LARCH_GOT64_PC_LO20:
  case R_LARCH_GOT64_LO20:
  case R_LARCH_TLS_LE64_LO20:
  case R_LARCH_TLS_IE64_PC_LO20:
  case R_LARCH_TLS_IE64_LO20:
  case R_LARCH_TLS_DESC64_PC_LO20:
  case R_LARCH_TLS_DESC64_LO20:
    write32le(loc, setJ20(read32le(loc), extractBits(val, 51, 32)));
    return;

  // Relocs intended for `lu52i.d`.
  case R_LARCH_ABS64_HI12:
  case R_LARCH_PCALA64_HI12:
  case R_LARCH_GOT64_PC_HI12:
  case R_LARCH_GOT64_HI12:
  case R_LARCH_TLS_LE64_HI12:
  case R_LARCH_TLS_IE64_PC_HI12:
  case R_LARCH_TLS_IE64_HI12:
  case R_LARCH_TLS_DESC64_PC_HI12:
  case R_LARCH_TLS_DESC64_HI12:
    write32le(loc, setK12(read32le(loc), extractBits(val, 63, 52)));
    return;

  case R_LARCH_ADD6:
    *loc = (*loc & 0xc0) | ((*loc + val) & 0x3f);
    return;
  case R_LARCH_ADD8:
    *loc += val;
    return;
  case R_LARCH_ADD16:
    write16le(loc, read16le(loc) + val);
    return;
  case R_LARCH_ADD32:
    write32le(loc, read32le(loc) + val);
    return;
  case R_LARCH_ADD64:
    write64le(loc, read64le(loc) + val);
    return;
  case R_LARCH_ADD_ULEB128:
    handleUleb128(ctx, loc, val);
    return;
  case R_LARCH_SUB6:
    *loc = (*loc & 0xc0) | ((*loc - val) & 0x3f);
    return;
  case R_LARCH_SUB8:
    *loc -= val;
    return;
  case R_LARCH_SUB16:
    write16le(loc, read16le(loc) - val);
    return;
  case R_LARCH_SUB32:
    write32le(loc, read32le(loc) - val);
    return;
  case R_LARCH_SUB64:
    write64le(loc, read64le(loc) - val);
    return;
  case R_LARCH_SUB_ULEB128:
    handleUleb128(ctx, loc, -val);
    return;

  case R_LARCH_MARK_LA:
  case R_LARCH_MARK_PCREL:
    // no-op
    return;

  case R_LARCH_TLS_LE_ADD_R:
  case R_LARCH_RELAX:
    return; // Ignored (for now)

  case R_LARCH_TLS_DESC_LD:
    return; // nothing to do.
  case R_LARCH_TLS_DESC32:
    write32le(loc + 4, val);
    return;
  case R_LARCH_TLS_DESC64:
    write64le(loc + 8, val);
    return;

  default:
    llvm_unreachable("unknown relocation");
  }
}

static bool relaxable(ArrayRef<Relocation> relocs, size_t i) {
  return i + 1 < relocs.size() && relocs[i + 1].type == R_LARCH_RELAX;
}

static bool isPairRelaxable(ArrayRef<Relocation> relocs, size_t i) {
  return relaxable(relocs, i) && relaxable(relocs, i + 2) &&
         relocs[i].offset + 4 == relocs[i + 2].offset;
}

// Relax code sequence.
// From:
//    pcalau12i $a0, %pc_hi20(sym) | %ld_pc_hi20(sym) | %gd_pc_hi20(sym)
//                 | %desc_pc_hi20(sym)
//    addi.w/d $a0, $a0, %pc_lo12(sym) | %got_pc_lo12(sym) | %got_pc_lo12(sym)
//                 | %desc_pc_lo12(sym)
// To:
//    pcaddi $a0, %pc_lo12(sym) | %got_pc_lo12(sym) | %got_pc_lo12(sym)
//               | %desc_pcrel_20(sym)
//
// From:
//    pcalau12i $a0, %got_pc_hi20(sym_got)
//    ld.w/d $a0, $a0, %got_pc_lo12(sym_got)
// To:
//    pcaddi $a0, %got_pc_hi20(sym_got)
static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i,
                            uint64_t loc, Relocation &rHi20, Relocation &rLo12,
                            uint32_t &remove) {
  // Check that the relocations form a relaxable sequence.
  if (!((rHi20.type == R_LARCH_PCALA_HI20 &&
         rLo12.type == R_LARCH_PCALA_LO12) ||
        (rHi20.type == R_LARCH_GOT_PC_HI20 &&
         rLo12.type == R_LARCH_GOT_PC_LO12) ||
        (rHi20.type == R_LARCH_TLS_GD_PC_HI20 &&
         rLo12.type == R_LARCH_GOT_PC_LO12) ||
        (rHi20.type == R_LARCH_TLS_LD_PC_HI20 &&
         rLo12.type == R_LARCH_GOT_PC_LO12) ||
        (rHi20.type == R_LARCH_TLS_DESC_PC_HI20 &&
         rLo12.type == R_LARCH_TLS_DESC_PC_LO12)))
    return;

  // GOT references to absolute symbols can't be relaxed to use pcaddi in
  // position-independent code, because these instructions produce a relative
  // address.
  // Also skip undefined, preemptible and STT_GNU_IFUNC symbols, because these
  // symbols may be resolved at runtime.
  if (rHi20.type == R_LARCH_GOT_PC_HI20 &&
      (!rHi20.sym->isDefined() || rHi20.sym->isPreemptible ||
       rHi20.sym->isGnuIFunc() ||
       (ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section)))
    return;

  uint64_t dest = 0;
  if (rHi20.expr == RE_LOONGARCH_PLT_PAGE_PC)
    dest = rHi20.sym->getPltVA(ctx);
  else if (rHi20.expr == RE_LOONGARCH_PAGE_PC ||
           rHi20.expr == RE_LOONGARCH_GOT_PAGE_PC)
    dest = rHi20.sym->getVA(ctx);
  else if (rHi20.expr == RE_LOONGARCH_TLSGD_PAGE_PC)
    dest = ctx.in.got->getGlobalDynAddr(*rHi20.sym);
  else if (rHi20.expr == RE_LOONGARCH_TLSDESC_PAGE_PC)
    dest = ctx.in.got->getTlsDescAddr(*rHi20.sym);
  else {
    Err(ctx) << getErrorLoc(ctx, (const uint8_t *)loc) << "unknown expr ("
             << rHi20.expr << ") against symbol " << rHi20.sym
             << " in relaxPCHi20Lo12";
    return;
  }
  dest += rHi20.addend;

  const int64_t displace = dest - loc;
  // Give up if the displacement is not 4-byte aligned or is out of pcaddi's
  // range.
  if ((displace & 0x3) != 0 || !isInt<22>(displace))
    return;
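
  // (pcaddi encodes a 20-bit signed immediate counting 4-byte units, so the
  // isInt<22> byte-displacement check above corresponds to a reach of
  // +/-2MiB around the relaxed instruction.)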

  // Note: If we can ensure that the .o files generated by LLVM only contain
  // relaxable instruction sequences with R_LARCH_RELAX, then we do not need
  // to decode instructions. The relaxable instruction sequences imply the
  // following constraints:
  // * For relocation pairs related to got_pc, the opcodes of instructions
  //   must be pcalau12i + ld.w/d. In other cases, the opcodes must be
  //   pcalau12i + addi.w/d.
  // * The destination register of pcalau12i is guaranteed to be used only by
  //   the immediately following instruction.
  const uint32_t currInsn = read32le(sec.content().data() + rHi20.offset);
  const uint32_t nextInsn = read32le(sec.content().data() + rLo12.offset);
  // Check that the two instructions use the same register.
  if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
    return;

  sec.relaxAux->relocTypes[i] = R_LARCH_RELAX;
  if (rHi20.type == R_LARCH_TLS_GD_PC_HI20)
    sec.relaxAux->relocTypes[i + 2] = R_LARCH_TLS_GD_PCREL20_S2;
  else if (rHi20.type == R_LARCH_TLS_LD_PC_HI20)
    sec.relaxAux->relocTypes[i + 2] = R_LARCH_TLS_LD_PCREL20_S2;
  else if (rHi20.type == R_LARCH_TLS_DESC_PC_HI20)
    sec.relaxAux->relocTypes[i + 2] = R_LARCH_TLS_DESC_PCREL20_S2;
  else
    sec.relaxAux->relocTypes[i + 2] = R_LARCH_PCREL20_S2;
  sec.relaxAux->writes.push_back(insn(PCADDI, getD5(nextInsn), 0, 0));
  remove = 4;
}

// Relax code sequence.
// From:
//    pcaddu18i $ra, %call36(foo)
//    jirl $ra, $ra, 0
// To:
//    b/bl foo
static void relaxCall36(Ctx &ctx, const InputSection &sec, size_t i,
                        uint64_t loc, Relocation &r, uint32_t &remove) {
  const uint64_t dest =
      (r.expr == R_PLT_PC ? r.sym->getPltVA(ctx) : r.sym->getVA(ctx)) +
      r.addend;

  const int64_t displace = dest - loc;
  // Give up if the displacement is not 4-byte aligned or is out of b[l]'s
  // range.
  if ((displace & 0x3) != 0 || !isInt<28>(displace))
    return;
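
  // (b and bl encode a 26-bit signed immediate counting 4-byte units, so the
  // isInt<28> byte-displacement check above corresponds to a reach of
  // +/-128MiB around the relaxed instruction.)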

  const uint32_t nextInsn = read32le(sec.content().data() + r.offset + 4);
  if (getD5(nextInsn) == R_RA) {
    // convert jirl to bl
    sec.relaxAux->relocTypes[i] = R_LARCH_B26;
    sec.relaxAux->writes.push_back(insn(BL, 0, 0, 0));
    remove = 4;
  } else if (getD5(nextInsn) == R_ZERO) {
    // convert jirl to b
    sec.relaxAux->relocTypes[i] = R_LARCH_B26;
    sec.relaxAux->writes.push_back(insn(B, 0, 0, 0));
    remove = 4;
  }
}

// Relax code sequence.
// From:
//    lu12i.w $rd, %le_hi20_r(sym)
//    add.w/d $rd, $rd, $tp, %le_add_r(sym)
//    addi/ld/st.w/d $rd, $rd, %le_lo12_r(sym)
// To:
//    addi/ld/st.w/d $rd, $tp, %le_lo12_r(sym)
static void relaxTlsLe(Ctx &ctx, const InputSection &sec, size_t i,
                       uint64_t loc, Relocation &r, uint32_t &remove) {
  uint64_t val = r.sym->getVA(ctx, r.addend);
  // Give up if val exceeds the range of addi/ld/st.
  if (!isInt<12>(val))
    return;
  uint32_t currInsn = read32le(sec.content().data() + r.offset);
  switch (r.type) {
  case R_LARCH_TLS_LE_HI20_R:
  case R_LARCH_TLS_LE_ADD_R:
    sec.relaxAux->relocTypes[i] = R_LARCH_RELAX;
    remove = 4;
    break;
  case R_LARCH_TLS_LE_LO12_R:
    sec.relaxAux->writes.push_back(setJ5(currInsn, R_TP));
    sec.relaxAux->relocTypes[i] = R_LARCH_TLS_LE_LO12_R;
    break;
  }
}

static bool relax(Ctx &ctx, InputSection &sec) {
  const uint64_t secAddr = sec.getVA();
  const MutableArrayRef<Relocation> relocs = sec.relocs();
  auto &aux = *sec.relaxAux;
  bool changed = false;
  ArrayRef<SymbolAnchor> sa = ArrayRef(aux.anchors);
  uint64_t delta = 0;

  std::fill_n(aux.relocTypes.get(), relocs.size(), R_LARCH_NONE);
  aux.writes.clear();
  for (auto [i, r] : llvm::enumerate(relocs)) {
    const uint64_t loc = secAddr + r.offset - delta;
    uint32_t &cur = aux.relocDeltas[i], remove = 0;
    switch (r.type) {
    case R_LARCH_ALIGN: {
      const uint64_t addend =
          r.sym->isUndefined() ? Log2_64(r.addend) + 1 : r.addend;
      const uint64_t allBytes = (1ULL << (addend & 0xff)) - 4;
      const uint64_t align = 1ULL << (addend & 0xff);
      const uint64_t maxBytes = addend >> 8;
      const uint64_t off = loc & (align - 1);
      const uint64_t curBytes = off == 0 ? 0 : align - off;
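      // Illustrative example (arbitrary numbers): a defined-symbol addend of
      // 0x404 means align = 1 << 4 = 16 with maxBytes = 4, so allBytes = 12;
      // at loc % 16 == 8 we would need curBytes = 8 > maxBytes, and all 12
      // padding bytes get removed.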
      // All bytes beyond the alignment boundary should be removed.
      // If more bytes would be emitted than the maximum allowed, remove them
      // all.
      if (maxBytes != 0 && curBytes > maxBytes)
        remove = allBytes;
      else
        remove = allBytes - curBytes;
      // If we can't satisfy this alignment, we've found a bad input.
      if (LLVM_UNLIKELY(static_cast<int32_t>(remove) < 0)) {
        Err(ctx) << getErrorLoc(ctx, (const uint8_t *)loc)
                 << "insufficient padding bytes for " << r.type << ": "
                 << allBytes << " bytes available for "
                 << "requested alignment of " << align << " bytes";
        remove = 0;
      }
      break;
    }
    case R_LARCH_PCALA_HI20:
    case R_LARCH_GOT_PC_HI20:
    case R_LARCH_TLS_GD_PC_HI20:
    case R_LARCH_TLS_LD_PC_HI20:
    case R_LARCH_TLS_DESC_PC_HI20:
      // The overflow check for i+2 will be carried out in isPairRelaxable.
      if (r.expr != RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC &&
          r.expr != R_RELAX_TLS_GD_TO_LE && isPairRelaxable(relocs, i))
        relaxPCHi20Lo12(ctx, sec, i, loc, r, relocs[i + 2], remove);
      break;
    case R_LARCH_CALL36:
      if (relaxable(relocs, i))
        relaxCall36(ctx, sec, i, loc, r, remove);
      break;
    case R_LARCH_TLS_LE_HI20_R:
    case R_LARCH_TLS_LE_ADD_R:
    case R_LARCH_TLS_LE_LO12_R:
      if (relaxable(relocs, i))
        relaxTlsLe(ctx, sec, i, loc, r, remove);
      break;
    case R_LARCH_TLS_IE_PC_HI20:
      if (relaxable(relocs, i) && r.expr == R_RELAX_TLS_IE_TO_LE &&
          isUInt<12>(r.sym->getVA(ctx, r.addend)))
        remove = 4;
      break;
    }

    // For all anchors whose offsets are <= r.offset, they are preceded by
    // the previous relocation whose `relocDeltas` value equals `delta`.
    // Decrease their st_value and update their st_size.
    for (; sa.size() && sa[0].offset <= r.offset; sa = sa.slice(1)) {
      if (sa[0].end)
        sa[0].d->size = sa[0].offset - delta - sa[0].d->value;
      else
        sa[0].d->value = sa[0].offset - delta;
    }
    delta += remove;
    if (delta != cur) {
      cur = delta;
      changed = true;
    }
  }

  for (const SymbolAnchor &a : sa) {
    if (a.end)
      a.d->size = a.offset - delta - a.d->value;
    else
      a.d->value = a.offset - delta;
  }
  // Inform assignAddresses that the size has changed.
  if (!isUInt<32>(delta))
    Fatal(ctx) << "section size decrease is too large: " << delta;
  sec.bytesDropped = delta;
  return changed;
}

// Convert TLS IE to LE in the normal or medium code model.
// Original code sequence:
//  * pcalau12i $a0, %ie_pc_hi20(sym)
//  * ld.d      $a0, $a0, %ie_pc_lo12(sym)
//
// The converted code sequence is as follows:
//  * lu12i.w   $a0, %le_hi20(sym)      # le_hi20 != 0, otherwise NOP
//  * ori       $a0, src, %le_lo12(sym) # le_hi20 != 0, src = $a0,
//                                      # otherwise, src = $zero
//
// When relaxation is enabled, the redundant NOPs can be removed.
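// For instance (illustrative values): with val = 0x1234 (not a uInt12) the
// pair becomes `lu12i.w $a0, 1` + `ori $a0, $a0, 0x234`; with val = 0x234 it
// becomes a NOP + `ori $a0, $zero, 0x234`.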
static void tlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) {
  assert(isInt<32>(val) &&
         "val exceeds the range of medium code model in tlsIeToLe");

  bool isUInt12 = isUInt<12>(val);
  const uint32_t currInsn = read32le(loc);
  switch (rel.type) {
  case R_LARCH_TLS_IE_PC_HI20:
    if (isUInt12)
      write32le(loc, insn(ANDI, R_ZERO, R_ZERO, 0)); // nop
    else
      write32le(loc, insn(LU12I_W, getD5(currInsn), extractBits(val, 31, 12),
                          0)); // lu12i.w $a0, %le_hi20
    break;
  case R_LARCH_TLS_IE_PC_LO12:
    if (isUInt12)
      write32le(loc, insn(ORI, getD5(currInsn), R_ZERO,
                          val)); // ori $a0, $zero, %le_lo12
    else
      write32le(loc, insn(ORI, getD5(currInsn), getJ5(currInsn),
                          lo12(val))); // ori $a0, $a0, %le_lo12
    break;
  }
}

// Convert TLSDESC GD/LD to IE.
// In the normal or medium code model, there are two forms of code sequences:
//  * pcalau12i $a0, %desc_pc_hi20(sym_desc)
//  * addi.d    $a0, $a0, %desc_pc_lo12(sym_desc)
//  * ld.d      $ra, $a0, %desc_ld(sym_desc)
//  * jirl      $ra, $ra, %desc_call(sym_desc)
//  ------
//  * pcaddi    $a0, %desc_pcrel_20(a)
//  * load      $ra, $a0, %desc_ld(a)
//  * jirl      $ra, $ra, %desc_call(a)
//
// The code sequence obtained is as follows:
//  * pcalau12i $a0, %ie_pc_hi20(sym_ie)
//  * ld.[wd]   $a0, $a0, %ie_pc_lo12(sym_ie)
//
// For simplicity, in both tlsdescToIe and tlsdescToLe we always convert the
// preceding instructions to NOPs, because both forms of code sequence
// (corresponding to the relocation combinations
// R_LARCH_TLS_DESC_PC_HI20+R_LARCH_TLS_DESC_PC_LO12 and
// R_LARCH_TLS_DESC_PCREL20_S2) follow the same conversion process.
//
// When relaxation is enabled, the redundant NOPs can be removed.
void LoongArch::tlsdescToIe(uint8_t *loc, const Relocation &rel,
                            uint64_t val) const {
  switch (rel.type) {
  case R_LARCH_TLS_DESC_PC_HI20:
  case R_LARCH_TLS_DESC_PC_LO12:
  case R_LARCH_TLS_DESC_PCREL20_S2:
    write32le(loc, insn(ANDI, R_ZERO, R_ZERO, 0)); // nop
    break;
  case R_LARCH_TLS_DESC_LD:
    write32le(loc, insn(PCALAU12I, R_A0, 0, 0)); // pcalau12i $a0, %ie_pc_hi20
    relocateNoSym(loc, R_LARCH_TLS_IE_PC_HI20, val);
    break;
  case R_LARCH_TLS_DESC_CALL:
    write32le(loc, insn(ctx.arg.is64 ? LD_D : LD_W, R_A0, R_A0,
                        0)); // ld.[wd] $a0, $a0, %ie_pc_lo12
    relocateNoSym(loc, R_LARCH_TLS_IE_PC_LO12, val);
    break;
  default:
    llvm_unreachable("unsupported relocation for TLSDESC to IE");
  }
}

// Convert TLSDESC GD/LD to LE.
// The code sequence obtained in the normal or medium code model is as follows:
//  * lu12i.w $a0, %le_hi20(sym)      # le_hi20 != 0, otherwise NOP
//  * ori     $a0, src, %le_lo12(sym) # le_hi20 != 0, src = $a0,
//                                    # otherwise, src = $zero
// See the comment in tlsdescToIe for detailed information.
void LoongArch::tlsdescToLe(uint8_t *loc, const Relocation &rel,
                            uint64_t val) const {
  assert(isInt<32>(val) &&
         "val exceeds the range of medium code model in tlsdescToLe");

  bool isUInt12 = isUInt<12>(val);
  switch (rel.type) {
  case R_LARCH_TLS_DESC_PC_HI20:
  case R_LARCH_TLS_DESC_PC_LO12:
  case R_LARCH_TLS_DESC_PCREL20_S2:
    write32le(loc, insn(ANDI, R_ZERO, R_ZERO, 0)); // nop
    break;
  case R_LARCH_TLS_DESC_LD:
    if (isUInt12)
      write32le(loc, insn(ANDI, R_ZERO, R_ZERO, 0)); // nop
    else
      write32le(loc, insn(LU12I_W, R_A0, extractBits(val, 31, 12),
                          0)); // lu12i.w $a0, %le_hi20
    break;
  case R_LARCH_TLS_DESC_CALL:
    if (isUInt12)
      write32le(loc, insn(ORI, R_A0, R_ZERO, val)); // ori $a0, $zero, %le_lo12
    else
      write32le(loc,
                insn(ORI, R_A0, R_A0, lo12(val))); // ori $a0, $a0, %le_lo12
    break;
  default:
    llvm_unreachable("unsupported relocation for TLSDESC to LE");
  }
}

// During TLSDESC GD_TO_IE, the converted code sequence always includes an
// instruction related to the Lo12 relocation (ld.[wd]). To obtain the correct
// val in `getRelocTargetVA`, the expr of this instruction should be adjusted
// to R_RELAX_TLS_GD_TO_IE_ABS, while the expr of other instructions related
// to the Hi20 relocation (pcalau12i) should be adjusted to
// RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC. Specifically, in the normal or
// medium code model, the instruction with relocation R_LARCH_TLS_DESC_CALL is
// the candidate of the Lo12 relocation.
RelExpr LoongArch::adjustTlsExpr(RelType type, RelExpr expr) const {
  if (expr == R_RELAX_TLS_GD_TO_IE) {
    if (type != R_LARCH_TLS_DESC_CALL)
      return RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC;
    return R_RELAX_TLS_GD_TO_IE_ABS;
  }
  return expr;
}

void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
  const unsigned bits = ctx.arg.is64 ? 64 : 32;
  uint64_t secAddr = sec.getOutputSection()->addr;
  if (auto *s = dyn_cast<InputSection>(&sec))
    secAddr += s->outSecOff;
  else if (auto *ehIn = dyn_cast<EhInputSection>(&sec))
    secAddr += ehIn->getParent()->outSecOff;
  bool isExtreme = false, isRelax = false;
  const MutableArrayRef<Relocation> relocs = sec.relocs();
  for (size_t i = 0, size = relocs.size(); i != size; ++i) {
    Relocation &rel = relocs[i];
    uint8_t *loc = buf + rel.offset;
    uint64_t val = SignExtend64(
        sec.getRelocTargetVA(ctx, rel, secAddr + rel.offset), bits);

    switch (rel.expr) {
    case R_RELAX_HINT:
      continue;
    case R_RELAX_TLS_IE_TO_LE:
      if (rel.type == R_LARCH_TLS_IE_PC_HI20) {
        // LoongArch does not support IE to LE optimization in the extreme
        // code model. In this case, the relocs are as follows:
        //
        //  * i   -- R_LARCH_TLS_IE_PC_HI20
        //  * i+1 -- R_LARCH_TLS_IE_PC_LO12
        //  * i+2 -- R_LARCH_TLS_IE64_PC_LO20
        //  * i+3 -- R_LARCH_TLS_IE64_PC_HI12
        isExtreme =
            i + 2 < size && relocs[i + 2].type == R_LARCH_TLS_IE64_PC_LO20;
      }
      if (isExtreme) {
        rel.expr = getRelExpr(rel.type, *rel.sym, loc);
        val = SignExtend64(sec.getRelocTargetVA(ctx, rel, secAddr + rel.offset),
                           bits);
        relocateNoSym(loc, rel.type, val);
      } else {
        isRelax = relaxable(relocs, i);
        if (isRelax && rel.type == R_LARCH_TLS_IE_PC_HI20 && isUInt<12>(val))
          continue;
        tlsIeToLe(loc, rel, val);
      }
      continue;
    case RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC:
      if (rel.type == R_LARCH_TLS_DESC_PC_HI20) {
        // LoongArch does not support TLSDESC GD/LD to LE/IE optimization in
        // the extreme code model. In these cases, the relocs are as follows:
        //
        //  * i   -- R_LARCH_TLS_DESC_PC_HI20
        //  * i+1 -- R_LARCH_TLS_DESC_PC_LO12
        //  * i+2 -- R_LARCH_TLS_DESC64_PC_LO20
        //  * i+3 -- R_LARCH_TLS_DESC64_PC_HI12
        isExtreme =
            i + 2 < size && relocs[i + 2].type == R_LARCH_TLS_DESC64_PC_LO20;
      }
      [[fallthrough]];
    case R_RELAX_TLS_GD_TO_IE_ABS:
      if (isExtreme) {
        if (rel.type == R_LARCH_TLS_DESC_CALL)
          continue;
        rel.expr = getRelExpr(rel.type, *rel.sym, loc);
        val = SignExtend64(sec.getRelocTargetVA(ctx, rel, secAddr + rel.offset),
                           bits);
        relocateNoSym(loc, rel.type, val);
      } else {
        tlsdescToIe(loc, rel, val);
      }
      continue;
    case R_RELAX_TLS_GD_TO_LE:
      if (rel.type == R_LARCH_TLS_DESC_PC_HI20) {
        isExtreme =
            i + 2 < size && relocs[i + 2].type == R_LARCH_TLS_DESC64_PC_LO20;
      }
      if (isExtreme) {
        if (rel.type == R_LARCH_TLS_DESC_CALL)
          continue;
        rel.expr = getRelExpr(rel.type, *rel.sym, loc);
        val = SignExtend64(sec.getRelocTargetVA(ctx, rel, secAddr + rel.offset),
                           bits);
        relocateNoSym(loc, rel.type, val);
      } else {
        tlsdescToLe(loc, rel, val);
      }
      continue;
    default:
      break;
    }
    relocate(loc, rel, val);
  }
}

// When relaxing just R_LARCH_ALIGN, relocDeltas is usually changed only once
// in the absence of a linker script. For call and load/store R_LARCH_RELAX,
// code shrinkage may reduce displacement and make more relocations eligible
// for relaxation. Code shrinkage may increase displacement to a
// call/load/store target at a higher fixed address, invalidating an earlier
// relaxation. Any change in section sizes can have cascading effect and
// require another relaxation pass.
bool LoongArch::relaxOnce(int pass) const {
  if (ctx.arg.relocatable)
    return false;

  if (pass == 0)
    initSymbolAnchors(ctx);

  SmallVector<InputSection *, 0> storage;
  bool changed = false;
  for (OutputSection *osec : ctx.outputSections) {
    if (!(osec->flags & SHF_EXECINSTR))
      continue;
    for (InputSection *sec : getInputSections(*osec, storage))
      changed |= relax(ctx, *sec);
  }
  return changed;
}

void LoongArch::finalizeRelax(int passes) const {
  Log(ctx) << "relaxation passes: " << passes;
  SmallVector<InputSection *, 0> storage;
  for (OutputSection *osec : ctx.outputSections) {
    if (!(osec->flags & SHF_EXECINSTR))
      continue;
    for (InputSection *sec : getInputSections(*osec, storage)) {
      RelaxAux &aux = *sec->relaxAux;
      if (!aux.relocDeltas)
        continue;

      MutableArrayRef<Relocation> rels = sec->relocs();
      ArrayRef<uint8_t> old = sec->content();
      size_t newSize = old.size() - aux.relocDeltas[rels.size() - 1];
      size_t writesIdx = 0;
      uint8_t *p = ctx.bAlloc.Allocate<uint8_t>(newSize);
      uint64_t offset = 0;
      int64_t delta = 0;
      sec->content_ = p;
      sec->size = newSize;
      sec->bytesDropped = 0;

      // Update section content: remove NOPs for R_LARCH_ALIGN and rewrite
      // instructions for relaxed relocations.
      for (size_t i = 0, e = rels.size(); i != e; ++i) {
        uint32_t remove = aux.relocDeltas[i] - delta;
        delta = aux.relocDeltas[i];
        if (remove == 0 && aux.relocTypes[i] == R_LARCH_NONE)
          continue;

        // Copy from last location to the current relocated location.
        Relocation &r = rels[i];
        uint64_t size = r.offset - offset;
        memcpy(p, old.data() + offset, size);
        p += size;

        int64_t skip = 0;
        if (RelType newType = aux.relocTypes[i]) {
          switch (newType) {
          case R_LARCH_RELAX:
            break;
          case R_LARCH_PCREL20_S2:
            skip = 4;
            write32le(p, aux.writes[writesIdx++]);
            // RelExpr is needed for relocating.
            r.expr = r.sym->hasFlag(NEEDS_PLT) ? R_PLT_PC : R_PC;
            break;
          case R_LARCH_B26:
          case R_LARCH_TLS_LE_LO12_R:
            skip = 4;
            write32le(p, aux.writes[writesIdx++]);
            break;
          case R_LARCH_TLS_GD_PCREL20_S2:
            // Note: R_LARCH_TLS_LD_PCREL20_S2 must also use R_TLSGD_PC
            // instead of R_TLSLD_PC due to historical reasons. In fact, right
            // now TLSLD behaves exactly like TLSGD on LoongArch.
            //
            // This reason has also been mentioned in mold commit:
            // https://github.com/rui314/mold/commit/5dfa1cf07c03bd57cb3d493b652ef22441bcd71c
          case R_LARCH_TLS_LD_PCREL20_S2:
            skip = 4;
            write32le(p, aux.writes[writesIdx++]);
            r.expr = R_TLSGD_PC;
            break;
          case R_LARCH_TLS_DESC_PCREL20_S2:
            skip = 4;
            write32le(p, aux.writes[writesIdx++]);
            r.expr = R_TLSDESC_PC;
            break;
          default:
            llvm_unreachable("unsupported type");
          }
        }

        p += skip;
        offset = r.offset + skip + remove;
      }
      memcpy(p, old.data() + offset, old.size() - offset);

      // Subtract the previous relocDeltas value from the relocation offset.
      // For a pair of R_LARCH_XXX/R_LARCH_RELAX with the same offset,
      // decrease their r_offset by the same delta.
      delta = 0;
      for (size_t i = 0, e = rels.size(); i != e;) {
        uint64_t cur = rels[i].offset;
        do {
          rels[i].offset -= delta;
          if (aux.relocTypes[i] != R_LARCH_NONE)
            rels[i].type = aux.relocTypes[i];
        } while (++i != e && rels[i].offset == cur);
        delta = aux.relocDeltas[i - 1];
      }
    }
  }
}

void elf::setLoongArchTargetInfo(Ctx &ctx) {
  ctx.target.reset(new LoongArch(ctx));
}