X86_64.cpp source code [llvm_projects/lld/ELF/Arch/X86_64.cpp]

//===- X86_64.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
8
9	#include "OutputSections.h"
10	#include "Relocations.h"
11	#include "Symbols.h"
12	#include "SyntheticSections.h"
13	#include "Target.h"
14	#include "TargetImpl.h"
15	#include "llvm/BinaryFormat/ELF.h"
16	#include "llvm/Support/Endian.h"
17	#include "llvm/Support/MathExtras.h"
18
19	using namespace llvm;
20	using namespace llvm::object;
21	using namespace llvm::support::endian;
22	using namespace llvm::ELF;
23	using namespace lld;
24	using namespace lld::elf;
25
26	namespace {
27	class X86_64 : public TargetInfo {
28	public:
29	X86_64(Ctx &);
30	int getTlsGdRelaxSkip(RelType type) const override;
31	RelExpr getRelExpr(RelType type, const Symbol &s,
32	const uint8_t loc) const* override;
33	RelType getDynRel(RelType type) const override;
34	void writeGotPltHeader(uint8_t buf) const* override;
35	void writeGotPlt(uint8_t buf, const* Symbol &s) const override;
36	void writeIgotPlt(uint8_t buf, const* Symbol &s) const override;
37	void writePltHeader(uint8_t buf) const* override;
38	void writePlt(uint8_t buf, const* Symbol &sym,
39	uint64_t pltEntryAddr) const override;
40	void relocate(uint8_t loc, const* Relocation &rel,
41	uint64_t val) const override;
42	int64_t getImplicitAddend(const uint8_t buf, RelType type) const* override;
43	void applyJumpInstrMod(uint8_t *loc, JumpModType type,
44	unsigned size) const override;
45	RelExpr adjustGotPcExpr(RelType type, int64_t addend,
46	const uint8_t loc) const* override;
47	void relocateAlloc(InputSectionBase &sec, uint8_t buf) const* override;
48	bool adjustPrologueForCrossSplitStack(uint8_t loc, uint8_t end,
49	uint8_t stOther) const override;
50	bool deleteFallThruJmpInsn(InputSection &is, InputFile *file,
51	InputSection nextIS) const* override;
52	bool relaxOnce(int pass) const override;
53	void applyBranchToBranchOpt() const override;
54
55	private:
56	void relaxTlsGdToLe(uint8_t loc, const* Relocation &rel, uint64_t val) const;
57	void relaxTlsGdToIe(uint8_t loc, const* Relocation &rel, uint64_t val) const;
58	void relaxTlsLdToLe(uint8_t loc, const* Relocation &rel, uint64_t val) const;
59	void relaxTlsIeToLe(uint8_t loc, const* Relocation &rel, uint64_t val) const;
60	};
61	} // namespace
62
// This is a vector of NOP instructions of sizes from 1 to 9 bytes; the entry
// at index i is a single NOP encoding that is exactly i + 1 bytes long. The
// appropriately sized instructions are used to fill the gaps between sections
// which are executed during fall through.
static const std::vector<std::vector<uint8_t>> nopInstructions = {
    {0x90},
    {0x66, 0x90},
    {0x0f, 0x1f, 0x00},
    {0x0f, 0x1f, 0x40, 0x00},
    {0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
    {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}};
76
77	X86_64::X86_64(Ctx &ctx) : TargetInfo (ctx) {
78	copyRel = R_X86_64_COPY;
79	gotRel = R_X86_64_GLOB_DAT;
80	pltRel = R_X86_64_JUMP_SLOT;
81	relativeRel = R_X86_64_RELATIVE;
82	iRelativeRel = R_X86_64_IRELATIVE;
83	symbolicRel = R_X86_64_64;
84	tlsDescRel = R_X86_64_TLSDESC;
85	tlsGotRel = R_X86_64_TPOFF64;
86	tlsModuleIndexRel = R_X86_64_DTPMOD64;
87	tlsOffsetRel = R_X86_64_DTPOFF64;
88	gotBaseSymInGotPlt = true;
89	gotEntrySize = `8`;
90	pltHeaderSize = `16`;
91	pltEntrySize = `16`;
92	ipltEntrySize = `16`;
93	trapInstr = {`0xcc`, `0xcc`, `0xcc`, `0xcc`}; // 0xcc = INT3
94	nopInstrs = nopInstructions;
95
96	// Align to the large page size (known as a superpage or huge page).
97	// FreeBSD automatically promotes large, superpage-aligned allocations.
98	defaultImageBase = `0x200000`;
99	}
100
101	int X86_64::getTlsGdRelaxSkip(RelType type) const {
102	// TLSDESC relocations are processed separately. See relaxTlsGdToLe below.
103	return type == R_X86_64_GOTPC32_TLSDESC \|\|
104	type == R_X86_64_CODE_4_GOTPC32_TLSDESC \|\|
105	type == R_X86_64_TLSDESC_CALL
106	? `1`
107	: `2`;
108	}
109
// Opcodes for the different X86_64 jmp instructions. J_JMP_32 is the
// unconditional direct jump; the remaining named values are the conditional
// jumps recognised by getJmpInsnType. J_UNKNOWN marks anything else.
enum JmpInsnOpcode : uint32_t {
  J_JMP_32,
  J_JNE_32,
  J_JE_32,
  J_JG_32,
  J_JGE_32,
  J_JB_32,
  J_JBE_32,
  J_JL_32,
  J_JLE_32,
  J_JA_32,
  J_JAE_32,
  J_UNKNOWN,
};
125
126	// Given the first (optional) and second byte of the insn's opcode, this
127	// returns the corresponding enum value.
128	static JmpInsnOpcode getJmpInsnType(const uint8_t *first,
129	const uint8_t *second) {
130	if (*second == `0xe9`)
131	return J_JMP_32;
132
133	if (first == nullptr)
134	return J_UNKNOWN;
135
136	if (*first == `0x0f`) {
137	switch (*second) {
138	case `0x84`:
139	return J_JE_32;
140	case `0x85`:
141	return J_JNE_32;
142	case `0x8f`:
143	return J_JG_32;
144	case `0x8d`:
145	return J_JGE_32;
146	case `0x82`:
147	return J_JB_32;
148	case `0x86`:
149	return J_JBE_32;
150	case `0x8c`:
151	return J_JL_32;
152	case `0x8e`:
153	return J_JLE_32;
154	case `0x87`:
155	return J_JA_32;
156	case `0x83`:
157	return J_JAE_32;
158	}
159	}
160	return J_UNKNOWN;
161	}
162
163	// Return the relocation index for input section IS with a specific Offset.
164	// Returns the maximum size of the vector if no such relocation is found.
165	static unsigned getRelocationWithOffset(const InputSection &is,
166	uint64_t offset) {
167	unsigned size = is.relocs().size();
168	for (unsigned i = size - `1`; i + `1` > `0`; --i) {
169	if (is.relocs()[i].offset == offset && is.relocs()[i].expr != R_NONE)
170	return i;
171	}
172	return size;
173	}
174
175	// Returns true if R corresponds to a relocation used for a jump instruction.
176	// TODO: Once special relocations for relaxable jump instructions are available,
177	// this should be modified to use those relocations.
178	static bool isRelocationForJmpInsn(Relocation &R) {
179	return R.type == R_X86_64_PLT32 \|\| R.type == R_X86_64_PC32 \|\|
180	R.type == R_X86_64_PC8;
181	}
182
183	// Return true if Relocation R points to the first instruction in the
184	// next section.
185	// TODO: Delete this once psABI reserves a new relocation type for fall thru
186	// jumps.
187	static bool isFallThruRelocation(InputSection &is, InputFile *file,
188	InputSection *nextIS, Relocation &r) {
189	if (!isRelocationForJmpInsn(R&: r))
190	return false;
191
192	uint64_t addrLoc = is.getOutputSection()->addr + is.outSecOff + r.offset;
193	uint64_t targetOffset = is.getRelocTargetVA(is.getCtx(), r, p: addrLoc);
194
195	// If this jmp is a fall thru, the target offset is the beginning of the
196	// next section.
197	uint64_t nextSectionOffset =
198	nextIS->getOutputSection()->addr + nextIS->outSecOff;
199	return (addrLoc + `4` + targetOffset) == nextSectionOffset;
200	}
201
202	// Return the jmp instruction opcode that is the inverse of the given
203	// opcode. For example, JE inverted is JNE.
204	static JmpInsnOpcode invertJmpOpcode(const JmpInsnOpcode opcode) {
205	switch (opcode) {
206	case J_JE_32:
207	return J_JNE_32;
208	case J_JNE_32:
209	return J_JE_32;
210	case J_JG_32:
211	return J_JLE_32;
212	case J_JGE_32:
213	return J_JL_32;
214	case J_JB_32:
215	return J_JAE_32;
216	case J_JBE_32:
217	return J_JA_32;
218	case J_JL_32:
219	return J_JGE_32;
220	case J_JLE_32:
221	return J_JG_32;
222	case J_JA_32:
223	return J_JBE_32;
224	case J_JAE_32:
225	return J_JB_32;
226	default:
227	return J_UNKNOWN;
228	}
229	}
230
231	// Deletes direct jump instruction in input sections that jumps to the
232	// following section as it is not required. If there are two consecutive jump
233	// instructions, it checks if they can be flipped and one can be deleted.
234	// For example:
235	// .section .text
236	// a.BB.foo:
237	// ...
238	// 10: jne aa.BB.foo
239	// 16: jmp bar
240	// aa.BB.foo:
241	// ...
242	//
243	// can be converted to:
244	// a.BB.foo:
245	// ...
246	// 10: je bar #jne flipped to je and the jmp is deleted.
247	// aa.BB.foo:
248	// ...
249	bool X86_64::deleteFallThruJmpInsn(InputSection &is, InputFile *file,
250	InputSection nextIS) const* {
251	const unsigned sizeOfDirectJmpInsn = `5`;
252
253	if (nextIS == nullptr)
254	return false;
255
256	if (is.getSize() < sizeOfDirectJmpInsn)
257	return false;
258
259	// If this jmp insn can be removed, it is the last insn and the
260	// relocation is 4 bytes before the end.
261	unsigned rIndex = getRelocationWithOffset(is, offset: is.getSize() - `4`);
262	if (rIndex == is.relocs().size())
263	return false;
264
265	Relocation &r = is.relocs()[rIndex];
266
267	// Check if the relocation corresponds to a direct jmp.
268	const uint8_t *secContents = is.content().data();
269	// If it is not a direct jmp instruction, there is nothing to do here.
270	if (*(secContents + r.offset - `1`) != `0xe9`)
271	return false;
272
273	if (isFallThruRelocation(is, file, nextIS, r)) {
274	// This is a fall thru and can be deleted.
275	r.expr = R_NONE;
276	r.offset = `0`;
277	is.drop_back(num: sizeOfDirectJmpInsn);
278	is.nopFiller = true;
279	return true;
280	}
281
282	// Now, check if flip and delete is possible.
283	const unsigned sizeOfJmpCCInsn = `6`;
284	// To flip, there must be at least one JmpCC and one direct jmp.
285	if (is.getSize() < sizeOfDirectJmpInsn + sizeOfJmpCCInsn)
286	return false;
287
288	unsigned rbIndex =
289	getRelocationWithOffset(is, offset: (is.getSize() - sizeOfDirectJmpInsn - `4`));
290	if (rbIndex == is.relocs().size())
291	return false;
292
293	Relocation &rB = is.relocs()[rbIndex];
294
295	const uint8_t *jmpInsnB = secContents + rB.offset - `1`;
296	JmpInsnOpcode jmpOpcodeB = getJmpInsnType(first: jmpInsnB - `1`, second: jmpInsnB);
297	if (jmpOpcodeB == J_UNKNOWN)
298	return false;
299
300	if (!isFallThruRelocation(is, file, nextIS, r&: rB))
301	return false;
302
303	// jmpCC jumps to the fall thru block, the branch can be flipped and the
304	// jmp can be deleted.
305	JmpInsnOpcode jInvert = invertJmpOpcode(opcode: jmpOpcodeB);
306	if (jInvert == J_UNKNOWN)
307	return false;
308	is.jumpInstrMod = make<JumpInstrMod>();
309	*is.jumpInstrMod = {.offset: rB.offset - `1`, .original: jInvert, .size: `4`};
310	// Move R's values to rB except the offset.
311	rB = {.expr: r.expr, .type: r.type, .offset: rB.offset, .addend: r.addend, .sym: r.sym};
312	// Cancel R
313	r.expr = R_NONE;
314	r.offset = `0`;
315	is.drop_back(num: sizeOfDirectJmpInsn);
316	is.nopFiller = true;
317	return true;
318	}
319
320	bool X86_64::relaxOnce(int pass) const {
321	uint64_t minVA = UINT64_MAX, maxVA = `0`;
322	for (OutputSection *osec : ctx.outputSections) {
323	if (!(osec->flags & SHF_ALLOC))
324	continue;
325	minVA = std::min(a: minVA, b: osec->addr);
326	maxVA = std::max(a: maxVA, b: osec->addr + osec->size);
327	}
328	// If the max VA is under 2^31, GOTPCRELX relocations cannot overfow. In
329	// -pie/-shared, the condition can be relaxed to test the max VA difference as
330	// there is no R_RELAX_GOT_PC_NOPIC.
331	if (isUInt<`31`>(x: maxVA) \|\| (isUInt<`31`>(x: maxVA - minVA) && ctx.arg.isPic))
332	return false;
333
334	SmallVector<InputSection *, `0`> storage;
335	bool changed = false;
336	for (OutputSection *osec : ctx.outputSections) {
337	if (!(osec->flags & SHF_EXECINSTR))
338	continue;
339	for (InputSection sec : getInputSections(os: osec, storage)) {
340	for (Relocation &rel : sec->relocs()) {
341	if (rel.expr != R_RELAX_GOT_PC && rel.expr != R_RELAX_GOT_PC_NOPIC)
342	continue;
343	assert(rel.addend == -`4`);
344
345	Relocation rel1 = rel;
346	rel1.addend = rel.expr == R_RELAX_GOT_PC_NOPIC ? `0` : -`4`;
347	uint64_t v = sec->getRelocTargetVA(ctx, r: rel1,
348	p: sec->getOutputSection()->addr +
349	sec->outSecOff + rel.offset);
350	if (isInt<`32`>(x: v))
351	continue;
352	if (rel.sym->auxIdx == `0`) {
353	rel.sym->allocateAux(ctx);
354	addGotEntry(ctx, sym&: *rel.sym);
355	changed = true;
356	}
357	rel.expr = R_GOT_PC;
358	}
359	}
360	}
361	return changed;
362	}
363
364	RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
365	const uint8_t loc) const* {
366	switch (type) {
367	case R_X86_64_8:
368	case R_X86_64_16:
369	case R_X86_64_32:
370	case R_X86_64_32S:
371	case R_X86_64_64:
372	return R_ABS;
373	case R_X86_64_DTPOFF32:
374	case R_X86_64_DTPOFF64:
375	return R_DTPREL;
376	case R_X86_64_TPOFF32:
377	case R_X86_64_TPOFF64:
378	return R_TPREL;
379	case R_X86_64_TLSDESC_CALL:
380	return R_TLSDESC_CALL;
381	case R_X86_64_TLSLD:
382	return R_TLSLD_PC;
383	case R_X86_64_TLSGD:
384	return R_TLSGD_PC;
385	case R_X86_64_SIZE32:
386	case R_X86_64_SIZE64:
387	return R_SIZE;
388	case R_X86_64_PLT32:
389	return R_PLT_PC;
390	case R_X86_64_PC8:
391	case R_X86_64_PC16:
392	case R_X86_64_PC32:
393	case R_X86_64_PC64:
394	return R_PC;
395	case R_X86_64_GOT32:
396	case R_X86_64_GOT64:
397	return R_GOTPLT;
398	case R_X86_64_GOTPC32_TLSDESC:
399	case R_X86_64_CODE_4_GOTPC32_TLSDESC:
400	return R_TLSDESC_PC;
401	case R_X86_64_GOTPCREL:
402	case R_X86_64_GOTPCRELX:
403	case R_X86_64_REX_GOTPCRELX:
404	case R_X86_64_CODE_4_GOTPCRELX:
405	case R_X86_64_GOTTPOFF:
406	case R_X86_64_CODE_4_GOTTPOFF:
407	case R_X86_64_CODE_6_GOTTPOFF:
408	return R_GOT_PC;
409	case R_X86_64_GOTOFF64:
410	return R_GOTPLTREL;
411	case R_X86_64_PLTOFF64:
412	return R_PLT_GOTPLT;
413	case R_X86_64_GOTPC32:
414	case R_X86_64_GOTPC64:
415	return R_GOTPLTONLY_PC;
416	case R_X86_64_NONE:
417	return R_NONE;
418	default:
419	Err(ctx) << getErrorLoc(ctx, loc) << "unknown relocation (" << type.v
420	<< ") against symbol " << &s;
421	return R_NONE;
422	}
423	}
424
425	void X86_64::writeGotPltHeader(uint8_t buf) const* {
426	// The first entry holds the link-time address of _DYNAMIC. It is documented
427	// in the psABI and glibc before Aug 2021 used the entry to compute run-time
428	// load address of the shared object (note that this is relevant for linking
429	// ld.so, not any other program).
430	write64le(P: buf, V: ctx.mainPart->dynamic ->getVA());
431	}
432
433	void X86_64::writeGotPlt(uint8_t buf, const* Symbol &s) const {
434	// See comments in X86::writeGotPlt.
435	write64le(P: buf, V: s.getPltVA(ctx) + `6`);
436	}
437
438	void X86_64::writeIgotPlt(uint8_t buf, const* Symbol &s) const {
439	// An x86 entry is the address of the ifunc resolver function (for -z rel).
440	if (ctx.arg.writeAddends)
441	write64le(P: buf, V: s.getVA(ctx));
442	}
443
444	void X86_64::writePltHeader(uint8_t buf) const* {
445	const uint8_t pltData[] = {
446	`0xff`, `0x35`, `0`, `0`, `0`, `0`, // pushq GOTPLT+8(%rip)
447	`0xff`, `0x25`, `0`, `0`, `0`, `0`, // jmp GOTPLT+16(%rip)*
448	`0x0f`, `0x1f`, `0x40`, `0x00`, // nop
449	};
450	memcpy(dest: buf, src: pltData, n: sizeof(pltData));
451	uint64_t gotPlt = ctx.in.gotPlt ->getVA();
452	uint64_t plt = ctx.in.ibtPlt ? ctx.in.ibtPlt ->getVA() : ctx.in.plt ->getVA();
453	write32le(P: buf + `2`, V: gotPlt - plt + `2`); // GOTPLT+8
454	write32le(P: buf + `8`, V: gotPlt - plt + `4`); // GOTPLT+16
455	}
456
457	void X86_64::writePlt(uint8_t buf, const* Symbol &sym,
458	uint64_t pltEntryAddr) const {
459	const uint8_t inst[] = {
460	`0xff`, `0x25`, `0`, `0`, `0`, `0`, // jmpq got(%rip)*
461	`0x68`, `0`, `0`, `0`, `0`, // pushq <relocation index>
462	`0xe9`, `0`, `0`, `0`, `0`, // jmpq plt[0]
463	};
464	memcpy(dest: buf, src: inst, n: sizeof(inst));
465
466	write32le(P: buf + `2`, V: sym.getGotPltVA(ctx) - pltEntryAddr - `6`);
467	write32le(P: buf + `7`, V: sym.getPltIdx(ctx));
468	write32le(P: buf + `12`, V: ctx.in.plt ->getVA() - pltEntryAddr - `16`);
469	}
470
471	RelType X86_64::getDynRel(RelType type) const {
472	if (type == R_X86_64_64 \|\| type == R_X86_64_PC64 \|\| type == R_X86_64_SIZE32 \|\|
473	type == R_X86_64_SIZE64)
474	return type;
475	return R_X86_64_NONE;
476	}
477
478	void X86_64::relaxTlsGdToLe(uint8_t loc, const* Relocation &rel,
479	uint64_t val) const {
480	if (rel.type == R_X86_64_TLSGD) {
481	// Convert
482	// .byte 0x66
483	// leaq x@tlsgd(%rip), %rdi
484	// .word 0x6666
485	// rex64
486	// call __tls_get_addr@plt
487	// to the following two instructions.
488	const uint8_t inst[] = {
489	`0x64`, `0x48`, `0x8b`, `0x04`, `0x25`, `0x00`, `0x00`,
490	`0x00`, `0x00`, // mov %fs:0x0,%rax
491	`0x48`, `0x8d`, `0x80`, `0`, `0`, `0`, `0`, // lea x@tpoff,%rax
492	};
493	memcpy(dest: loc - `4`, src: inst, n: sizeof(inst));
494
495	// The original code used a pc relative relocation and so we have to
496	// compensate for the -4 in had in the addend.
497	write32le(P: loc + `8`, V: val + `4`);
498	} else if (rel.type == R_X86_64_GOTPC32_TLSDESC \|\|
499	rel.type == R_X86_64_CODE_4_GOTPC32_TLSDESC) {
500	// Convert leaq x@tlsdesc(%rip), %REG to movq $x@tpoff, %REG.
501	if ((loc[-`3`] & `0xfb`) != `0x48` \|\| loc[-`2`] != `0x8d` \|\|
502	(loc[-`1`] & `0xc7`) != `0x05`) {
503	Err(ctx) << getErrorLoc(ctx, loc: (rel.type == R_X86_64_GOTPC32_TLSDESC)
504	? loc - `3`
505	: loc - `4`)
506	<< "R_X86_64_GOTPC32_TLSDESC/R_X86_64_CODE_4_GOTPC32_TLSDESC "
507	"must be used in leaq x@tlsdesc(%rip), %REG";
508	return;
509	}
510	if (rel.type == R_X86_64_GOTPC32_TLSDESC) {
511	loc[-`3`] = `0x48` \| ((loc[-`3`] >> `2`) & `1`);
512	} else {
513	loc[-`3`] = (loc[-`3`] & ~`0x44`) \| ((loc[-`3`] & `0x44`) >> `2`);
514	}
515	loc[-`2`] = `0xc7`;
516	loc[-`1`] = `0xc0` \| ((loc[-`1`] >> `3`) & `7`);
517
518	write32le(P: loc, V: val + `4`);
519	} else {
520	// Convert call x@tlsdesc(%REG) to xchg ax, ax.*
521	assert(rel.type == R_X86_64_TLSDESC_CALL);
522	loc[`0`] = `0x66`;
523	loc[`1`] = `0x90`;
524	}
525	}
526
527	void X86_64::relaxTlsGdToIe(uint8_t loc, const* Relocation &rel,
528	uint64_t val) const {
529	if (rel.type == R_X86_64_TLSGD) {
530	// Convert
531	// .byte 0x66
532	// leaq x@tlsgd(%rip), %rdi
533	// .word 0x6666
534	// rex64
535	// call __tls_get_addr@plt
536	// to the following two instructions.
537	const uint8_t inst[] = {
538	`0x64`, `0x48`, `0x8b`, `0x04`, `0x25`, `0x00`, `0x00`,
539	`0x00`, `0x00`, // mov %fs:0x0,%rax
540	`0x48`, `0x03`, `0x05`, `0`, `0`, `0`, `0`, // addq x@gottpoff(%rip),%rax
541	};
542	memcpy(dest: loc - `4`, src: inst, n: sizeof(inst));
543
544	// Both code sequences are PC relatives, but since we are moving the
545	// constant forward by 8 bytes we have to subtract the value by 8.
546	write32le(P: loc + `8`, V: val - `8`);
547	} else if (rel.type == R_X86_64_GOTPC32_TLSDESC \|\|
548	rel.type == R_X86_64_CODE_4_GOTPC32_TLSDESC) {
549	// Convert leaq x@tlsdesc(%rip), %REG to movq x@gottpoff(%rip), %REG.
550	if ((loc[-`3`] & `0xfb`) != `0x48` \|\| loc[-`2`] != `0x8d` \|\|
551	(loc[-`1`] & `0xc7`) != `0x05`) {
552	Err(ctx) << getErrorLoc(ctx, loc: (rel.type == R_X86_64_GOTPC32_TLSDESC)
553	? loc - `3`
554	: loc - `4`)
555	<< "R_X86_64_GOTPC32_TLSDESC/R_X86_64_CODE_4_GOTPC32_TLSDESC "
556	"must be used in leaq x@tlsdesc(%rip), %REG";
557	return;
558	}
559	loc[-`2`] = `0x8b`;
560	write32le(P: loc, V: val);
561	} else {
562	// Convert call x@tlsdesc(%rax) to xchg ax, ax.*
563	assert(rel.type == R_X86_64_TLSDESC_CALL);
564	loc[`0`] = `0x66`;
565	loc[`1`] = `0x90`;
566	}
567	}
568
569	// In some conditions,
570	// R_X86_64_GOTTPOFF/R_X86_64_CODE_4_GOTTPOFF/R_X86_64_CODE_6_GOTTPOFF
571	// relocation can be optimized to R_X86_64_TPOFF32 so that it does not use GOT.
572	void X86_64::relaxTlsIeToLe(uint8_t loc, const* Relocation &rel,
573	uint64_t val) const {
574	uint8_t *inst = loc - `3`;
575	uint8_t reg = loc[-`1`] >> `3`;
576	uint8_t *regSlot = loc - `1`;
577
578	if (rel.type == R_X86_64_GOTTPOFF) {
579	// Note that ADD with RSP or R12 is converted to ADD instead of LEA
580	// because LEA with these registers needs 4 bytes to encode and thus
581	// wouldn't fit the space.
582
583	if (memcmp(s1: inst, s2: "\x48\x03\x25", n: `3`) == `0`) {
584	// "addq foo@gottpoff(%rip),%rsp" -> "addq $foo,%rsp"
585	memcpy(dest: inst, src: "\x48\x81\xc4", n: `3`);
586	} else if (memcmp(s1: inst, s2: "\x4c\x03\x25", n: `3`) == `0`) {
587	// "addq foo@gottpoff(%rip),%r12" -> "addq $foo,%r12"
588	memcpy(dest: inst, src: "\x49\x81\xc4", n: `3`);
589	} else if (memcmp(s1: inst, s2: "\x4c\x03", n: `2`) == `0`) {
590	// "addq foo@gottpoff(%rip),%r[8-15]" -> "leaq foo(%r[8-15]),%r[8-15]"
591	memcpy(dest: inst, src: "\x4d\x8d", n: `2`);
592	*regSlot = `0x80` \| (reg << `3`) \| reg;
593	} else if (memcmp(s1: inst, s2: "\x48\x03", n: `2`) == `0`) {
594	// "addq foo@gottpoff(%rip),%reg -> "leaq foo(%reg),%reg"
595	memcpy(dest: inst, src: "\x48\x8d", n: `2`);
596	*regSlot = `0x80` \| (reg << `3`) \| reg;
597	} else if (memcmp(s1: inst, s2: "\x4c\x8b", n: `2`) == `0`) {
598	// "movq foo@gottpoff(%rip),%r[8-15]" -> "movq $foo,%r[8-15]"
599	memcpy(dest: inst, src: "\x49\xc7", n: `2`);
600	*regSlot = `0xc0` \| reg;
601	} else if (memcmp(s1: inst, s2: "\x48\x8b", n: `2`) == `0`) {
602	// "movq foo@gottpoff(%rip),%reg" -> "movq $foo,%reg"
603	memcpy(dest: inst, src: "\x48\xc7", n: `2`);
604	*regSlot = `0xc0` \| reg;
605	} else {
606	Err(ctx)
607	<< getErrorLoc(ctx, loc: loc - `3`)
608	<< "R_X86_64_GOTTPOFF must be used in MOVQ or ADDQ instructions only";
609	}
610	} else if (rel.type == R_X86_64_CODE_4_GOTTPOFF) {
611	if (loc[-`4`] != `0xd5`) {
612	Err(ctx) << getErrorLoc(ctx, loc: loc - `4`)
613	<< "invalid prefix with R_X86_64_CODE_4_GOTTPOFF!";
614	return;
615	}
616	const uint8_t rex = loc[-`3`];
617	loc[-`3`] = (rex & ~`0x44`) \| (rex & `0x44`) >> `2`;
618	*regSlot = `0xc0` \| reg;
619
620	if (loc[-`2`] == `0x8b`) {
621	// "movq foo@gottpoff(%rip),%r[16-31]" -> "movq $foo,%r[16-31]"
622	loc[-`2`] = `0xc7`;
623	} else if (loc[-`2`] == `0x03`) {
624	// "addq foo@gottpoff(%rip),%r[16-31]" -> "addq $foo,%r[16-31]"
625	loc[-`2`] = `0x81`;
626	} else {
627	Err(ctx) << getErrorLoc(ctx, loc: loc - `4`)
628	<< "R_X86_64_CODE_4_GOTTPOFF must be used in MOVQ or ADDQ "
629	"instructions only";
630	}
631	} else if (rel.type == R_X86_64_CODE_6_GOTTPOFF) {
632	if (loc[-`6`] != `0x62`) {
633	Err(ctx) << getErrorLoc(ctx, loc: loc - `6`)
634	<< "invalid prefix with R_X86_64_CODE_6_GOTTPOFF!";
635	return;
636	}
637	// Check bits are satisfied:
638	// loc[-5]: X==1 (inverted polarity), (loc[-5] & 0x7) == 0x4
639	// loc[-4]: W==1, X2==1 (inverted polarity), pp==0b00(NP)
640	// loc[-3]: NF==1 or ND==1
641	// loc[-2]: opcode==0x1 or opcode==0x3
642	// loc[-1]: Mod==0b00, RM==0b101
643	if (((loc[-`5`] & `0x47`) == `0x44`) && ((loc[-`4`] & `0x87`) == `0x84`) &&
644	((loc[-`3`] & `0x14`) != `0`) && (loc[-`2`] == `0x1` \|\| loc[-`2`] == `0x3`) &&
645	((loc[-`1`] & `0xc7`) == `0x5`)) {
646	// "addq %reg1, foo@GOTTPOFF(%rip), %reg2" -> "addq $foo, %reg1, %reg2"
647	// "addq foo@GOTTPOFF(%rip), %reg1, %reg2" -> "addq $foo, %reg1, %reg2"
648	// "{nf} addq %reg1, foo@GOTTPOFF(%rip), %reg2"
649	// -> "{nf} addq $foo, %reg1, %reg2"
650	// "{nf} addq name@GOTTPOFF(%rip), %reg1, %reg2"
651	// -> "{nf} addq $foo, %reg1, %reg2"
652	// "{nf} addq name@GOTTPOFF(%rip), %reg" -> "{nf} addq $foo, %reg"
653	loc[-`2`] = `0x81`;
654	// Move R bits to B bits in EVEX payloads and ModRM byte.
655	const uint8_t evexPayload0 = loc[-`5`];
656	if ((evexPayload0 & (`1` << `7`)) == `0`)
657	loc[-`5`] = (evexPayload0 \| (`1` << `7`)) & ~(`1` << `5`);
658	if ((evexPayload0 & (`1` << `4`)) == `0`)
659	loc[-`5`] = evexPayload0 \| (`1` << `4`) \| (`1` << `3`);
660	*regSlot = `0xc0` \| reg;
661	} else {
662	Err(ctx) << getErrorLoc(ctx, loc: loc - `6`)
663	<< "R_X86_64_CODE_6_GOTTPOFF must be used in ADDQ instructions "
664	"with NDD/NF/NDD+NF only";
665	}
666	} else {
667	llvm_unreachable("Unsupported relocation type!");
668	}
669
670	// The original code used a PC relative relocation.
671	// Need to compensate for the -4 it had in the addend.
672	write32le(P: loc, V: val + `4`);
673	}
674
675	void X86_64::relaxTlsLdToLe(uint8_t loc, const* Relocation &rel,
676	uint64_t val) const {
677	const uint8_t inst[] = {
678	`0x66`, `0x66`, // .word 0x6666
679	`0x66`, // .byte 0x66
680	`0x64`, `0x48`, `0x8b`, `0x04`, `0x25`, `0x00`, `0x00`, `0x00`, `0x00`, // mov %fs:0,%rax
681	};
682
683	if (loc[`4`] == `0xe8`) {
684	// Convert
685	// leaq bar@tlsld(%rip), %rdi # 48 8d 3d <Loc>
686	// callq __tls_get_addr@PLT # e8 <disp32>
687	// leaq bar@dtpoff(%rax), %rcx
688	// to
689	// .word 0x6666
690	// .byte 0x66
691	// mov %fs:0,%rax
692	// leaq bar@tpoff(%rax), %rcx
693	memcpy(dest: loc - `3`, src: inst, n: sizeof(inst));
694	return;
695	}
696
697	if (loc[`4`] == `0xff` && loc[`5`] == `0x15`) {
698	// Convert
699	// leaq x@tlsld(%rip),%rdi # 48 8d 3d <Loc>
700	// call __tls_get_addr@GOTPCREL(%rip) # ff 15 <disp32>*
701	// to
702	// .long 0x66666666
703	// movq %fs:0,%rax
704	// See "Table 11.9: LD -> LE Code Transition (LP64)" in
705	// https://raw.githubusercontent.com/wiki/hjl-tools/x86-psABI/x86-64-psABI-1.0.pdf
706	loc[-`3`] = `0x66`;
707	memcpy(dest: loc - `2`, src: inst, n: sizeof(inst));
708	return;
709	}
710
711	ErrAlways(ctx)
712	<< getErrorLoc(ctx, loc: loc - `3`)
713	<< "expected R_X86_64_PLT32 or R_X86_64_GOTPCRELX after R_X86_64_TLSLD";
714	}
715
716	// A JumpInstrMod at a specific offset indicates that the jump instruction
717	// opcode at that offset must be modified. This is specifically used to relax
718	// jump instructions with basic block sections. This function looks at the
719	// JumpMod and effects the change.
720	void X86_64::applyJumpInstrMod(uint8_t *loc, JumpModType type,
721	unsigned size) const {
722	switch (type) {
723	case J_JMP_32:
724	if (size == `4`)
725	*loc = `0xe9`;
726	else
727	*loc = `0xeb`;
728	break;
729	case J_JE_32:
730	if (size == `4`) {
731	loc[-`1`] = `0x0f`;
732	*loc = `0x84`;
733	} else
734	*loc = `0x74`;
735	break;
736	case J_JNE_32:
737	if (size == `4`) {
738	loc[-`1`] = `0x0f`;
739	*loc = `0x85`;
740	} else
741	*loc = `0x75`;
742	break;
743	case J_JG_32:
744	if (size == `4`) {
745	loc[-`1`] = `0x0f`;
746	*loc = `0x8f`;
747	} else
748	*loc = `0x7f`;
749	break;
750	case J_JGE_32:
751	if (size == `4`) {
752	loc[-`1`] = `0x0f`;
753	*loc = `0x8d`;
754	} else
755	*loc = `0x7d`;
756	break;
757	case J_JB_32:
758	if (size == `4`) {
759	loc[-`1`] = `0x0f`;
760	*loc = `0x82`;
761	} else
762	*loc = `0x72`;
763	break;
764	case J_JBE_32:
765	if (size == `4`) {
766	loc[-`1`] = `0x0f`;
767	*loc = `0x86`;
768	} else
769	*loc = `0x76`;
770	break;
771	case J_JL_32:
772	if (size == `4`) {
773	loc[-`1`] = `0x0f`;
774	*loc = `0x8c`;
775	} else
776	*loc = `0x7c`;
777	break;
778	case J_JLE_32:
779	if (size == `4`) {
780	loc[-`1`] = `0x0f`;
781	*loc = `0x8e`;
782	} else
783	*loc = `0x7e`;
784	break;
785	case J_JA_32:
786	if (size == `4`) {
787	loc[-`1`] = `0x0f`;
788	*loc = `0x87`;
789	} else
790	*loc = `0x77`;
791	break;
792	case J_JAE_32:
793	if (size == `4`) {
794	loc[-`1`] = `0x0f`;
795	*loc = `0x83`;
796	} else
797	*loc = `0x73`;
798	break;
799	case J_UNKNOWN:
800	llvm_unreachable("Unknown Jump Relocation");
801	}
802	}
803
804	int64_t X86_64::getImplicitAddend(const uint8_t buf, RelType type) const* {
805	switch (type) {
806	case R_X86_64_8:
807	case R_X86_64_PC8:
808	return SignExtend64<`8`>(x: *buf);
809	case R_X86_64_16:
810	case R_X86_64_PC16:
811	return SignExtend64<`16`>(x: read16le(P: buf));
812	case R_X86_64_32:
813	case R_X86_64_32S:
814	case R_X86_64_TPOFF32:
815	case R_X86_64_GOT32:
816	case R_X86_64_GOTPC32:
817	case R_X86_64_GOTPC32_TLSDESC:
818	case R_X86_64_GOTPCREL:
819	case R_X86_64_GOTPCRELX:
820	case R_X86_64_REX_GOTPCRELX:
821	case R_X86_64_CODE_4_GOTPCRELX:
822	case R_X86_64_PC32:
823	case R_X86_64_GOTTPOFF:
824	case R_X86_64_CODE_4_GOTTPOFF:
825	case R_X86_64_CODE_6_GOTTPOFF:
826	case R_X86_64_PLT32:
827	case R_X86_64_TLSGD:
828	case R_X86_64_TLSLD:
829	case R_X86_64_DTPOFF32:
830	case R_X86_64_SIZE32:
831	return SignExtend64<`32`>(x: read32le(P: buf));
832	case R_X86_64_64:
833	case R_X86_64_TPOFF64:
834	case R_X86_64_DTPOFF64:
835	case R_X86_64_DTPMOD64:
836	case R_X86_64_PC64:
837	case R_X86_64_SIZE64:
838	case R_X86_64_GLOB_DAT:
839	case R_X86_64_GOT64:
840	case R_X86_64_GOTOFF64:
841	case R_X86_64_GOTPC64:
842	case R_X86_64_PLTOFF64:
843	case R_X86_64_IRELATIVE:
844	case R_X86_64_RELATIVE:
845	return read64le(P: buf);
846	case R_X86_64_TLSDESC:
847	return read64le(P: buf + `8`);
848	case R_X86_64_JUMP_SLOT:
849	case R_X86_64_NONE:
850	// These relocations are defined as not having an implicit addend.
851	return `0`;
852	default:
853	InternalErr(ctx, buf) << "cannot read addend for relocation " << type;
854	return `0`;
855	}
856	}
857
858	static void relaxGot(uint8_t loc, const* Relocation &rel, uint64_t val);
859
860	void X86_64::relocate(uint8_t loc, const* Relocation &rel, uint64_t val) const {
861	switch (rel.type) {
862	case R_X86_64_8:
863	checkIntUInt(ctx, loc, v: val, n: `8`, rel);
864	*loc = val;
865	break;
866	case R_X86_64_PC8:
867	checkInt(ctx, loc, v: val, n: `8`, rel);
868	*loc = val;
869	break;
870	case R_X86_64_16:
871	checkIntUInt(ctx, loc, v: val, n: `16`, rel);
872	write16le(P: loc, V: val);
873	break;
874	case R_X86_64_PC16:
875	checkInt(ctx, loc, v: val, n: `16`, rel);
876	write16le(P: loc, V: val);
877	break;
878	case R_X86_64_32:
879	checkUInt(ctx, loc, v: val, n: `32`, rel);
880	write32le(P: loc, V: val);
881	break;
882	case R_X86_64_32S:
883	case R_X86_64_GOT32:
884	case R_X86_64_GOTPC32:
885	case R_X86_64_GOTPCREL:
886	case R_X86_64_PC32:
887	case R_X86_64_PLT32:
888	case R_X86_64_DTPOFF32:
889	case R_X86_64_SIZE32:
890	checkInt(ctx, loc, v: val, n: `32`, rel);
891	write32le(P: loc, V: val);
892	break;
893	case R_X86_64_64:
894	case R_X86_64_TPOFF64:
895	case R_X86_64_DTPOFF64:
896	case R_X86_64_PC64:
897	case R_X86_64_SIZE64:
898	case R_X86_64_GOT64:
899	case R_X86_64_GOTOFF64:
900	case R_X86_64_GOTPC64:
901	case R_X86_64_PLTOFF64:
902	write64le(P: loc, V: val);
903	break;
904	case R_X86_64_GOTPCRELX:
905	case R_X86_64_REX_GOTPCRELX:
906	case R_X86_64_CODE_4_GOTPCRELX:
907	if (rel.expr != R_GOT_PC) {
908	relaxGot(loc, rel, val);
909	} else {
910	checkInt(ctx, loc, v: val, n: `32`, rel);
911	write32le(P: loc, V: val);
912	}
913	break;
914	case R_X86_64_GOTPC32_TLSDESC:
915	case R_X86_64_CODE_4_GOTPC32_TLSDESC:
916	case R_X86_64_TLSDESC_CALL:
917	case R_X86_64_TLSGD:
918	if (rel.expr == R_RELAX_TLS_GD_TO_LE) {
919	relaxTlsGdToLe(loc, rel, val);
920	} else if (rel.expr == R_RELAX_TLS_GD_TO_IE) {
921	relaxTlsGdToIe(loc, rel, val);
922	} else {
923	checkInt(ctx, loc, v: val, n: `32`, rel);
924	write32le(P: loc, V: val);
925	}
926	break;
927	case R_X86_64_TLSLD:
928	if (rel.expr == R_RELAX_TLS_LD_TO_LE) {
929	relaxTlsLdToLe(loc, rel, val);
930	} else {
931	checkInt(ctx, loc, v: val, n: `32`, rel);
932	write32le(P: loc, V: val);
933	}
934	break;
935	case R_X86_64_GOTTPOFF:
936	case R_X86_64_CODE_4_GOTTPOFF:
937	case R_X86_64_CODE_6_GOTTPOFF:
938	if (rel.expr == R_RELAX_TLS_IE_TO_LE) {
939	relaxTlsIeToLe(loc, rel, val);
940	} else {
941	checkInt(ctx, loc, v: val, n: `32`, rel);
942	write32le(P: loc, V: val);
943	}
944	break;
945	case R_X86_64_TPOFF32:
946	checkInt(ctx, loc, v: val, n: `32`, rel);
947	write32le(P: loc, V: val);
948	break;
949
950	case R_X86_64_TLSDESC:
951	// The addend is stored in the second 64-bit word.
952	write64le(P: loc + `8`, V: val);
953	break;
954	default:
955	llvm_unreachable("unknown relocation");
956	}
957	}
958
959	RelExpr X86_64::adjustGotPcExpr(RelType type, int64_t addend,
960	const uint8_t loc) const* {
961	// Only R_X86_64_[REX_]\|[CODE_4_]GOTPCRELX can be relaxed. GNU as may emit
962	// GOTPCRELX with addend != -4. Such an instruction does not load the full GOT
963	// entry, so we cannot relax the relocation. E.g. movl x@GOTPCREL+4(%rip),
964	// %rax (addend=0) loads the high 32 bits of the GOT entry.
965	if (!ctx.arg.relax \|\| addend != -`4` \|\|
966	(type != R_X86_64_GOTPCRELX && type != R_X86_64_REX_GOTPCRELX &&
967	type != R_X86_64_CODE_4_GOTPCRELX))
968	return R_GOT_PC;
969	const uint8_t op = loc[-`2`];
970	const uint8_t modRm = loc[-`1`];
971
972	// FIXME: When PIC is disabled and foo is defined locally in the
973	// lower 32 bit address space, memory operand in mov can be converted into
974	// immediate operand. Otherwise, mov must be changed to lea. We support only
975	// latter relaxation at this moment.
976	if (op == `0x8b`)
977	return R_RELAX_GOT_PC;
978
979	// Relax call and jmp.
980	if (op == `0xff` && (modRm == `0x15` \|\| modRm == `0x25`))
981	return R_RELAX_GOT_PC;
982
983	// We don't support test/binop instructions without a REX/REX2 prefix.
984	if (type == R_X86_64_GOTPCRELX)
985	return R_GOT_PC;
986
987	// Relaxation of test, adc, add, and, cmp, or, sbb, sub, xor.
988	// If PIC then no relaxation is available.
989	return ctx.arg.isPic ? R_GOT_PC : R_RELAX_GOT_PC_NOPIC;
990	}
991
992	// A subset of relaxations can only be applied for no-PIC. This method
993	// handles such relaxations. Instructions encoding information was taken from:
994	// "Intel 64 and IA-32 Architectures Software Developer's Manual V2"
995	// (http://www.intel.com/content/dam/www/public/us/en/documents/manuals/
996	// 64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf)
997	static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op, uint8_t modRm,
998	bool isRex2) {
999	const uint8_t rex = loc[-`3`];
1000	// Convert "test %reg, foo@GOTPCREL(%rip)" to "test $foo, %reg".
1001	if (op == `0x85`) {
1002	// See "TEST-Logical Compare" (4-428 Vol. 2B),
1003	// TEST r/m64, r64 uses "full" ModR / M byte (no opcode extension).
1004
1005	// ModR/M byte has form XX YYY ZZZ, where
1006	// YYY is MODRM.reg(register 2), ZZZ is MODRM.rm(register 1).
1007	// XX has different meanings:
1008	// 00: The operand's memory address is in reg1.
1009	// 01: The operand's memory address is reg1 + a byte-sized displacement.
1010	// 10: The operand's memory address is reg1 + a word-sized displacement.
1011	// 11: The operand is reg1 itself.
1012	// If an instruction requires only one operand, the unused reg2 field
1013	// holds extra opcode bits rather than a register code
1014	// 0xC0 == 11 000 000 binary.
1015	// 0x38 == 00 111 000 binary.
1016	// We transfer reg2 to reg1 here as operand.
1017	// See "2.1.3 ModR/M and SIB Bytes" (Vol. 2A 2-3).
1018	loc[-`1`] = `0xc0` \| (modRm & `0x38`) >> `3`; // ModR/M byte.
1019
1020	// Change opcode from TEST r/m64, r64 to TEST r/m64, imm32
1021	// See "TEST-Logical Compare" (4-428 Vol. 2B).
1022	loc[-`2`] = `0xf7`;
1023
1024	// Move R bit to the B bit in REX/REX2 byte.
1025	// REX byte is encoded as 0100WRXB, where
1026	// 0100 is 4bit fixed pattern.
1027	// REX.W When 1, a 64-bit operand size is used. Otherwise, when 0, the
1028	// default operand size is used (which is 32-bit for most but not all
1029	// instructions).
1030	// REX.R This 1-bit value is an extension to the MODRM.reg field.
1031	// REX.X This 1-bit value is an extension to the SIB.index field.
1032	// REX.B This 1-bit value is an extension to the MODRM.rm field or the
1033	// SIB.base field.
1034	// See "2.2.1.2 More on REX Prefix Fields " (2-8 Vol. 2A).
1035	//
1036	// REX2 prefix is encoded as 0xd5\|M\|R2\|X2\|B2\|WRXB, where
1037	// 0xd5 is 1byte fixed pattern.
1038	// REX2's [W,R,X,B] have the same meanings as REX's.
1039	// REX2.M encodes the map id.
1040	// R2/X2/B2 provides the fifth and most siginicant bits of the R/X/B
1041	// register identifiers, each of which can now address all 32 GPRs.
1042	if (isRex2)
1043	loc[-`3`] = (rex & ~`0x44`) \| (rex & `0x44`) >> `2`;
1044	else
1045	loc[-`3`] = (rex & ~`0x4`) \| (rex & `0x4`) >> `2`;
1046	write32le(P: loc, V: val);
1047	return;
1048	}
1049
1050	// If we are here then we need to relax the adc, add, and, cmp, or, sbb, sub
1051	// or xor operations.
1052
1053	// Convert "binop foo@GOTPCREL(%rip), %reg" to "binop $foo, %reg".
1054	// Logic is close to one for test instruction above, but we also
1055	// write opcode extension here, see below for details.
1056	loc[-`1`] = `0xc0` \| (modRm & `0x38`) >> `3` \| (op & `0x3c`); // ModR/M byte.
1057
1058	// Primary opcode is 0x81, opcode extension is one of:
1059	// 000b = ADD, 001b is OR, 010b is ADC, 011b is SBB,
1060	// 100b is AND, 101b is SUB, 110b is XOR, 111b is CMP.
1061	// This value was wrote to MODRM.reg in a line above.
1062	// See "3.2 INSTRUCTIONS (A-M)" (Vol. 2A 3-15),
1063	// "INSTRUCTION SET REFERENCE, N-Z" (Vol. 2B 4-1) for
1064	// descriptions about each operation.
1065	loc[-`2`] = `0x81`;
1066	if (isRex2)
1067	loc[-`3`] = (rex & ~`0x44`) \| (rex & `0x44`) >> `2`;
1068	else
1069	loc[-`3`] = (rex & ~`0x4`) \| (rex & `0x4`) >> `2`;
1070	write32le(P: loc, V: val);
1071	}
1072
1073	static void relaxGot(uint8_t loc, const* Relocation &rel, uint64_t val) {
1074	assert(isInt<`32`>(val) &&
1075	"GOTPCRELX should not have been relaxed if it overflows");
1076	const uint8_t op = loc[-`2`];
1077	const uint8_t modRm = loc[-`1`];
1078
1079	// Convert "mov foo@GOTPCREL(%rip),%reg" to "lea foo(%rip),%reg".
1080	if (op == `0x8b`) {
1081	loc[-`2`] = `0x8d`;
1082	write32le(P: loc, V: val);
1083	return;
1084	}
1085
1086	if (op != `0xff`) {
1087	// We are relaxing a rip relative to an absolute, so compensate
1088	// for the old -4 addend.
1089	assert(!rel.sym->file->ctx.arg.isPic);
1090	relaxGotNoPic(loc, val: val + `4`, op, modRm,
1091	isRex2: rel.type == R_X86_64_CODE_4_GOTPCRELX);
1092	return;
1093	}
1094
1095	// Convert call/jmp instructions.
1096	if (modRm == `0x15`) {
1097	// ABI says we can convert "call foo@GOTPCREL(%rip)" to "nop; call foo".*
1098	// Instead we convert to "addr32 call foo" where addr32 is an instruction
1099	// prefix. That makes result expression to be a single instruction.
1100	loc[-`2`] = `0x67`; // addr32 prefix
1101	loc[-`1`] = `0xe8`; // call
1102	write32le(P: loc, V: val);
1103	return;
1104	}
1105
1106	// Convert "jmp foo@GOTPCREL(%rip)" to "jmp foo; nop".*
1107	// jmp doesn't return, so it is fine to use nop here, it is just a stub.
1108	assert(modRm == `0x25`);
1109	loc[-`2`] = `0xe9`; // jmp
1110	loc[`3`] = `0x90`; // nop
1111	write32le(P: loc - `1`, V: val + `1`);
1112	}
1113
1114	// A split-stack prologue starts by checking the amount of stack remaining
1115	// in one of two ways:
1116	// A) Comparing of the stack pointer to a field in the tcb.
1117	// B) Or a load of a stack pointer offset with an lea to r10 or r11.
1118	bool X86_64::adjustPrologueForCrossSplitStack(uint8_t loc, uint8_t end,
1119	uint8_t stOther) const {
1120	if (!ctx.arg.is64) {
1121	ErrAlways(ctx) << "target doesn't support split stacks";
1122	return false;
1123	}
1124
1125	if (loc + `8` >= end)
1126	return false;
1127
1128	// Replace "cmp %fs:0x70,%rsp" and subsequent branch
1129	// with "stc, nopl 0x0(%rax,%rax,1)"
1130	if (memcmp(s1: loc, s2: "\x64\x48\x3b\x24\x25", n: `5`) == `0`) {
1131	memcpy(dest: loc, src: "\xf9\x0f\x1f\x84\x00\x00\x00\x00", n: `8`);
1132	return true;
1133	}
1134
1135	// Adjust "lea X(%rsp),%rYY" to lea "(X - 0x4000)(%rsp),%rYY" where rYY could
1136	// be r10 or r11. The lea instruction feeds a subsequent compare which checks
1137	// if there is X available stack space. Making X larger effectively reserves
1138	// that much additional space. The stack grows downward so subtract the value.
1139	if (memcmp(s1: loc, s2: "\x4c\x8d\x94\x24", n: `4`) == `0` \|\|
1140	memcmp(s1: loc, s2: "\x4c\x8d\x9c\x24", n: `4`) == `0`) {
1141	// The offset bytes are encoded four bytes after the start of the
1142	// instruction.
1143	write32le(P: loc + `4`, V: read32le(P: loc + `4`) - `0x4000`);
1144	return true;
1145	}
1146	return false;
1147	}
1148
1149	void X86_64::relocateAlloc(InputSectionBase &sec, uint8_t buf) const* {
1150	uint64_t secAddr = sec.getOutputSection()->addr;
1151	if (auto *s = dyn_cast<InputSection>(Val: &sec))
1152	secAddr += s->outSecOff;
1153	else if (auto *ehIn = dyn_cast<EhInputSection>(Val: &sec))
1154	secAddr += ehIn->getParent()->outSecOff;
1155	for (const Relocation &rel : sec.relocs()) {
1156	if (rel.expr == R_NONE) // See deleteFallThruJmpInsn
1157	continue;
1158	uint8_t *loc = buf + rel.offset;
1159	const uint64_t val = sec.getRelocTargetVA(ctx, r: rel, p: secAddr + rel.offset);
1160	relocate(loc, rel, val);
1161	}
1162	if (sec.jumpInstrMod) {
1163	applyJumpInstrMod(loc: buf + sec.jumpInstrMod->offset,
1164	type: sec.jumpInstrMod->original, size: sec.jumpInstrMod->size);
1165	}
1166	}
1167
1168	static std::optional<uint64_t> getControlTransferAddend(InputSection &is,
1169	Relocation &r) {
1170	// Identify a control transfer relocation for the branch-to-branch
1171	// optimization. A "control transfer relocation" usually means a CALL or JMP
1172	// target but it also includes relative vtable relocations for example.
1173	//
1174	// We require the relocation type to be PLT32. With a relocation type of PLT32
1175	// the value may be assumed to be used for branching directly to the symbol
1176	// and the addend is only used to produce the relocated value (hence the
1177	// effective addend is always 0). This is because if a PLT is needed the
1178	// addend will be added to the address of the PLT, and it doesn't make sense
1179	// to branch into the middle of a PLT. For example, relative vtable
1180	// relocations use PLT32 and 0 or a positive value as the addend but still are
1181	// used to branch to the symbol.
1182	//
1183	// STT_SECTION symbols are a special case on x86 because the LLVM assembler
1184	// uses them for branches to local symbols which are assembled as referring to
1185	// the section symbol with the addend equal to the symbol value - 4.
1186	if (r.type == R_X86_64_PLT32) {
1187	if (r.sym->isSection())
1188	return r.addend + `4`;
1189	return `0`;
1190	}
1191	return std::nullopt;
1192	}
1193
1194	static std::pair<Relocation *, uint64_t>
1195	getBranchInfoAtTarget(InputSection &is, uint64_t offset) {
1196	auto content = is.contentMaybeDecompress();
1197	if (content.size() > offset && content [offset] == `0xe9`) { // JMP immediate
1198	auto *i = llvm::partition_point(
1199	Range&: is.relocations, P: [&](Relocation &r) { return r.offset < offset + `1`; });
1200	// Unlike with getControlTransferAddend() it is valid to accept a PC32
1201	// relocation here because we know that this is actually a JMP and not some
1202	// other reference, so the interpretation is that we add 4 to the addend and
1203	// use that as the effective addend.
1204	if (i != is.relocations.end() && i->offset == offset + `1` &&
1205	(i->type == R_X86_64_PC32 \|\| i->type == R_X86_64_PLT32)) {
1206	return {i, i->addend + `4`};
1207	}
1208	}
1209	return {nullptr, `0`};
1210	}
1211
1212	static void redirectControlTransferRelocations(Relocation &r1,
1213	const Relocation &r2) {
1214	// The isSection() check handles the STT_SECTION case described above.
1215	// In that case the original addend is irrelevant because it referred to an
1216	// offset within the original target section so we overwrite it.
1217	//
1218	// The +4 is here to compensate for r2.addend which will likely be -4,
1219	// but may also be addend-4 in case of a PC32 branch to symbol+addend.
1220	if (r1.sym->isSection())
1221	r1.addend = r2.addend;
1222	else
1223	r1.addend += r2.addend + `4`;
1224	r1.expr = r2.expr;
1225	r1.sym = r2.sym;
1226	}
1227
1228	void X86_64::applyBranchToBranchOpt() const {
1229	applyBranchToBranchOptImpl(ctx, getControlTransferAddend,
1230	getBranchInfoAtTarget,
1231	redirectControlTransferRelocations);
1232	}
1233
1234	// If Intel Indirect Branch Tracking is enabled, we have to emit special PLT
1235	// entries containing endbr64 instructions. A PLT entry will be split into two
1236	// parts, one in .plt.sec (writePlt), and the other in .plt (writeIBTPlt).
1237	namespace {
1238	class IntelIBT : public X86_64 {
1239	public:
1240	IntelIBT(Ctx &ctx) : X86_64 (ctx) { pltHeaderSize = `0`; };
1241	void writeGotPlt(uint8_t buf, const* Symbol &s) const override;
1242	void writePlt(uint8_t buf, const* Symbol &sym,
1243	uint64_t pltEntryAddr) const override;
1244	void writeIBTPlt(uint8_t buf, size_t numEntries) const* override;
1245
1246	static const unsigned IBTPltHeaderSize = `16`;
1247	};
1248	} // namespace
1249
1250	void IntelIBT::writeGotPlt(uint8_t buf, const* Symbol &s) const {
1251	uint64_t va = ctx.in.ibtPlt ->getVA() + IBTPltHeaderSize +
1252	s.getPltIdx(ctx) * pltEntrySize;
1253	write64le(P: buf, V: va);
1254	}
1255
1256	void IntelIBT::writePlt(uint8_t buf, const* Symbol &sym,
1257	uint64_t pltEntryAddr) const {
1258	const uint8_t Inst[] = {
1259	`0xf3`, `0x0f`, `0x1e`, `0xfa`, // endbr64
1260	`0xff`, `0x25`, `0`, `0`, `0`, `0`, // jmpq got(%rip)*
1261	`0x66`, `0x0f`, `0x1f`, `0x44`, `0`, `0`, // nop
1262	};
1263	memcpy(dest: buf, src: Inst, n: sizeof(Inst));
1264	write32le(P: buf + `6`, V: sym.getGotPltVA(ctx) - pltEntryAddr - `10`);
1265	}
1266
1267	void IntelIBT::writeIBTPlt(uint8_t buf, size_t numEntries) const* {
1268	writePltHeader(buf);
1269	buf += IBTPltHeaderSize;
1270
1271	const uint8_t inst[] = {
1272	`0xf3`, `0x0f`, `0x1e`, `0xfa`, // endbr64
1273	`0x68`, `0`, `0`, `0`, `0`, // pushq <relocation index>
1274	`0xe9`, `0`, `0`, `0`, `0`, // jmpq plt[0]
1275	`0x66`, `0x90`, // nop
1276	};
1277
1278	for (size_t i = `0`; i < numEntries; ++i) {
1279	memcpy(dest: buf, src: inst, n: sizeof(inst));
1280	write32le(P: buf + `5`, V: i);
1281	write32le(P: buf + `10`, V: -pltHeaderSize - sizeof(inst) * i - `30`);
1282	buf += sizeof(inst);
1283	}
1284	}
1285
1286	// These nonstandard PLT entries are to migtigate Spectre v2 security
1287	// vulnerability. In order to mitigate Spectre v2, we want to avoid indirect
1288	// branch instructions such as `jmp GOTPLT(%rip)`. So, in the following PLT*
1289	// entries, we use a CALL followed by MOV and RET to do the same thing as an
1290	// indirect jump. That instruction sequence is so-called "retpoline".
1291	//
1292	// We have two types of retpoline PLTs as a size optimization. If `-z now`
1293	// is specified, all dynamic symbols are resolved at load-time. Thus, when
1294	// that option is given, we can omit code for symbol lazy resolution.
1295	namespace {
1296	class Retpoline : public X86_64 {
1297	public:
1298	Retpoline(Ctx &);
1299	void writeGotPlt(uint8_t buf, const* Symbol &s) const override;
1300	void writePltHeader(uint8_t buf) const* override;
1301	void writePlt(uint8_t buf, const* Symbol &sym,
1302	uint64_t pltEntryAddr) const override;
1303	};
1304
1305	class RetpolineZNow : public X86_64 {
1306	public:
1307	RetpolineZNow(Ctx &);
1308	void writeGotPlt(uint8_t buf, const* Symbol &s) const override {}
1309	void writePltHeader(uint8_t buf) const* override;
1310	void writePlt(uint8_t buf, const* Symbol &sym,
1311	uint64_t pltEntryAddr) const override;
1312	};
1313	} // namespace
1314
1315	Retpoline::Retpoline(Ctx &ctx) : X86_64 (ctx) {
1316	pltHeaderSize = `48`;
1317	pltEntrySize = `32`;
1318	ipltEntrySize = `32`;
1319	}
1320
1321	void Retpoline::writeGotPlt(uint8_t buf, const* Symbol &s) const {
1322	write64le(P: buf, V: s.getPltVA(ctx) + `17`);
1323	}
1324
1325	void Retpoline::writePltHeader(uint8_t buf) const* {
1326	const uint8_t insn[] = {
1327	`0xff`, `0x35`, `0`, `0`, `0`, `0`, // 0: pushq GOTPLT+8(%rip)
1328	`0x4c`, `0x8b`, `0x1d`, `0`, `0`, `0`, `0`, // 6: mov GOTPLT+16(%rip), %r11
1329	`0xe8`, `0x0e`, `0x00`, `0x00`, `0x00`, // d: callq next
1330	`0xf3`, `0x90`, // 12: loop: pause
1331	`0x0f`, `0xae`, `0xe8`, // 14: lfence
1332	`0xeb`, `0xf9`, // 17: jmp loop
1333	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 19: int3; .align 16
1334	`0x4c`, `0x89`, `0x1c`, `0x24`, // 20: next: mov %r11, (%rsp)
1335	`0xc3`, // 24: ret
1336	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 25: int3; padding
1337	`0xcc`, `0xcc`, `0xcc`, `0xcc`, // 2c: int3; padding
1338	};
1339	memcpy(dest: buf, src: insn, n: sizeof(insn));
1340
1341	uint64_t gotPlt = ctx.in.gotPlt ->getVA();
1342	uint64_t plt = ctx.in.plt ->getVA();
1343	write32le(P: buf + `2`, V: gotPlt - plt - `6` + `8`);
1344	write32le(P: buf + `9`, V: gotPlt - plt - `13` + `16`);
1345	}
1346
1347	void Retpoline::writePlt(uint8_t buf, const* Symbol &sym,
1348	uint64_t pltEntryAddr) const {
1349	const uint8_t insn[] = {
1350	`0x4c`, `0x8b`, `0x1d`, `0`, `0`, `0`, `0`, // 0: mov foo@GOTPLT(%rip), %r11
1351	`0xe8`, `0`, `0`, `0`, `0`, // 7: callq plt+0x20
1352	`0xe9`, `0`, `0`, `0`, `0`, // c: jmp plt+0x12
1353	`0x68`, `0`, `0`, `0`, `0`, // 11: pushq <relocation index>
1354	`0xe9`, `0`, `0`, `0`, `0`, // 16: jmp plt+0
1355	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 1b: int3; padding
1356	};
1357	memcpy(dest: buf, src: insn, n: sizeof(insn));
1358
1359	uint64_t off = pltEntryAddr - ctx.in.plt ->getVA();
1360
1361	write32le(P: buf + `3`, V: sym.getGotPltVA(ctx) - pltEntryAddr - `7`);
1362	write32le(P: buf + `8`, V: -off - `12` + `32`);
1363	write32le(P: buf + `13`, V: -off - `17` + `18`);
1364	write32le(P: buf + `18`, V: sym.getPltIdx(ctx));
1365	write32le(P: buf + `23`, V: -off - `27`);
1366	}
1367
1368	RetpolineZNow::RetpolineZNow(Ctx &ctx) : X86_64 (ctx) {
1369	pltHeaderSize = `32`;
1370	pltEntrySize = `16`;
1371	ipltEntrySize = `16`;
1372	}
1373
1374	void RetpolineZNow::writePltHeader(uint8_t buf) const* {
1375	const uint8_t insn[] = {
1376	`0xe8`, `0x0b`, `0x00`, `0x00`, `0x00`, // 0: call next
1377	`0xf3`, `0x90`, // 5: loop: pause
1378	`0x0f`, `0xae`, `0xe8`, // 7: lfence
1379	`0xeb`, `0xf9`, // a: jmp loop
1380	`0xcc`, `0xcc`, `0xcc`, `0xcc`, // c: int3; .align 16
1381	`0x4c`, `0x89`, `0x1c`, `0x24`, // 10: next: mov %r11, (%rsp)
1382	`0xc3`, // 14: ret
1383	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 15: int3; padding
1384	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 1a: int3; padding
1385	`0xcc`, // 1f: int3; padding
1386	};
1387	memcpy(dest: buf, src: insn, n: sizeof(insn));
1388	}
1389
1390	void RetpolineZNow::writePlt(uint8_t buf, const* Symbol &sym,
1391	uint64_t pltEntryAddr) const {
1392	const uint8_t insn[] = {
1393	`0x4c`, `0x8b`, `0x1d`, `0`, `0`, `0`, `0`, // mov foo@GOTPLT(%rip), %r11
1394	`0xe9`, `0`, `0`, `0`, `0`, // jmp plt+0
1395	`0xcc`, `0xcc`, `0xcc`, `0xcc`, // int3; padding
1396	};
1397	memcpy(dest: buf, src: insn, n: sizeof(insn));
1398
1399	write32le(P: buf + `3`, V: sym.getGotPltVA(ctx) - pltEntryAddr - `7`);
1400	write32le(P: buf + `8`, V: ctx.in.plt ->getVA() - pltEntryAddr - `12`);
1401	}
1402
1403	void elf::setX86_64TargetInfo(Ctx &ctx) {
1404	if (ctx.arg.zRetpolineplt) {
1405	if (ctx.arg.zNow)
1406	ctx.target.reset(p: new RetpolineZNow (ctx));
1407	else
1408	ctx.target.reset(p: new Retpoline (ctx));
1409	return;
1410	}
1411
1412	if (ctx.arg.andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT)
1413	ctx.target.reset(p: new IntelIBT (ctx));
1414	else
1415	ctx.target.reset(p: new X86_64 (ctx));
1416	}
1417

Browse the source code of llvm_projects/lld/ELF/Arch/X86_64.cpp