X86_64.cpp source code [llvm_projects/lld/ELF/Arch/X86_64.cpp]

1	//===- X86_64.cpp ---------------------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "OutputSections.h"
10	#include "RelocScan.h"
11	#include "Relocations.h"
12	#include "Symbols.h"
13	#include "SyntheticSections.h"
14	#include "Target.h"
15	#include "TargetImpl.h"
16	#include "llvm/BinaryFormat/ELF.h"
17	#include "llvm/Support/Endian.h"
18	#include "llvm/Support/MathExtras.h"
19
20	using namespace llvm;
21	using namespace llvm::object;
22	using namespace llvm::support::endian;
23	using namespace llvm::ELF;
24	using namespace lld;
25	using namespace lld::elf;
26
27	namespace {
28	class X86_64 : public TargetInfo {
29	public:
30	X86_64(Ctx &);
31	void initTargetSpecificSections() override;
32	RelExpr getRelExpr(RelType type, const Symbol &s,
33	const uint8_t loc) const* override;
34	RelType getDynRel(RelType type) const override;
35	void writeGotPltHeader(uint8_t buf) const* override;
36	void writeGotPlt(uint8_t buf, const* Symbol &s) const override;
37	void writeIgotPlt(uint8_t buf, const* Symbol &s) const override;
38	void writePltHeader(uint8_t buf) const* override;
39	void writePlt(uint8_t buf, const* Symbol &sym,
40	uint64_t pltEntryAddr) const override;
41	void relocate(uint8_t loc, const* Relocation &rel,
42	uint64_t val) const override;
43	int64_t getImplicitAddend(const uint8_t buf, RelType type) const* override;
44	void applyJumpInstrMod(uint8_t *loc, JumpModType type,
45	unsigned size) const override;
46	RelExpr adjustGotPcExpr(RelType type, int64_t addend,
47	const uint8_t loc) const* override;
48	void relocateAlloc(InputSection &sec, uint8_t buf) const* override;
49	bool adjustPrologueForCrossSplitStack(uint8_t loc, uint8_t end,
50	uint8_t stOther) const override;
51	bool deleteFallThruJmpInsn(InputSection &is,
52	InputSection nextIS) const* override;
53	bool relaxOnce(int pass) const override;
54	void relaxCFIJumpTables() const override;
55	void applyBranchToBranchOpt() const override;
56	template <class ELFT, class RelTy>
57	void scanSectionImpl(InputSectionBase &sec, Relocs<RelTy> rels,
58	unsigned shard);
59	void scanSection(InputSectionBase &sec, unsigned shard) override;
60
61	private:
62	void relaxTlsGdToLe(uint8_t loc, const* Relocation &rel, uint64_t val) const;
63	void relaxTlsGdToIe(uint8_t loc, const* Relocation &rel, uint64_t val) const;
64	void relaxTlsLdToLe(uint8_t loc, const* Relocation &rel, uint64_t val) const;
65	void relaxTlsIeToLe(uint8_t loc, const* Relocation &rel, uint64_t val) const;
66	};
67	} // namespace
68
69	// This is vector of NOP instructions of sizes from 1 to 8 bytes. The
70	// appropriately sized instructions are used to fill the gaps between sections
71	// which are executed during fall through.
72	static const std::vector<std::vector<uint8_t>> nopInstructions = {
73	{`0x90`},
74	{`0x66`, `0x90`},
75	{`0x0f`, `0x1f`, `0x00`},
76	{`0x0f`, `0x1f`, `0x40`, `0x00`},
77	{`0x0f`, `0x1f`, `0x44`, `0x00`, `0x00`},
78	{`0x66`, `0x0f`, `0x1f`, `0x44`, `0x00`, `0x00`},
79	{`0x0F`, `0x1F`, `0x80`, `0x00`, `0x00`, `0x00`, `0x00`},
80	{`0x0F`, `0x1F`, `0x84`, `0x00`, `0x00`, `0x00`, `0x00`, `0x00`},
81	{`0x66`, `0x0F`, `0x1F`, `0x84`, `0x00`, `0x00`, `0x00`, `0x00`, `0x00`}};
82
83	X86_64::X86_64(Ctx &ctx) : TargetInfo (ctx) {
84	copyRel = R_X86_64_COPY;
85	gotRel = R_X86_64_GLOB_DAT;
86	pltRel = R_X86_64_JUMP_SLOT;
87	relativeRel = R_X86_64_RELATIVE;
88	iRelativeRel = R_X86_64_IRELATIVE;
89	symbolicRel = ctx.arg.is64 ? R_X86_64_64 : R_X86_64_32;
90	tlsDescRel = R_X86_64_TLSDESC;
91	tlsGotRel = R_X86_64_TPOFF64;
92	tlsModuleIndexRel = R_X86_64_DTPMOD64;
93	tlsOffsetRel = R_X86_64_DTPOFF64;
94	gotBaseSymInGotPlt = true;
95	gotEntrySize = `8`;
96	pltHeaderSize = `16`;
97	pltEntrySize = `16`;
98	ipltEntrySize = `16`;
99	trapInstr = {`0xcc`, `0xcc`, `0xcc`, `0xcc`}; // 0xcc = INT3
100	nopInstrs = nopInstructions;
101
102	// Align to the large page size (known as a superpage or huge page).
103	// FreeBSD automatically promotes large, superpage-aligned allocations.
104	defaultImageBase = `0x200000`;
105	}
106
107	// Opcodes for the different X86_64 jmp instructions.
108	enum JmpInsnOpcode : uint32_t {
109	J_JMP_32,
110	J_JNE_32,
111	J_JE_32,
112	J_JG_32,
113	J_JGE_32,
114	J_JB_32,
115	J_JBE_32,
116	J_JL_32,
117	J_JLE_32,
118	J_JA_32,
119	J_JAE_32,
120	J_UNKNOWN,
121	};
122
123	// Given the first (optional) and second byte of the insn's opcode, this
124	// returns the corresponding enum value.
125	static JmpInsnOpcode getJmpInsnType(const uint8_t *first,
126	const uint8_t *second) {
127	if (*second == `0xe9`)
128	return J_JMP_32;
129
130	if (first == nullptr)
131	return J_UNKNOWN;
132
133	if (*first == `0x0f`) {
134	switch (*second) {
135	case `0x84`:
136	return J_JE_32;
137	case `0x85`:
138	return J_JNE_32;
139	case `0x8f`:
140	return J_JG_32;
141	case `0x8d`:
142	return J_JGE_32;
143	case `0x82`:
144	return J_JB_32;
145	case `0x86`:
146	return J_JBE_32;
147	case `0x8c`:
148	return J_JL_32;
149	case `0x8e`:
150	return J_JLE_32;
151	case `0x87`:
152	return J_JA_32;
153	case `0x83`:
154	return J_JAE_32;
155	}
156	}
157	return J_UNKNOWN;
158	}
159
160	// Return the relocation index for input section IS with a specific Offset.
161	// Returns the maximum size of the vector if no such relocation is found.
162	static unsigned getRelocationWithOffset(const InputSection &is,
163	uint64_t offset) {
164	unsigned size = is.relocs().size();
165	for (unsigned i = size - `1`; i + `1` > `0`; --i) {
166	if (is.relocs()[i].offset == offset && is.relocs()[i].expr != R_NONE)
167	return i;
168	}
169	return size;
170	}
171
172	// Returns true if R corresponds to a relocation used for a jump instruction.
173	// TODO: Once special relocations for relaxable jump instructions are available,
174	// this should be modified to use those relocations.
175	static bool isRelocationForJmpInsn(Relocation &R) {
176	return R.type == R_X86_64_PLT32 \|\| R.type == R_X86_64_PC32 \|\|
177	R.type == R_X86_64_PC8;
178	}
179
180	// Return true if Relocation R points to the first instruction in the
181	// next section.
182	// TODO: Delete this once psABI reserves a new relocation type for fall thru
183	// jumps.
184	static bool isFallThruRelocation(InputSection &is, InputSection *nextIS,
185	Relocation &r) {
186	if (!isRelocationForJmpInsn(R&: r))
187	return false;
188
189	uint64_t addrLoc = is.getOutputSection()->addr + is.outSecOff + r.offset;
190	uint64_t targetOffset = is.getRelocTargetVA(is.getCtx(), r, p: addrLoc);
191
192	// If this jmp is a fall thru, the target offset is the beginning of the
193	// next section.
194	uint64_t nextSectionOffset =
195	nextIS->getOutputSection()->addr + nextIS->outSecOff;
196	return (addrLoc + `4` + targetOffset) == nextSectionOffset;
197	}
198
199	// Return the jmp instruction opcode that is the inverse of the given
200	// opcode. For example, JE inverted is JNE.
201	static JmpInsnOpcode invertJmpOpcode(const JmpInsnOpcode opcode) {
202	switch (opcode) {
203	case J_JE_32:
204	return J_JNE_32;
205	case J_JNE_32:
206	return J_JE_32;
207	case J_JG_32:
208	return J_JLE_32;
209	case J_JGE_32:
210	return J_JL_32;
211	case J_JB_32:
212	return J_JAE_32;
213	case J_JBE_32:
214	return J_JA_32;
215	case J_JL_32:
216	return J_JGE_32;
217	case J_JLE_32:
218	return J_JG_32;
219	case J_JA_32:
220	return J_JBE_32;
221	case J_JAE_32:
222	return J_JB_32;
223	default:
224	return J_UNKNOWN;
225	}
226	}
227
228	// Deletes direct jump instruction in input sections that jumps to the
229	// following section as it is not required. If there are two consecutive jump
230	// instructions, it checks if they can be flipped and one can be deleted.
231	// For example:
232	// .section .text
233	// a.BB.foo:
234	// ...
235	// 10: jne aa.BB.foo
236	// 16: jmp bar
237	// aa.BB.foo:
238	// ...
239	//
240	// can be converted to:
241	// a.BB.foo:
242	// ...
243	// 10: je bar #jne flipped to je and the jmp is deleted.
244	// aa.BB.foo:
245	// ...
246	bool X86_64::deleteFallThruJmpInsn(InputSection &is,
247	InputSection nextIS) const* {
248	const unsigned sizeOfDirectJmpInsn = `5`;
249
250	if (nextIS == nullptr)
251	return false;
252
253	if (is.getSize() < sizeOfDirectJmpInsn)
254	return false;
255
256	// If this jmp insn can be removed, it is the last insn and the
257	// relocation is 4 bytes before the end.
258	unsigned rIndex = getRelocationWithOffset(is, offset: is.getSize() - `4`);
259	if (rIndex == is.relocs().size())
260	return false;
261
262	Relocation &r = is.relocs()[rIndex];
263
264	// Check if the relocation corresponds to a direct jmp.
265	const uint8_t *secContents = is.content().data();
266	// If it is not a direct jmp instruction, there is nothing to do here.
267	if (*(secContents + r.offset - `1`) != `0xe9`)
268	return false;
269
270	if (isFallThruRelocation(is, nextIS, r)) {
271	// This is a fall thru and can be deleted.
272	r.expr = R_NONE;
273	r.offset = `0`;
274	is.drop_back(num: sizeOfDirectJmpInsn);
275	is.nopFiller = true;
276	return true;
277	}
278
279	// Now, check if flip and delete is possible.
280	const unsigned sizeOfJmpCCInsn = `6`;
281	// To flip, there must be at least one JmpCC and one direct jmp.
282	if (is.getSize() < sizeOfDirectJmpInsn + sizeOfJmpCCInsn)
283	return false;
284
285	unsigned rbIndex =
286	getRelocationWithOffset(is, offset: (is.getSize() - sizeOfDirectJmpInsn - `4`));
287	if (rbIndex == is.relocs().size())
288	return false;
289
290	Relocation &rB = is.relocs()[rbIndex];
291
292	const uint8_t *jmpInsnB = secContents + rB.offset - `1`;
293	JmpInsnOpcode jmpOpcodeB = getJmpInsnType(first: jmpInsnB - `1`, second: jmpInsnB);
294	if (jmpOpcodeB == J_UNKNOWN)
295	return false;
296
297	if (!isFallThruRelocation(is, nextIS, r&: rB))
298	return false;
299
300	// jmpCC jumps to the fall thru block, the branch can be flipped and the
301	// jmp can be deleted.
302	JmpInsnOpcode jInvert = invertJmpOpcode(opcode: jmpOpcodeB);
303	if (jInvert == J_UNKNOWN)
304	return false;
305	is.jumpInstrMod = make<JumpInstrMod>();
306	*is.jumpInstrMod = {.offset: rB.offset - `1`, .original: jInvert, .size: `4`};
307	// Move R's values to rB except the offset.
308	rB = {.expr: r.expr, .type: r.type, .offset: rB.offset, .addend: r.addend, .sym: r.sym};
309	// Cancel R
310	r.expr = R_NONE;
311	r.offset = `0`;
312	is.drop_back(num: sizeOfDirectJmpInsn);
313	is.nopFiller = true;
314	return true;
315	}
316
317	void X86_64::relaxCFIJumpTables() const {
318	// Relax CFI jump tables.
319	// - Split jump table into pieces and place target functions inside the jump
320	// table if small enough.
321	// - Move jump table before last called function and delete last branch
322	// instruction.
323	DenseMap<InputSection , SmallVector<InputSection , `0`>> sectionReplacements;
324	SmallVector<InputSection *, `0`> storage;
325	for (OutputSection *osec : ctx.outputSections) {
326	if (!(osec->flags & SHF_EXECINSTR))
327	continue;
328	for (InputSection sec : getInputSections(os: osec, storage)) {
329	if (sec->type != SHT_LLVM_CFI_JUMP_TABLE \|\| sec->entsize == `0` \|\|
330	sec->size % sec->entsize != `0`)
331	continue;
332
333	// We're going to replace the jump table with this list of sections. This
334	// list will be made up of slices of the original section and function
335	// bodies that were moved into the jump table.
336	SmallVector<InputSection *, `0`> replacements;
337
338	// r is the only relocation in a jump table entry. Figure out whether it
339	// is a branch pointing to the start of a statically known section that
340	// hasn't already been moved while processing a different jump table
341	// section, and if so return it.
342	auto getMovableSection = [&](Relocation &r) -> InputSection * {
343	if (r.type != R_X86_64_PC32 && r.type != R_X86_64_PLT32)
344	return nullptr;
345	auto *sym = dyn_cast<Defined>(Val: r.sym);
346	if (!sym \|\| sym->isPreemptible \|\| sym->isGnuIFunc() \|\|
347	sym->value + r.addend != -`4ull`) // Usual addend for branch targets.
348	return nullptr;
349	auto *target = dyn_cast_or_null<InputSection>(Val: sym->section);
350	if (!target \|\| sectionReplacements.count(Val: target))
351	return nullptr;
352	return target;
353	};
354
355	// Figure out the movable section for the last entry. We do this first
356	// because the last entry controls which output section the jump table is
357	// placed into, which affects move eligibility for other sections.
358	auto lastSec = [&]() -> InputSection {
359	// If the jump table section is more aligned than the entry size, skip
360	// this because there's no guarantee that we'll be able to emit a
361	// padding section that places the last entry at a correctly aligned
362	// address.
363	if (sec->addralign > sec->entsize)
364	return nullptr;
365
366	auto rels = sec->relocs();
367	if (rels.empty() \|\| rels.back().offset < sec->size - sec->entsize)
368	return nullptr;
369	if (rels.size() >= `2` &&
370	rels [rels.size() - `2`].offset >= sec->size - sec->entsize)
371	return nullptr;
372	return getMovableSection (rels.back());
373	}();
374	OutputSection *targetOutputSec;
375	if (lastSec) {
376	// If the last section is more aligned than the jump table, we need
377	// to emit a padding section before the jump table to ensure that the
378	// last section ends up at the correct alignment.
379	if (lastSec->addralign > sec->addralign) {
380	// We need to add enough padding to make this equal to zero.
381	size_t mod = (sec->size - sec->entsize) % lastSec->addralign;
382	if (mod != `0`) {
383	auto *pad = make<PaddingSection>(args&: ctx, args: lastSec->addralign - mod,
384	args: lastSec->getParent());
385	pad->addralign = lastSec->addralign;
386	replacements.push_back(Elt: pad);
387	} else {
388	sec->addralign = lastSec->addralign;
389	}
390	}
391
392	// We've already decided to move the output section so make sure that we
393	// don't try to move it again.
394	sectionReplacements [lastSec] = {};
395	targetOutputSec = lastSec->getParent();
396	} else {
397	targetOutputSec = sec->getParent();
398	}
399
400	// First, push the original jump table section. This is only so that it
401	// can act as a relocation target. Later on, we will set the size of the
402	// jump table section to 0 so that the slices and moved function bodies
403	// become the actual relocation targets.
404	replacements.push_back(Elt: sec);
405
406	// Add the slice [begin, end) of the original section to the replacement
407	// list. [rbegin, rend) is the slice of the relocation list that covers
408	// [begin, end).
409	auto addSectionSlice = [&](size_t begin, size_t end, Relocation *rbegin,
410	Relocation *rend) {
411	auto *slice = make<InputSection>(
412	args&: sec->file, args&: sec->name, args&: sec->type, args&: sec->flags, args&: sec->entsize,
413	args&: sec->entsize,
414	args: sec->contentMaybeDecompress().slice(N: begin, M: end - begin));
415	for (const Relocation &r : ArrayRef<Relocation>(rbegin, rend)) {
416	slice->relocations.push_back(
417	Elt: Relocation{.expr: r.expr, .type: r.type, .offset: r.offset - begin, .addend: r.addend, .sym: r.sym});
418	}
419	replacements.push_back(Elt: slice);
420	};
421
422	// Walk the jump table entries other than the last one looking for
423	// sections that are small enough to be moved into the jump table and in
424	// the same section as the jump table's destination.
425	size_t begin = `0`, cur = `0`;
426	Relocation rbegin = sec->relocs().begin(), rcur = rbegin;
427	while (cur != sec->size - sec->entsize) {
428	size_t next = cur + sec->entsize;
429	Relocation *rnext = rcur;
430	while (rnext != sec->relocs().end() && rnext->offset < next)
431	++rnext;
432	if (rcur + `1` == rnext) {
433	if (InputSection target = getMovableSection (rcur);
434	target && target->size != `0` && target->size <= sec->entsize &&
435	target->addralign <= sec->entsize &&
436	target->getParent() == targetOutputSec) {
437	// Okay, we found a small enough section. Move it into the jump
438	// table. First add a slice for the unmodified jump table entries
439	// before this one. This slice may be of zero size if two
440	// consecutive functions are moved to the jump table, and is
441	// used to correctly align the target function.
442	addSectionSlice (begin, cur, rbegin, rcur);
443	// Add the target to our replacement list, and set the target's
444	// replacement list to the empty list. This removes it from its
445	// original position and adds it here, as well as causing
446	// future getMovableSection() queries to return nullptr.
447	replacements.push_back(Elt: target);
448	sectionReplacements [target] = {};
449	begin = next;
450	rbegin = rnext;
451	}
452	}
453	cur = next;
454	rcur = rnext;
455	}
456
457	// Finally, process the last entry. If it is movable, move the entire
458	// jump table behind it and delete the last entry (so that the last
459	// function's body acts as the last jump table entry), otherwise leave the
460	// jump table where it is and keep the last entry.
461	if (lastSec) {
462	addSectionSlice (begin, cur, rbegin, rcur);
463	replacements.push_back(Elt: lastSec);
464	sectionReplacements [sec] = {};
465	for (auto *s : replacements)
466	s->parent = lastSec->parent;
467	sectionReplacements [lastSec] = std::move(replacements);
468	} else {
469	addSectionSlice (begin, sec->size, rbegin, sec->relocs().end());
470	for (auto *s : replacements)
471	s->parent = sec->parent;
472	sectionReplacements [sec] = std::move(replacements);
473	}
474
475	// Everything from the original section has been recreated, so delete the
476	// original contents.
477	sec->relocations.clear();
478	sec->size = `0`;
479	}
480	}
481
482	if (sectionReplacements.empty())
483	return;
484
485	// Now that we have the complete mapping of replacements, go through the input
486	// section lists and apply the replacements.
487	for (OutputSection *osec : ctx.outputSections) {
488	if (!(osec->flags & SHF_EXECINSTR))
489	continue;
490	for (SectionCommand *cmd : osec->commands) {
491	auto *isd = dyn_cast<InputSectionDescription>(Val: cmd);
492	if (!isd)
493	continue;
494	SmallVector<InputSection *, `0`> newSections;
495	for (auto *sec : isd->sections) {
496	auto i = sectionReplacements.find(Val: sec);
497	if (i == sectionReplacements.end())
498	newSections.push_back(Elt: sec);
499	else
500	newSections.append(in_start: i ->second.begin(), in_end: i ->second.end());
501	}
502	isd->sections = std::move(newSections);
503	}
504	}
505	}
506
507	bool X86_64::relaxOnce(int pass) const {
508	uint64_t minVA = UINT64_MAX, maxVA = `0`;
509	for (OutputSection *osec : ctx.outputSections) {
510	if (!(osec->flags & SHF_ALLOC))
511	continue;
512	minVA = std::min(a: minVA, b: osec->addr);
513	maxVA = std::max(a: maxVA, b: osec->addr + osec->size);
514	}
515	// If the max VA is under 2^31, GOTPCRELX relocations cannot overflow. In
516	// -pie/-shared, the condition can be relaxed to test the max VA difference as
517	// there is no R_RELAX_GOT_PC_NOPIC.
518	if (isUInt<`31`>(x: maxVA) \|\| (isUInt<`31`>(x: maxVA - minVA) && ctx.arg.isPic))
519	return false;
520
521	SmallVector<InputSection *, `0`> storage;
522	bool changed = false;
523	for (OutputSection *osec : ctx.outputSections) {
524	if (!(osec->flags & SHF_EXECINSTR))
525	continue;
526	for (InputSection sec : getInputSections(os: osec, storage)) {
527	for (Relocation &rel : sec->relocs()) {
528	if (rel.expr != R_RELAX_GOT_PC && rel.expr != R_RELAX_GOT_PC_NOPIC)
529	continue;
530	assert(rel.addend == -`4`);
531
532	Relocation rel1 = rel;
533	rel1.addend = rel.expr == R_RELAX_GOT_PC_NOPIC ? `0` : -`4`;
534	uint64_t v = sec->getRelocTargetVA(ctx, r: rel1,
535	p: sec->getOutputSection()->addr +
536	sec->outSecOff + rel.offset);
537	if (isInt<`32`>(x: v))
538	continue;
539	if (rel.sym->auxIdx == `0`) {
540	rel.sym->allocateAux(ctx);
541	addGotEntry(ctx, sym&: *rel.sym);
542	changed = true;
543	}
544	rel.expr = R_GOT_PC;
545	}
546	}
547	}
548	return changed;
549	}
550
551	void X86_64::initTargetSpecificSections() {
552	if (ctx.arg.andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT) {
553	ctx.in.ibtPlt = std::make_unique<IBTPltSection>(args&: ctx);
554	ctx.inputSections.push_back(Elt: ctx.in.ibtPlt.get());
555	}
556	}
557
558	// Only needed to support relocations used by relocateNonAlloc and relocateEh.
559	RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
560	const uint8_t loc) const* {
561	switch (type) {
562	case R_X86_64_8:
563	case R_X86_64_16:
564	case R_X86_64_32:
565	case R_X86_64_32S:
566	case R_X86_64_64:
567	return R_ABS;
568	case R_X86_64_SIZE32:
569	case R_X86_64_SIZE64:
570	return R_SIZE;
571	case R_X86_64_DTPOFF32:
572	case R_X86_64_DTPOFF64:
573	return R_DTPREL;
574	case R_X86_64_PC8:
575	case R_X86_64_PC16:
576	case R_X86_64_PC32:
577	case R_X86_64_PC64:
578	return R_PC;
579	case R_X86_64_GOTOFF64:
580	return R_GOTPLTREL;
581	case R_X86_64_GOTPC32:
582	case R_X86_64_GOTPC64:
583	return R_GOTPLTONLY_PC;
584	case R_X86_64_NONE:
585	return R_NONE;
586	default:
587	Err(ctx) << getErrorLoc(ctx, loc) << "unknown relocation (" << type.v
588	<< ") against symbol " << &s;
589	return R_NONE;
590	}
591	}
592
593	void X86_64::writeGotPltHeader(uint8_t buf) const* {
594	// The first entry holds the link-time address of _DYNAMIC. It is documented
595	// in the psABI and glibc before Aug 2021 used the entry to compute run-time
596	// load address of the shared object (note that this is relevant for linking
597	// ld.so, not any other program).
598	write64le(P: buf, V: ctx.in.dynamic ->getVA());
599	}
600
601	void X86_64::writeGotPlt(uint8_t buf, const* Symbol &s) const {
602	// See comments in X86::writeGotPlt.
603	write64le(P: buf, V: s.getPltVA(ctx) + `6`);
604	}
605
606	void X86_64::writeIgotPlt(uint8_t buf, const* Symbol &s) const {
607	// An x86 entry is the address of the ifunc resolver function (for -z rel).
608	if (ctx.arg.writeAddends)
609	write64le(P: buf, V: s.getVA(ctx));
610	}
611
612	void X86_64::writePltHeader(uint8_t buf) const* {
613	const uint8_t pltData[] = {
614	`0xff`, `0x35`, `0`, `0`, `0`, `0`, // pushq GOTPLT+8(%rip)
615	`0xff`, `0x25`, `0`, `0`, `0`, `0`, // jmp GOTPLT+16(%rip)*
616	`0x0f`, `0x1f`, `0x40`, `0x00`, // nop
617	};
618	memcpy(dest: buf, src: pltData, n: sizeof(pltData));
619	uint64_t gotPlt = ctx.in.gotPlt ->getVA();
620	uint64_t plt = ctx.in.ibtPlt ? ctx.in.ibtPlt ->getVA() : ctx.in.plt ->getVA();
621	write32le(P: buf + `2`, V: gotPlt - plt + `2`); // GOTPLT+8
622	write32le(P: buf + `8`, V: gotPlt - plt + `4`); // GOTPLT+16
623	}
624
625	void X86_64::writePlt(uint8_t buf, const* Symbol &sym,
626	uint64_t pltEntryAddr) const {
627	const uint8_t inst[] = {
628	`0xff`, `0x25`, `0`, `0`, `0`, `0`, // jmpq got(%rip)*
629	`0x68`, `0`, `0`, `0`, `0`, // pushq <relocation index>
630	`0xe9`, `0`, `0`, `0`, `0`, // jmpq plt[0]
631	};
632	memcpy(dest: buf, src: inst, n: sizeof(inst));
633
634	write32le(P: buf + `2`, V: sym.getGotPltVA(ctx) - pltEntryAddr - `6`);
635	write32le(P: buf + `7`, V: sym.getPltIdx(ctx));
636	write32le(P: buf + `12`, V: ctx.in.plt ->getVA() - pltEntryAddr - `16`);
637	}
638
639	RelType X86_64::getDynRel(RelType type) const {
640	if (type == symbolicRel \|\| type == R_X86_64_SIZE32 \|\| type == R_X86_64_SIZE64)
641	return type;
642	return R_X86_64_NONE;
643	}
644
645	template <class ELFT, class RelTy>
646	void X86_64::scanSectionImpl(InputSectionBase &sec, Relocs<RelTy> rels,
647	unsigned shard) {
648	RelocScan rs(ctx, &sec, shard);
649	sec.relocations.reserve(N: rels.size());
650
651	for (auto it = rels.begin(); it != rels.end(); ++it) {
652	const RelTy &rel = *it;
653	uint32_t symIdx = rel.getSymbol(false);
654	Symbol &sym = sec.getFile<ELFT>()->getSymbol(symIdx);
655	uint64_t offset = rel.r_offset;
656	RelType type = rel.getType(false);
657	if (sym.isUndefined() && symIdx != `0` &&
658	rs.maybeReportUndefined(sym&: cast<Undefined>(Val&: sym), offset))
659	continue;
660	int64_t addend = rs.getAddend<ELFT>(rel, type);
661	RelExpr expr;
662	// Relocation types that only need a RelExpr set `expr` and break out of
663	// the switch to reach rs.process(). Types that need special handling
664	// (fast-path helpers, TLS) call a handler and use `continue`.
665	switch (type) {
666	case R_X86_64_NONE:
667	continue;
668
669	// Absolute relocations:
670	case R_X86_64_8:
671	case R_X86_64_16:
672	case R_X86_64_32:
673	case R_X86_64_32S:
674	case R_X86_64_64:
675	expr = R_ABS;
676	break;
677
678	// PC-relative relocations:
679	case R_X86_64_PC8:
680	case R_X86_64_PC16:
681	case R_X86_64_PC32:
682	case R_X86_64_PC64:
683	rs.processR_PC(type, offset, addend, sym);
684	continue;
685
686	// GOT-generating relocations:
687	case R_X86_64_GOTPC32:
688	case R_X86_64_GOTPC64:
689	ctx.in.gotPlt ->hasGotPltOffRel.store(i: true, m: std::memory_order_relaxed);
690	expr = R_GOTPLTONLY_PC;
691	break;
692	case R_X86_64_GOTOFF64:
693	ctx.in.gotPlt ->hasGotPltOffRel.store(i: true, m: std::memory_order_relaxed);
694	expr = R_GOTPLTREL;
695	break;
696	case R_X86_64_GOT32:
697	case R_X86_64_GOT64:
698	ctx.in.gotPlt ->hasGotPltOffRel.store(i: true, m: std::memory_order_relaxed);
699	expr = R_GOTPLT;
700	break;
701	case R_X86_64_PLTOFF64:
702	ctx.in.gotPlt ->hasGotPltOffRel.store(i: true, m: std::memory_order_relaxed);
703	expr = R_PLT_GOTPLT;
704	break;
705	case R_X86_64_GOTPCREL:
706	case R_X86_64_GOTPCRELX:
707	case R_X86_64_REX_GOTPCRELX:
708	case R_X86_64_CODE_4_GOTPCRELX:
709	expr = R_GOT_PC;
710	break;
711
712	// PLT-generating relocation:
713	case R_X86_64_PLT32:
714	rs.processR_PLT_PC(type, offset, addend, sym);
715	continue;
716
717	// TLS relocations:
718	case R_X86_64_TPOFF32:
719	case R_X86_64_TPOFF64:
720	if (rs.checkTlsLe(offset, sym, type))
721	continue;
722	expr = R_TPREL;
723	break;
724	case R_X86_64_GOTTPOFF:
725	case R_X86_64_CODE_4_GOTTPOFF:
726	case R_X86_64_CODE_6_GOTTPOFF:
727	rs.handleTlsIe(ieExpr: R_GOT_PC, type, offset, addend, sym);
728	continue;
729	case R_X86_64_TLSGD:
730	if (rs.handleTlsGd(sharedExpr: R_TLSGD_PC, ieExpr: R_GOT_PC, leExpr: R_TPREL, type, offset, addend,
731	sym))
732	++it;
733	continue;
734	case R_X86_64_TLSLD:
735	if (rs.handleTlsLd(sharedExpr: R_TLSLD_PC, type, offset, addend, sym))
736	++it;
737	continue;
738	case R_X86_64_DTPOFF32:
739	case R_X86_64_DTPOFF64:
740	sec.addReloc(
741	r: {.expr: ctx.arg.shared ? R_DTPREL : R_TPREL, .type: type, .offset: offset, .addend: addend, .sym: &sym});
742	continue;
743	case R_X86_64_TLSDESC_CALL:
744	// For executables, TLSDESC is optimized to IE or LE. Use R_TPREL as the
745	// rewrites for this relocation are identical.
746	if (!ctx.arg.shared)
747	sec.addReloc(r: {.expr: R_TPREL, .type: type, .offset: offset, .addend: addend, .sym: &sym});
748	continue;
749	case R_X86_64_GOTPC32_TLSDESC:
750	case R_X86_64_CODE_4_GOTPC32_TLSDESC:
751	rs.handleTlsDesc(sharedExpr: R_TLSDESC_PC, ieExpr: R_GOT_PC, type, offset, addend, sym);
752	continue;
753
754	// Misc relocations:
755	case R_X86_64_SIZE32:
756	case R_X86_64_SIZE64:
757	expr = R_SIZE;
758	break;
759
760	default:
761	Err(ctx) << getErrorLoc(ctx, loc: sec.content().data() + offset)
762	<< "unknown relocation (" << type.v << ") against symbol "
763	<< &sym;
764	continue;
765	}
766	rs.process(expr, type, offset, sym, addend);
767	}
768
769	if (ctx.arg.branchToBranch)
770	llvm::stable_sort(sec.relocs(),
771	[](auto &l, auto &r) { return l.offset < r.offset; });
772	}
773
774	void X86_64::scanSection(InputSectionBase &sec, unsigned shard) {
775	if (ctx.arg.is64)
776	elf::scanSection1<X86_64, ELF64LE>(target&: *this, sec, shard);
777	else // ilp32
778	elf::scanSection1<X86_64, ELF32LE>(target&: *this, sec, shard);
779	}
780
781	void X86_64::relaxTlsGdToLe(uint8_t loc, const* Relocation &rel,
782	uint64_t val) const {
783	if (rel.type == R_X86_64_TLSGD) {
784	// Convert
785	// .byte 0x66
786	// leaq x@tlsgd(%rip), %rdi
787	// .word 0x6666
788	// rex64
789	// call __tls_get_addr@plt
790	// to the following two instructions.
791	const uint8_t inst[] = {
792	`0x64`, `0x48`, `0x8b`, `0x04`, `0x25`, `0x00`, `0x00`,
793	`0x00`, `0x00`, // mov %fs:0x0,%rax
794	`0x48`, `0x8d`, `0x80`, `0`, `0`, `0`, `0`, // lea x@tpoff,%rax
795	};
796	memcpy(dest: loc - `4`, src: inst, n: sizeof(inst));
797
798	// The original code used a pc relative relocation and so we have to
799	// compensate for the -4 in had in the addend.
800	write32le(P: loc + `8`, V: val + `4`);
801	} else if (rel.type == R_X86_64_GOTPC32_TLSDESC \|\|
802	rel.type == R_X86_64_CODE_4_GOTPC32_TLSDESC) {
803	// Convert leaq x@tlsdesc(%rip), %REG to movq $x@tpoff, %REG.
804	if ((loc[-`3`] & `0xfb`) != `0x48` \|\| loc[-`2`] != `0x8d` \|\|
805	(loc[-`1`] & `0xc7`) != `0x05`) {
806	Err(ctx) << getErrorLoc(ctx, loc: (rel.type == R_X86_64_GOTPC32_TLSDESC)
807	? loc - `3`
808	: loc - `4`)
809	<< "R_X86_64_GOTPC32_TLSDESC/R_X86_64_CODE_4_GOTPC32_TLSDESC "
810	"must be used in leaq x@tlsdesc(%rip), %REG";
811	return;
812	}
813	if (rel.type == R_X86_64_GOTPC32_TLSDESC) {
814	loc[-`3`] = `0x48` \| ((loc[-`3`] >> `2`) & `1`);
815	} else {
816	loc[-`3`] = (loc[-`3`] & ~`0x44`) \| ((loc[-`3`] & `0x44`) >> `2`);
817	}
818	loc[-`2`] = `0xc7`;
819	loc[-`1`] = `0xc0` \| ((loc[-`1`] >> `3`) & `7`);
820
821	write32le(P: loc, V: val + `4`);
822	} else {
823	// Convert call x@tlsdesc(%REG) to xchg ax, ax.*
824	assert(rel.type == R_X86_64_TLSDESC_CALL);
825	loc[`0`] = `0x66`;
826	loc[`1`] = `0x90`;
827	}
828	}
829
830	void X86_64::relaxTlsGdToIe(uint8_t loc, const* Relocation &rel,
831	uint64_t val) const {
832	if (rel.type == R_X86_64_TLSGD) {
833	// Convert
834	// .byte 0x66
835	// leaq x@tlsgd(%rip), %rdi
836	// .word 0x6666
837	// rex64
838	// call __tls_get_addr@plt
839	// to the following two instructions.
840	const uint8_t inst[] = {
841	`0x64`, `0x48`, `0x8b`, `0x04`, `0x25`, `0x00`, `0x00`,
842	`0x00`, `0x00`, // mov %fs:0x0,%rax
843	`0x48`, `0x03`, `0x05`, `0`, `0`, `0`, `0`, // addq x@gottpoff(%rip),%rax
844	};
845	memcpy(dest: loc - `4`, src: inst, n: sizeof(inst));
846
847	// Both code sequences are PC relatives, but since we are moving the
848	// constant forward by 8 bytes we have to subtract the value by 8.
849	write32le(P: loc + `8`, V: val - `8`);
850	} else if (rel.type == R_X86_64_GOTPC32_TLSDESC \|\|
851	rel.type == R_X86_64_CODE_4_GOTPC32_TLSDESC) {
852	// Convert leaq x@tlsdesc(%rip), %REG to movq x@gottpoff(%rip), %REG.
853	if ((loc[-`3`] & `0xfb`) != `0x48` \|\| loc[-`2`] != `0x8d` \|\|
854	(loc[-`1`] & `0xc7`) != `0x05`) {
855	Err(ctx) << getErrorLoc(ctx, loc: (rel.type == R_X86_64_GOTPC32_TLSDESC)
856	? loc - `3`
857	: loc - `4`)
858	<< "R_X86_64_GOTPC32_TLSDESC/R_X86_64_CODE_4_GOTPC32_TLSDESC "
859	"must be used in leaq x@tlsdesc(%rip), %REG";
860	return;
861	}
862	loc[-`2`] = `0x8b`;
863	write32le(P: loc, V: val);
864	}
865	}
866
867	// In some conditions,
868	// R_X86_64_GOTTPOFF/R_X86_64_CODE_4_GOTTPOFF/R_X86_64_CODE_6_GOTTPOFF
869	// relocation can be optimized to R_X86_64_TPOFF32 so that it does not use GOT.
870	void X86_64::relaxTlsIeToLe(uint8_t loc, const* Relocation &rel,
871	uint64_t val) const {
872	uint8_t *inst = loc - `3`;
873	uint8_t reg = loc[-`1`] >> `3`;
874	uint8_t *regSlot = loc - `1`;
875
876	if (rel.type == R_X86_64_GOTTPOFF) {
877	// Note that ADD with RSP or R12 is converted to ADD instead of LEA
878	// because LEA with these registers needs 4 bytes to encode and thus
879	// wouldn't fit the space.
880
881	if (memcmp(s1: inst, s2: "\x48\x03\x25", n: `3`) == `0`) {
882	// "addq foo@gottpoff(%rip),%rsp" -> "addq $foo,%rsp"
883	memcpy(dest: inst, src: "\x48\x81\xc4", n: `3`);
884	} else if (memcmp(s1: inst, s2: "\x4c\x03\x25", n: `3`) == `0`) {
885	// "addq foo@gottpoff(%rip),%r12" -> "addq $foo,%r12"
886	memcpy(dest: inst, src: "\x49\x81\xc4", n: `3`);
887	} else if (memcmp(s1: inst, s2: "\x4c\x03", n: `2`) == `0`) {
888	// "addq foo@gottpoff(%rip),%r[8-15]" -> "leaq foo(%r[8-15]),%r[8-15]"
889	memcpy(dest: inst, src: "\x4d\x8d", n: `2`);
890	*regSlot = `0x80` \| (reg << `3`) \| reg;
891	} else if (memcmp(s1: inst, s2: "\x48\x03", n: `2`) == `0`) {
892	// "addq foo@gottpoff(%rip),%reg -> "leaq foo(%reg),%reg"
893	memcpy(dest: inst, src: "\x48\x8d", n: `2`);
894	*regSlot = `0x80` \| (reg << `3`) \| reg;
895	} else if (memcmp(s1: inst, s2: "\x4c\x8b", n: `2`) == `0`) {
896	// "movq foo@gottpoff(%rip),%r[8-15]" -> "movq $foo,%r[8-15]"
897	memcpy(dest: inst, src: "\x49\xc7", n: `2`);
898	*regSlot = `0xc0` \| reg;
899	} else if (memcmp(s1: inst, s2: "\x48\x8b", n: `2`) == `0`) {
900	// "movq foo@gottpoff(%rip),%reg" -> "movq $foo,%reg"
901	memcpy(dest: inst, src: "\x48\xc7", n: `2`);
902	*regSlot = `0xc0` \| reg;
903	} else {
904	Err(ctx)
905	<< getErrorLoc(ctx, loc: loc - `3`)
906	<< "R_X86_64_GOTTPOFF must be used in MOVQ or ADDQ instructions only";
907	}
908	} else if (rel.type == R_X86_64_CODE_4_GOTTPOFF) {
909	if (loc[-`4`] != `0xd5`) {
910	Err(ctx) << getErrorLoc(ctx, loc: loc - `4`)
911	<< "invalid prefix with R_X86_64_CODE_4_GOTTPOFF!";
912	return;
913	}
914	const uint8_t rex = loc[-`3`];
915	loc[-`3`] = (rex & ~`0x44`) \| (rex & `0x44`) >> `2`;
916	*regSlot = `0xc0` \| reg;
917
918	if (loc[-`2`] == `0x8b`) {
919	// "movq foo@gottpoff(%rip),%r[16-31]" -> "movq $foo,%r[16-31]"
920	loc[-`2`] = `0xc7`;
921	} else if (loc[-`2`] == `0x03`) {
922	// "addq foo@gottpoff(%rip),%r[16-31]" -> "addq $foo,%r[16-31]"
923	loc[-`2`] = `0x81`;
924	} else {
925	Err(ctx) << getErrorLoc(ctx, loc: loc - `4`)
926	<< "R_X86_64_CODE_4_GOTTPOFF must be used in MOVQ or ADDQ "
927	"instructions only";
928	}
929	} else if (rel.type == R_X86_64_CODE_6_GOTTPOFF) {
930	if (loc[-`6`] != `0x62`) {
931	Err(ctx) << getErrorLoc(ctx, loc: loc - `6`)
932	<< "invalid prefix with R_X86_64_CODE_6_GOTTPOFF!";
933	return;
934	}
935	// Check bits are satisfied:
936	// loc[-5]: X==1 (inverted polarity), (loc[-5] & 0x7) == 0x4
937	// loc[-4]: W==1, X2==1 (inverted polarity), pp==0b00(NP)
938	// loc[-3]: NF==1 or ND==1
939	// loc[-2]: opcode==0x1 or opcode==0x3
940	// loc[-1]: Mod==0b00, RM==0b101
941	if (((loc[-`5`] & `0x47`) == `0x44`) && ((loc[-`4`] & `0x87`) == `0x84`) &&
942	((loc[-`3`] & `0x14`) != `0`) && (loc[-`2`] == `0x1` \|\| loc[-`2`] == `0x3`) &&
943	((loc[-`1`] & `0xc7`) == `0x5`)) {
944	// "addq %reg1, foo@GOTTPOFF(%rip), %reg2" -> "addq $foo, %reg1, %reg2"
945	// "addq foo@GOTTPOFF(%rip), %reg1, %reg2" -> "addq $foo, %reg1, %reg2"
946	// "{nf} addq %reg1, foo@GOTTPOFF(%rip), %reg2"
947	// -> "{nf} addq $foo, %reg1, %reg2"
948	// "{nf} addq name@GOTTPOFF(%rip), %reg1, %reg2"
949	// -> "{nf} addq $foo, %reg1, %reg2"
950	// "{nf} addq name@GOTTPOFF(%rip), %reg" -> "{nf} addq $foo, %reg"
951	loc[-`2`] = `0x81`;
952	// Move R bits to B bits in EVEX payloads and ModRM byte.
953	const uint8_t evexPayload0 = loc[-`5`];
954	if ((evexPayload0 & (`1` << `7`)) == `0`)
955	loc[-`5`] = (evexPayload0 \| (`1` << `7`)) & ~(`1` << `5`);
956	if ((evexPayload0 & (`1` << `4`)) == `0`)
957	loc[-`5`] = evexPayload0 \| (`1` << `4`) \| (`1` << `3`);
958	*regSlot = `0xc0` \| reg;
959	} else {
960	Err(ctx) << getErrorLoc(ctx, loc: loc - `6`)
961	<< "R_X86_64_CODE_6_GOTTPOFF must be used in ADDQ instructions "
962	"with NDD/NF/NDD+NF only";
963	}
964	} else {
965	llvm_unreachable("Unsupported relocation type!");
966	}
967
968	// The original code used a PC relative relocation.
969	// Need to compensate for the -4 it had in the addend.
970	write32le(P: loc, V: val + `4`);
971	}
972
973	void X86_64::relaxTlsLdToLe(uint8_t loc, const* Relocation &rel,
974	uint64_t val) const {
975	const uint8_t inst[] = {
976	`0x66`, `0x66`, // .word 0x6666
977	`0x66`, // .byte 0x66
978	`0x64`, `0x48`, `0x8b`, `0x04`, `0x25`, `0x00`, `0x00`, `0x00`, `0x00`, // mov %fs:0,%rax
979	};
980
981	if (loc[`4`] == `0xe8`) {
982	// Convert
983	// leaq bar@tlsld(%rip), %rdi # 48 8d 3d <Loc>
984	// callq __tls_get_addr@PLT # e8 <disp32>
985	// leaq bar@dtpoff(%rax), %rcx
986	// to
987	// .word 0x6666
988	// .byte 0x66
989	// mov %fs:0,%rax
990	// leaq bar@tpoff(%rax), %rcx
991	memcpy(dest: loc - `3`, src: inst, n: sizeof(inst));
992	return;
993	}
994
995	if (loc[`4`] == `0xff` && loc[`5`] == `0x15`) {
996	// Convert
997	// leaq x@tlsld(%rip),%rdi # 48 8d 3d <Loc>
998	// call __tls_get_addr@GOTPCREL(%rip) # ff 15 <disp32>*
999	// to
1000	// .long 0x66666666
1001	// movq %fs:0,%rax
1002	// See "Table 11.9: LD -> LE Code Transition (LP64)" in
1003	// https://raw.githubusercontent.com/wiki/hjl-tools/x86-psABI/x86-64-psABI-1.0.pdf
1004	loc[-`3`] = `0x66`;
1005	memcpy(dest: loc - `2`, src: inst, n: sizeof(inst));
1006	return;
1007	}
1008
1009	ErrAlways(ctx)
1010	<< getErrorLoc(ctx, loc: loc - `3`)
1011	<< "expected R_X86_64_PLT32 or R_X86_64_GOTPCRELX after R_X86_64_TLSLD";
1012	}
1013
1014	// A JumpInstrMod at a specific offset indicates that the jump instruction
1015	// opcode at that offset must be modified. This is specifically used to relax
1016	// jump instructions with basic block sections. This function looks at the
1017	// JumpMod and effects the change.
1018	void X86_64::applyJumpInstrMod(uint8_t *loc, JumpModType type,
1019	unsigned size) const {
1020	switch (type) {
1021	case J_JMP_32:
1022	if (size == `4`)
1023	*loc = `0xe9`;
1024	else
1025	*loc = `0xeb`;
1026	break;
1027	case J_JE_32:
1028	if (size == `4`) {
1029	loc[-`1`] = `0x0f`;
1030	*loc = `0x84`;
1031	} else
1032	*loc = `0x74`;
1033	break;
1034	case J_JNE_32:
1035	if (size == `4`) {
1036	loc[-`1`] = `0x0f`;
1037	*loc = `0x85`;
1038	} else
1039	*loc = `0x75`;
1040	break;
1041	case J_JG_32:
1042	if (size == `4`) {
1043	loc[-`1`] = `0x0f`;
1044	*loc = `0x8f`;
1045	} else
1046	*loc = `0x7f`;
1047	break;
1048	case J_JGE_32:
1049	if (size == `4`) {
1050	loc[-`1`] = `0x0f`;
1051	*loc = `0x8d`;
1052	} else
1053	*loc = `0x7d`;
1054	break;
1055	case J_JB_32:
1056	if (size == `4`) {
1057	loc[-`1`] = `0x0f`;
1058	*loc = `0x82`;
1059	} else
1060	*loc = `0x72`;
1061	break;
1062	case J_JBE_32:
1063	if (size == `4`) {
1064	loc[-`1`] = `0x0f`;
1065	*loc = `0x86`;
1066	} else
1067	*loc = `0x76`;
1068	break;
1069	case J_JL_32:
1070	if (size == `4`) {
1071	loc[-`1`] = `0x0f`;
1072	*loc = `0x8c`;
1073	} else
1074	*loc = `0x7c`;
1075	break;
1076	case J_JLE_32:
1077	if (size == `4`) {
1078	loc[-`1`] = `0x0f`;
1079	*loc = `0x8e`;
1080	} else
1081	*loc = `0x7e`;
1082	break;
1083	case J_JA_32:
1084	if (size == `4`) {
1085	loc[-`1`] = `0x0f`;
1086	*loc = `0x87`;
1087	} else
1088	*loc = `0x77`;
1089	break;
1090	case J_JAE_32:
1091	if (size == `4`) {
1092	loc[-`1`] = `0x0f`;
1093	*loc = `0x83`;
1094	} else
1095	*loc = `0x73`;
1096	break;
1097	case J_UNKNOWN:
1098	llvm_unreachable("Unknown Jump Relocation");
1099	}
1100	}
1101
1102	int64_t X86_64::getImplicitAddend(const uint8_t buf, RelType type) const* {
1103	switch (type) {
1104	case R_X86_64_8:
1105	case R_X86_64_PC8:
1106	return SignExtend64<`8`>(x: *buf);
1107	case R_X86_64_16:
1108	case R_X86_64_PC16:
1109	return SignExtend64<`16`>(x: read16le(P: buf));
1110	case R_X86_64_32:
1111	case R_X86_64_32S:
1112	case R_X86_64_TPOFF32:
1113	case R_X86_64_GOT32:
1114	case R_X86_64_GOTPC32:
1115	case R_X86_64_GOTPC32_TLSDESC:
1116	case R_X86_64_GOTPCREL:
1117	case R_X86_64_GOTPCRELX:
1118	case R_X86_64_REX_GOTPCRELX:
1119	case R_X86_64_CODE_4_GOTPCRELX:
1120	case R_X86_64_PC32:
1121	case R_X86_64_GOTTPOFF:
1122	case R_X86_64_CODE_4_GOTTPOFF:
1123	case R_X86_64_CODE_6_GOTTPOFF:
1124	case R_X86_64_PLT32:
1125	case R_X86_64_TLSGD:
1126	case R_X86_64_TLSLD:
1127	case R_X86_64_DTPOFF32:
1128	case R_X86_64_SIZE32:
1129	return SignExtend64<`32`>(x: read32le(P: buf));
1130	case R_X86_64_64:
1131	case R_X86_64_TPOFF64:
1132	case R_X86_64_DTPOFF64:
1133	case R_X86_64_DTPMOD64:
1134	case R_X86_64_PC64:
1135	case R_X86_64_SIZE64:
1136	case R_X86_64_GLOB_DAT:
1137	case R_X86_64_GOT64:
1138	case R_X86_64_GOTOFF64:
1139	case R_X86_64_GOTPC64:
1140	case R_X86_64_PLTOFF64:
1141	case R_X86_64_IRELATIVE:
1142	case R_X86_64_RELATIVE:
1143	return read64le(P: buf);
1144	case R_X86_64_TLSDESC:
1145	return read64le(P: buf + `8`);
1146	case R_X86_64_JUMP_SLOT:
1147	case R_X86_64_NONE:
1148	// These relocations are defined as not having an implicit addend.
1149	return `0`;
1150	default:
1151	InternalErr(ctx, buf) << "cannot read addend for relocation " << type;
1152	return `0`;
1153	}
1154	}
1155
1156	static void relaxGot(uint8_t loc, const* Relocation &rel, uint64_t val);
1157
1158	void X86_64::relocate(uint8_t loc, const* Relocation &rel, uint64_t val) const {
1159	switch (rel.type) {
1160	case R_X86_64_8:
1161	checkIntUInt(ctx, loc, v: val, n: `8`, rel);
1162	*loc = val;
1163	break;
1164	case R_X86_64_PC8:
1165	checkInt(ctx, loc, v: val, n: `8`, rel);
1166	*loc = val;
1167	break;
1168	case R_X86_64_16:
1169	checkIntUInt(ctx, loc, v: val, n: `16`, rel);
1170	write16le(P: loc, V: val);
1171	break;
1172	case R_X86_64_PC16:
1173	checkInt(ctx, loc, v: val, n: `16`, rel);
1174	write16le(P: loc, V: val);
1175	break;
1176	case R_X86_64_32:
1177	checkUInt(ctx, loc, v: val, n: `32`, rel);
1178	write32le(P: loc, V: val);
1179	break;
1180	case R_X86_64_32S:
1181	case R_X86_64_GOT32:
1182	case R_X86_64_GOTPC32:
1183	case R_X86_64_GOTPCREL:
1184	case R_X86_64_PC32:
1185	case R_X86_64_PLT32:
1186	case R_X86_64_DTPOFF32:
1187	case R_X86_64_SIZE32:
1188	checkInt(ctx, loc, v: val, n: `32`, rel);
1189	write32le(P: loc, V: val);
1190	break;
1191	case R_X86_64_64:
1192	case R_X86_64_TPOFF64:
1193	case R_X86_64_DTPOFF64:
1194	case R_X86_64_PC64:
1195	case R_X86_64_SIZE64:
1196	case R_X86_64_GOT64:
1197	case R_X86_64_GOTOFF64:
1198	case R_X86_64_GOTPC64:
1199	case R_X86_64_PLTOFF64:
1200	write64le(P: loc, V: val);
1201	break;
1202	case R_X86_64_GOTPCRELX:
1203	case R_X86_64_REX_GOTPCRELX:
1204	case R_X86_64_CODE_4_GOTPCRELX:
1205	if (rel.expr != R_GOT_PC) {
1206	relaxGot(loc, rel, val);
1207	} else {
1208	checkInt(ctx, loc, v: val, n: `32`, rel);
1209	write32le(P: loc, V: val);
1210	}
1211	break;
1212	case R_X86_64_GOTPC32_TLSDESC:
1213	case R_X86_64_CODE_4_GOTPC32_TLSDESC:
1214	case R_X86_64_TLSDESC_CALL:
1215	case R_X86_64_TLSGD:
1216	if (rel.expr == R_TPREL) {
1217	relaxTlsGdToLe(loc, rel, val);
1218	} else if (rel.expr == R_GOT_PC) {
1219	relaxTlsGdToIe(loc, rel, val);
1220	} else {
1221	checkInt(ctx, loc, v: val, n: `32`, rel);
1222	write32le(P: loc, V: val);
1223	}
1224	break;
1225	case R_X86_64_TLSLD:
1226	if (rel.expr == R_TPREL) {
1227	relaxTlsLdToLe(loc, rel, val);
1228	} else {
1229	checkInt(ctx, loc, v: val, n: `32`, rel);
1230	write32le(P: loc, V: val);
1231	}
1232	break;
1233	case R_X86_64_GOTTPOFF:
1234	case R_X86_64_CODE_4_GOTTPOFF:
1235	case R_X86_64_CODE_6_GOTTPOFF:
1236	if (rel.expr == R_TPREL) {
1237	relaxTlsIeToLe(loc, rel, val);
1238	} else {
1239	checkInt(ctx, loc, v: val, n: `32`, rel);
1240	write32le(P: loc, V: val);
1241	}
1242	break;
1243	case R_X86_64_TPOFF32:
1244	checkInt(ctx, loc, v: val, n: `32`, rel);
1245	write32le(P: loc, V: val);
1246	break;
1247
1248	case R_X86_64_TLSDESC:
1249	// The addend is stored in the second 64-bit word.
1250	write64le(P: loc + `8`, V: val);
1251	break;
1252	default:
1253	llvm_unreachable("unknown relocation");
1254	}
1255	}
1256
1257	RelExpr X86_64::adjustGotPcExpr(RelType type, int64_t addend,
1258	const uint8_t loc) const* {
1259	// Only R_X86_64_[REX_]\|[CODE_4_]GOTPCRELX can be relaxed. GNU as may emit
1260	// GOTPCRELX with addend != -4. Such an instruction does not load the full GOT
1261	// entry, so we cannot relax the relocation. E.g. movl x@GOTPCREL+4(%rip),
1262	// %rax (addend=0) loads the high 32 bits of the GOT entry.
1263	if (!ctx.arg.relax \|\| addend != -`4` \|\|
1264	(type != R_X86_64_GOTPCRELX && type != R_X86_64_REX_GOTPCRELX &&
1265	type != R_X86_64_CODE_4_GOTPCRELX))
1266	return R_GOT_PC;
1267	const uint8_t op = loc[-`2`];
1268	const uint8_t modRm = loc[-`1`];
1269
1270	// FIXME: When PIC is disabled and foo is defined locally in the
1271	// lower 32 bit address space, memory operand in mov can be converted into
1272	// immediate operand. Otherwise, mov must be changed to lea. We support only
1273	// latter relaxation at this moment.
1274	if (op == `0x8b`)
1275	return R_RELAX_GOT_PC;
1276
1277	// Relax call and jmp.
1278	if (op == `0xff` && (modRm == `0x15` \|\| modRm == `0x25`))
1279	return R_RELAX_GOT_PC;
1280
1281	// We don't support test/binop instructions without a REX/REX2 prefix.
1282	if (type == R_X86_64_GOTPCRELX)
1283	return R_GOT_PC;
1284
1285	// Relaxation of test, adc, add, and, cmp, or, sbb, sub, xor.
1286	// If PIC then no relaxation is available.
1287	return ctx.arg.isPic ? R_GOT_PC : R_RELAX_GOT_PC_NOPIC;
1288	}
1289
1290	// A subset of relaxations can only be applied for no-PIC. This method
1291	// handles such relaxations. Instructions encoding information was taken from:
1292	// "Intel 64 and IA-32 Architectures Software Developer's Manual V2"
1293	// (http://www.intel.com/content/dam/www/public/us/en/documents/manuals/
1294	// 64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf)
1295	static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op, uint8_t modRm,
1296	bool isRex2) {
1297	const uint8_t rex = loc[-`3`];
1298	// Convert "test %reg, foo@GOTPCREL(%rip)" to "test $foo, %reg".
1299	if (op == `0x85`) {
1300	// See "TEST-Logical Compare" (4-428 Vol. 2B),
1301	// TEST r/m64, r64 uses "full" ModR / M byte (no opcode extension).
1302
1303	// ModR/M byte has form XX YYY ZZZ, where
1304	// YYY is MODRM.reg(register 2), ZZZ is MODRM.rm(register 1).
1305	// XX has different meanings:
1306	// 00: The operand's memory address is in reg1.
1307	// 01: The operand's memory address is reg1 + a byte-sized displacement.
1308	// 10: The operand's memory address is reg1 + a word-sized displacement.
1309	// 11: The operand is reg1 itself.
1310	// If an instruction requires only one operand, the unused reg2 field
1311	// holds extra opcode bits rather than a register code
1312	// 0xC0 == 11 000 000 binary.
1313	// 0x38 == 00 111 000 binary.
1314	// We transfer reg2 to reg1 here as operand.
1315	// See "2.1.3 ModR/M and SIB Bytes" (Vol. 2A 2-3).
1316	loc[-`1`] = `0xc0` \| (modRm & `0x38`) >> `3`; // ModR/M byte.
1317
1318	// Change opcode from TEST r/m64, r64 to TEST r/m64, imm32
1319	// See "TEST-Logical Compare" (4-428 Vol. 2B).
1320	loc[-`2`] = `0xf7`;
1321
1322	// Move R bit to the B bit in REX/REX2 byte.
1323	// REX byte is encoded as 0100WRXB, where
1324	// 0100 is 4bit fixed pattern.
1325	// REX.W When 1, a 64-bit operand size is used. Otherwise, when 0, the
1326	// default operand size is used (which is 32-bit for most but not all
1327	// instructions).
1328	// REX.R This 1-bit value is an extension to the MODRM.reg field.
1329	// REX.X This 1-bit value is an extension to the SIB.index field.
1330	// REX.B This 1-bit value is an extension to the MODRM.rm field or the
1331	// SIB.base field.
1332	// See "2.2.1.2 More on REX Prefix Fields " (2-8 Vol. 2A).
1333	//
1334	// REX2 prefix is encoded as 0xd5\|M\|R2\|X2\|B2\|WRXB, where
1335	// 0xd5 is 1byte fixed pattern.
1336	// REX2's [W,R,X,B] have the same meanings as REX's.
1337	// REX2.M encodes the map id.
1338	// R2/X2/B2 provides the fifth and most siginicant bits of the R/X/B
1339	// register identifiers, each of which can now address all 32 GPRs.
1340	if (isRex2)
1341	loc[-`3`] = (rex & ~`0x44`) \| (rex & `0x44`) >> `2`;
1342	else
1343	loc[-`3`] = (rex & ~`0x4`) \| (rex & `0x4`) >> `2`;
1344	write32le(P: loc, V: val);
1345	return;
1346	}
1347
1348	// If we are here then we need to relax the adc, add, and, cmp, or, sbb, sub
1349	// or xor operations.
1350
1351	// Convert "binop foo@GOTPCREL(%rip), %reg" to "binop $foo, %reg".
1352	// Logic is close to one for test instruction above, but we also
1353	// write opcode extension here, see below for details.
1354	loc[-`1`] = `0xc0` \| (modRm & `0x38`) >> `3` \| (op & `0x3c`); // ModR/M byte.
1355
1356	// Primary opcode is 0x81, opcode extension is one of:
1357	// 000b = ADD, 001b is OR, 010b is ADC, 011b is SBB,
1358	// 100b is AND, 101b is SUB, 110b is XOR, 111b is CMP.
1359	// This value was wrote to MODRM.reg in a line above.
1360	// See "3.2 INSTRUCTIONS (A-M)" (Vol. 2A 3-15),
1361	// "INSTRUCTION SET REFERENCE, N-Z" (Vol. 2B 4-1) for
1362	// descriptions about each operation.
1363	loc[-`2`] = `0x81`;
1364	if (isRex2)
1365	loc[-`3`] = (rex & ~`0x44`) \| (rex & `0x44`) >> `2`;
1366	else
1367	loc[-`3`] = (rex & ~`0x4`) \| (rex & `0x4`) >> `2`;
1368	write32le(P: loc, V: val);
1369	}
1370
1371	static void relaxGot(uint8_t loc, const* Relocation &rel, uint64_t val) {
1372	assert(isInt<`32`>(val) &&
1373	"GOTPCRELX should not have been relaxed if it overflows");
1374	const uint8_t op = loc[-`2`];
1375	const uint8_t modRm = loc[-`1`];
1376
1377	// Convert "mov foo@GOTPCREL(%rip),%reg" to "lea foo(%rip),%reg".
1378	if (op == `0x8b`) {
1379	loc[-`2`] = `0x8d`;
1380	write32le(P: loc, V: val);
1381	return;
1382	}
1383
1384	if (op != `0xff`) {
1385	// We are relaxing a rip relative to an absolute, so compensate
1386	// for the old -4 addend.
1387	assert(!rel.sym->file->ctx.arg.isPic);
1388	relaxGotNoPic(loc, val: val + `4`, op, modRm,
1389	isRex2: rel.type == R_X86_64_CODE_4_GOTPCRELX);
1390	return;
1391	}
1392
1393	// Convert call/jmp instructions.
1394	if (modRm == `0x15`) {
1395	// ABI says we can convert "call foo@GOTPCREL(%rip)" to "nop; call foo".*
1396	// Instead we convert to "addr32 call foo" where addr32 is an instruction
1397	// prefix. That makes result expression to be a single instruction.
1398	loc[-`2`] = `0x67`; // addr32 prefix
1399	loc[-`1`] = `0xe8`; // call
1400	write32le(P: loc, V: val);
1401	return;
1402	}
1403
1404	// Convert "jmp foo@GOTPCREL(%rip)" to "jmp foo; nop".*
1405	// jmp doesn't return, so it is fine to use nop here, it is just a stub.
1406	assert(modRm == `0x25`);
1407	loc[-`2`] = `0xe9`; // jmp
1408	loc[`3`] = `0x90`; // nop
1409	write32le(P: loc - `1`, V: val + `1`);
1410	}
1411
1412	// A split-stack prologue starts by checking the amount of stack remaining
1413	// in one of two ways:
1414	// A) Comparing of the stack pointer to a field in the tcb.
1415	// B) Or a load of a stack pointer offset with an lea to r10 or r11.
1416	bool X86_64::adjustPrologueForCrossSplitStack(uint8_t loc, uint8_t end,
1417	uint8_t stOther) const {
1418	if (!ctx.arg.is64) {
1419	ErrAlways(ctx) << "target doesn't support split stacks";
1420	return false;
1421	}
1422
1423	if (loc + `8` >= end)
1424	return false;
1425
1426	// Replace "cmp %fs:0x70,%rsp" and subsequent branch
1427	// with "stc, nopl 0x0(%rax,%rax,1)"
1428	if (memcmp(s1: loc, s2: "\x64\x48\x3b\x24\x25", n: `5`) == `0`) {
1429	memcpy(dest: loc, src: "\xf9\x0f\x1f\x84\x00\x00\x00\x00", n: `8`);
1430	return true;
1431	}
1432
1433	// Adjust "lea X(%rsp),%rYY" to lea "(X - 0x4000)(%rsp),%rYY" where rYY could
1434	// be r10 or r11. The lea instruction feeds a subsequent compare which checks
1435	// if there is X available stack space. Making X larger effectively reserves
1436	// that much additional space. The stack grows downward so subtract the value.
1437	if (memcmp(s1: loc, s2: "\x4c\x8d\x94\x24", n: `4`) == `0` \|\|
1438	memcmp(s1: loc, s2: "\x4c\x8d\x9c\x24", n: `4`) == `0`) {
1439	// The offset bytes are encoded four bytes after the start of the
1440	// instruction.
1441	write32le(P: loc + `4`, V: read32le(P: loc + `4`) - `0x4000`);
1442	return true;
1443	}
1444	return false;
1445	}
1446
1447	void X86_64::relocateAlloc(InputSection &sec, uint8_t buf) const* {
1448	uint64_t secAddr = sec.getOutputSection()->addr + sec.outSecOff;
1449	for (const Relocation &rel : sec.relocs()) {
1450	if (rel.expr == R_NONE) // See deleteFallThruJmpInsn
1451	continue;
1452	uint8_t *loc = buf + rel.offset;
1453	const uint64_t val = sec.getRelocTargetVA(ctx, r: rel, p: secAddr + rel.offset);
1454	relocate(loc, rel, val);
1455	}
1456	if (sec.jumpInstrMod) {
1457	applyJumpInstrMod(loc: buf + sec.jumpInstrMod->offset,
1458	type: sec.jumpInstrMod->original, size: sec.jumpInstrMod->size);
1459	}
1460	}
1461
1462	static std::optional<uint64_t> getControlTransferAddend(InputSection &is,
1463	Relocation &r) {
1464	// Identify a control transfer relocation for the branch-to-branch
1465	// optimization. A "control transfer relocation" usually means a CALL or JMP
1466	// target but it also includes relative vtable relocations for example.
1467	//
1468	// We require the relocation type to be PLT32. With a relocation type of PLT32
1469	// the value may be assumed to be used for branching directly to the symbol
1470	// and the addend is only used to produce the relocated value (hence the
1471	// effective addend is always 0). This is because if a PLT is needed the
1472	// addend will be added to the address of the PLT, and it doesn't make sense
1473	// to branch into the middle of a PLT. For example, relative vtable
1474	// relocations use PLT32 and 0 or a positive value as the addend but still are
1475	// used to branch to the symbol.
1476	//
1477	// STT_SECTION symbols are a special case on x86 because the LLVM assembler
1478	// uses them for branches to local symbols which are assembled as referring to
1479	// the section symbol with the addend equal to the symbol value - 4.
1480	if (r.type == R_X86_64_PLT32) {
1481	if (r.sym->isSection())
1482	return r.addend + `4`;
1483	return `0`;
1484	}
1485	return std::nullopt;
1486	}
1487
1488	static std::pair<Relocation *, uint64_t>
1489	getBranchInfoAtTarget(InputSection &is, uint64_t offset) {
1490	auto content = is.contentMaybeDecompress();
1491	if (content.size() > offset && content [offset] == `0xe9`) { // JMP immediate
1492	auto *i = llvm::partition_point(
1493	Range&: is.relocations, P: [&](Relocation &r) { return r.offset < offset + `1`; });
1494	// Unlike with getControlTransferAddend() it is valid to accept a PC32
1495	// relocation here because we know that this is actually a JMP and not some
1496	// other reference, so the interpretation is that we add 4 to the addend and
1497	// use that as the effective addend.
1498	if (i != is.relocations.end() && i->offset == offset + `1` &&
1499	(i->type == R_X86_64_PC32 \|\| i->type == R_X86_64_PLT32)) {
1500	return {i, i->addend + `4`};
1501	}
1502	}
1503	return {nullptr, `0`};
1504	}
1505
1506	static void redirectControlTransferRelocations(Relocation &r1,
1507	const Relocation &r2) {
1508	// The isSection() check handles the STT_SECTION case described above.
1509	// In that case the original addend is irrelevant because it referred to an
1510	// offset within the original target section so we overwrite it.
1511	//
1512	// The +4 is here to compensate for r2.addend which will likely be -4,
1513	// but may also be addend-4 in case of a PC32 branch to symbol+addend.
1514	if (r1.sym->isSection())
1515	r1.addend = r2.addend;
1516	else
1517	r1.addend += r2.addend + `4`;
1518	r1.expr = r2.expr;
1519	r1.sym = r2.sym;
1520	}
1521
1522	void X86_64::applyBranchToBranchOpt() const {
1523	applyBranchToBranchOptImpl(ctx, getControlTransferAddend,
1524	getBranchInfoAtTarget,
1525	redirectControlTransferRelocations);
1526	}
1527
1528	// If Intel Indirect Branch Tracking is enabled, we have to emit special PLT
1529	// entries containing endbr64 instructions. A PLT entry will be split into two
1530	// parts, one in .plt.sec (writePlt), and the other in .plt (writeIBTPlt).
1531	namespace {
1532	class IntelIBT : public X86_64 {
1533	public:
1534	IntelIBT(Ctx &ctx) : X86_64 (ctx) { pltHeaderSize = `0`; };
1535	void writeGotPlt(uint8_t buf, const* Symbol &s) const override;
1536	void writePlt(uint8_t buf, const* Symbol &sym,
1537	uint64_t pltEntryAddr) const override;
1538	void writeIBTPlt(uint8_t buf, size_t numEntries) const* override;
1539
1540	static const unsigned IBTPltHeaderSize = `16`;
1541	};
1542	} // namespace
1543
1544	void IntelIBT::writeGotPlt(uint8_t buf, const* Symbol &s) const {
1545	uint64_t va = ctx.in.ibtPlt ->getVA() + IBTPltHeaderSize +
1546	s.getPltIdx(ctx) * pltEntrySize;
1547	write64le(P: buf, V: va);
1548	}
1549
1550	void IntelIBT::writePlt(uint8_t buf, const* Symbol &sym,
1551	uint64_t pltEntryAddr) const {
1552	const uint8_t Inst[] = {
1553	`0xf3`, `0x0f`, `0x1e`, `0xfa`, // endbr64
1554	`0xff`, `0x25`, `0`, `0`, `0`, `0`, // jmpq got(%rip)*
1555	`0x66`, `0x0f`, `0x1f`, `0x44`, `0`, `0`, // nop
1556	};
1557	memcpy(dest: buf, src: Inst, n: sizeof(Inst));
1558	write32le(P: buf + `6`, V: sym.getGotPltVA(ctx) - pltEntryAddr - `10`);
1559	}
1560
1561	void IntelIBT::writeIBTPlt(uint8_t buf, size_t numEntries) const* {
1562	writePltHeader(buf);
1563	buf += IBTPltHeaderSize;
1564
1565	const uint8_t inst[] = {
1566	`0xf3`, `0x0f`, `0x1e`, `0xfa`, // endbr64
1567	`0x68`, `0`, `0`, `0`, `0`, // pushq <relocation index>
1568	`0xe9`, `0`, `0`, `0`, `0`, // jmpq plt[0]
1569	`0x66`, `0x90`, // nop
1570	};
1571
1572	for (size_t i = `0`; i < numEntries; ++i) {
1573	memcpy(dest: buf, src: inst, n: sizeof(inst));
1574	write32le(P: buf + `5`, V: i);
1575	write32le(P: buf + `10`, V: -pltHeaderSize - sizeof(inst) * i - `30`);
1576	buf += sizeof(inst);
1577	}
1578	}
1579
1580	// These nonstandard PLT entries are to migtigate Spectre v2 security
1581	// vulnerability. In order to mitigate Spectre v2, we want to avoid indirect
1582	// branch instructions such as `jmp GOTPLT(%rip)`. So, in the following PLT*
1583	// entries, we use a CALL followed by MOV and RET to do the same thing as an
1584	// indirect jump. That instruction sequence is so-called "retpoline".
1585	//
1586	// We have two types of retpoline PLTs as a size optimization. If `-z now`
1587	// is specified, all dynamic symbols are resolved at load-time. Thus, when
1588	// that option is given, we can omit code for symbol lazy resolution.
1589	namespace {
1590	class Retpoline : public X86_64 {
1591	public:
1592	Retpoline(Ctx &);
1593	void writeGotPlt(uint8_t buf, const* Symbol &s) const override;
1594	void writePltHeader(uint8_t buf) const* override;
1595	void writePlt(uint8_t buf, const* Symbol &sym,
1596	uint64_t pltEntryAddr) const override;
1597	};
1598
1599	class RetpolineZNow : public X86_64 {
1600	public:
1601	RetpolineZNow(Ctx &);
1602	void writeGotPlt(uint8_t buf, const* Symbol &s) const override {}
1603	void writePltHeader(uint8_t buf) const* override;
1604	void writePlt(uint8_t buf, const* Symbol &sym,
1605	uint64_t pltEntryAddr) const override;
1606	};
1607	} // namespace
1608
1609	Retpoline::Retpoline(Ctx &ctx) : X86_64 (ctx) {
1610	pltHeaderSize = `48`;
1611	pltEntrySize = `32`;
1612	ipltEntrySize = `32`;
1613	}
1614
1615	void Retpoline::writeGotPlt(uint8_t buf, const* Symbol &s) const {
1616	write64le(P: buf, V: s.getPltVA(ctx) + `17`);
1617	}
1618
1619	void Retpoline::writePltHeader(uint8_t buf) const* {
1620	const uint8_t insn[] = {
1621	`0xff`, `0x35`, `0`, `0`, `0`, `0`, // 0: pushq GOTPLT+8(%rip)
1622	`0x4c`, `0x8b`, `0x1d`, `0`, `0`, `0`, `0`, // 6: mov GOTPLT+16(%rip), %r11
1623	`0xe8`, `0x0e`, `0x00`, `0x00`, `0x00`, // d: callq next
1624	`0xf3`, `0x90`, // 12: loop: pause
1625	`0x0f`, `0xae`, `0xe8`, // 14: lfence
1626	`0xeb`, `0xf9`, // 17: jmp loop
1627	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 19: int3; .align 16
1628	`0x4c`, `0x89`, `0x1c`, `0x24`, // 20: next: mov %r11, (%rsp)
1629	`0xc3`, // 24: ret
1630	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 25: int3; padding
1631	`0xcc`, `0xcc`, `0xcc`, `0xcc`, // 2c: int3; padding
1632	};
1633	memcpy(dest: buf, src: insn, n: sizeof(insn));
1634
1635	uint64_t gotPlt = ctx.in.gotPlt ->getVA();
1636	uint64_t plt = ctx.in.plt ->getVA();
1637	write32le(P: buf + `2`, V: gotPlt - plt - `6` + `8`);
1638	write32le(P: buf + `9`, V: gotPlt - plt - `13` + `16`);
1639	}
1640
1641	void Retpoline::writePlt(uint8_t buf, const* Symbol &sym,
1642	uint64_t pltEntryAddr) const {
1643	const uint8_t insn[] = {
1644	`0x4c`, `0x8b`, `0x1d`, `0`, `0`, `0`, `0`, // 0: mov foo@GOTPLT(%rip), %r11
1645	`0xe8`, `0`, `0`, `0`, `0`, // 7: callq plt+0x20
1646	`0xe9`, `0`, `0`, `0`, `0`, // c: jmp plt+0x12
1647	`0x68`, `0`, `0`, `0`, `0`, // 11: pushq <relocation index>
1648	`0xe9`, `0`, `0`, `0`, `0`, // 16: jmp plt+0
1649	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 1b: int3; padding
1650	};
1651	memcpy(dest: buf, src: insn, n: sizeof(insn));
1652
1653	uint64_t off = pltEntryAddr - ctx.in.plt ->getVA();
1654
1655	write32le(P: buf + `3`, V: sym.getGotPltVA(ctx) - pltEntryAddr - `7`);
1656	write32le(P: buf + `8`, V: -off - `12` + `32`);
1657	write32le(P: buf + `13`, V: -off - `17` + `18`);
1658	write32le(P: buf + `18`, V: sym.getPltIdx(ctx));
1659	write32le(P: buf + `23`, V: -off - `27`);
1660	}
1661
1662	RetpolineZNow::RetpolineZNow(Ctx &ctx) : X86_64 (ctx) {
1663	pltHeaderSize = `32`;
1664	pltEntrySize = `16`;
1665	ipltEntrySize = `16`;
1666	}
1667
1668	void RetpolineZNow::writePltHeader(uint8_t buf) const* {
1669	const uint8_t insn[] = {
1670	`0xe8`, `0x0b`, `0x00`, `0x00`, `0x00`, // 0: call next
1671	`0xf3`, `0x90`, // 5: loop: pause
1672	`0x0f`, `0xae`, `0xe8`, // 7: lfence
1673	`0xeb`, `0xf9`, // a: jmp loop
1674	`0xcc`, `0xcc`, `0xcc`, `0xcc`, // c: int3; .align 16
1675	`0x4c`, `0x89`, `0x1c`, `0x24`, // 10: next: mov %r11, (%rsp)
1676	`0xc3`, // 14: ret
1677	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 15: int3; padding
1678	`0xcc`, `0xcc`, `0xcc`, `0xcc`, `0xcc`, // 1a: int3; padding
1679	`0xcc`, // 1f: int3; padding
1680	};
1681	memcpy(dest: buf, src: insn, n: sizeof(insn));
1682	}
1683
1684	void RetpolineZNow::writePlt(uint8_t buf, const* Symbol &sym,
1685	uint64_t pltEntryAddr) const {
1686	const uint8_t insn[] = {
1687	`0x4c`, `0x8b`, `0x1d`, `0`, `0`, `0`, `0`, // mov foo@GOTPLT(%rip), %r11
1688	`0xe9`, `0`, `0`, `0`, `0`, // jmp plt+0
1689	`0xcc`, `0xcc`, `0xcc`, `0xcc`, // int3; padding
1690	};
1691	memcpy(dest: buf, src: insn, n: sizeof(insn));
1692
1693	write32le(P: buf + `3`, V: sym.getGotPltVA(ctx) - pltEntryAddr - `7`);
1694	write32le(P: buf + `8`, V: ctx.in.plt ->getVA() - pltEntryAddr - `12`);
1695	}
1696
1697	void elf::setX86_64TargetInfo(Ctx &ctx) {
1698	if (ctx.arg.zRetpolineplt) {
1699	if (ctx.arg.zNow)
1700	ctx.target.reset(p: new RetpolineZNow (ctx));
1701	else
1702	ctx.target.reset(p: new Retpoline (ctx));
1703	return;
1704	}
1705
1706	if (ctx.arg.andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT)
1707	ctx.target.reset(p: new IntelIBT (ctx));
1708	else
1709	ctx.target.reset(p: new X86_64 (ctx));
1710	}
1711

Browse the source code of llvm_projects/lld/ELF/Arch/X86_64.cpp