1//===- X86_64.cpp ---------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "OutputSections.h"
10#include "RelocScan.h"
11#include "Relocations.h"
12#include "Symbols.h"
13#include "SyntheticSections.h"
14#include "Target.h"
15#include "TargetImpl.h"
16#include "llvm/BinaryFormat/ELF.h"
17#include "llvm/Support/Endian.h"
18#include "llvm/Support/MathExtras.h"
19
20using namespace llvm;
21using namespace llvm::object;
22using namespace llvm::support::endian;
23using namespace llvm::ELF;
24using namespace lld;
25using namespace lld::elf;
26
namespace {
// Target hooks for x86-64 (and x32/ILP32). Each override implements the
// TargetInfo contract for this architecture; see Target.h for the generic
// documentation of every hook.
class X86_64 : public TargetInfo {
public:
  X86_64(Ctx &);
  RelExpr getRelExpr(RelType type, const Symbol &s,
                     const uint8_t *loc) const override;
  RelType getDynRel(RelType type) const override;
  void writeGotPltHeader(uint8_t *buf) const override;
  void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
  void writeIgotPlt(uint8_t *buf, const Symbol &s) const override;
  void writePltHeader(uint8_t *buf) const override;
  void writePlt(uint8_t *buf, const Symbol &sym,
                uint64_t pltEntryAddr) const override;
  void relocate(uint8_t *loc, const Relocation &rel,
                uint64_t val) const override;
  int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override;
  void applyJumpInstrMod(uint8_t *loc, JumpModType type,
                         unsigned size) const override;
  RelExpr adjustGotPcExpr(RelType type, int64_t addend,
                          const uint8_t *loc) const override;
  void relocateAlloc(InputSection &sec, uint8_t *buf) const override;
  bool adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
                                        uint8_t stOther) const override;
  // Basic-block-sections optimization: remove a jmp that targets the start of
  // the next section (see deleteFallThruJmpInsn below).
  bool deleteFallThruJmpInsn(InputSection &is,
                             InputSection *nextIS) const override;
  bool relaxOnce(int pass) const override;
  void applyBranchToBranchOpt() const override;
  template <class ELFT, class RelTy>
  void scanSectionImpl(InputSectionBase &sec, Relocs<RelTy> rels);
  void scanSection(InputSectionBase &sec) override;

private:
  // TLS code-sequence rewriters used by relocate() when a TLS access model is
  // relaxed to a cheaper one (GD->LE, GD->IE, LD->LE, IE->LE).
  void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
  void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
  void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
  void relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
};
} // namespace
65
// This is a vector of NOP instructions of sizes from 1 to 9 bytes. The
// appropriately sized instructions are used to fill the gaps between sections
// which are executed during fall through. Entry [i] is an (i+1)-byte NOP;
// the longer forms use the multi-byte "0f 1f" NOP encodings.
static const std::vector<std::vector<uint8_t>> nopInstructions = {
    {0x90},
    {0x66, 0x90},
    {0x0f, 0x1f, 0x00},
    {0x0f, 0x1f, 0x40, 0x00},
    {0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
    {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}};
79
X86_64::X86_64(Ctx &ctx) : TargetInfo(ctx) {
  // Dynamic relocation types consumed by the generic relocation machinery.
  copyRel = R_X86_64_COPY;
  gotRel = R_X86_64_GLOB_DAT;
  pltRel = R_X86_64_JUMP_SLOT;
  relativeRel = R_X86_64_RELATIVE;
  iRelativeRel = R_X86_64_IRELATIVE;
  // LP64 uses the 64-bit word relocation; ILP32 (x32) uses the 32-bit one.
  symbolicRel = ctx.arg.is64 ? R_X86_64_64 : R_X86_64_32;
  tlsDescRel = R_X86_64_TLSDESC;
  tlsGotRel = R_X86_64_TPOFF64;
  tlsModuleIndexRel = R_X86_64_DTPMOD64;
  tlsOffsetRel = R_X86_64_DTPOFF64;
  gotBaseSymInGotPlt = true;
  gotEntrySize = 8;
  // PLT geometry: 16-byte header and 16-byte entries (see writePltHeader and
  // writePlt for the exact instruction sequences).
  pltHeaderSize = 16;
  pltEntrySize = 16;
  ipltEntrySize = 16;
  trapInstr = {0xcc, 0xcc, 0xcc, 0xcc}; // 0xcc = INT3
  nopInstrs = nopInstructions;

  // Align to the large page size (known as a superpage or huge page).
  // FreeBSD automatically promotes large, superpage-aligned allocations.
  defaultImageBase = 0x200000;
}
103
// Opcodes for the different X86_64 jmp instructions. J_JMP_32 is the
// unconditional "jmp rel32"; the rest are the conditional "jcc rel32" forms.
// J_UNKNOWN marks an instruction we cannot classify.
enum JmpInsnOpcode : uint32_t {
  J_JMP_32,
  J_JNE_32,
  J_JE_32,
  J_JG_32,
  J_JGE_32,
  J_JB_32,
  J_JBE_32,
  J_JL_32,
  J_JLE_32,
  J_JA_32,
  J_JAE_32,
  J_UNKNOWN,
};
119
120// Given the first (optional) and second byte of the insn's opcode, this
121// returns the corresponding enum value.
122static JmpInsnOpcode getJmpInsnType(const uint8_t *first,
123 const uint8_t *second) {
124 if (*second == 0xe9)
125 return J_JMP_32;
126
127 if (first == nullptr)
128 return J_UNKNOWN;
129
130 if (*first == 0x0f) {
131 switch (*second) {
132 case 0x84:
133 return J_JE_32;
134 case 0x85:
135 return J_JNE_32;
136 case 0x8f:
137 return J_JG_32;
138 case 0x8d:
139 return J_JGE_32;
140 case 0x82:
141 return J_JB_32;
142 case 0x86:
143 return J_JBE_32;
144 case 0x8c:
145 return J_JL_32;
146 case 0x8e:
147 return J_JLE_32;
148 case 0x87:
149 return J_JA_32;
150 case 0x83:
151 return J_JAE_32;
152 }
153 }
154 return J_UNKNOWN;
155}
156
157// Return the relocation index for input section IS with a specific Offset.
158// Returns the maximum size of the vector if no such relocation is found.
159static unsigned getRelocationWithOffset(const InputSection &is,
160 uint64_t offset) {
161 unsigned size = is.relocs().size();
162 for (unsigned i = size - 1; i + 1 > 0; --i) {
163 if (is.relocs()[i].offset == offset && is.relocs()[i].expr != R_NONE)
164 return i;
165 }
166 return size;
167}
168
169// Returns true if R corresponds to a relocation used for a jump instruction.
170// TODO: Once special relocations for relaxable jump instructions are available,
171// this should be modified to use those relocations.
172static bool isRelocationForJmpInsn(Relocation &R) {
173 return R.type == R_X86_64_PLT32 || R.type == R_X86_64_PC32 ||
174 R.type == R_X86_64_PC8;
175}
176
177// Return true if Relocation R points to the first instruction in the
178// next section.
179// TODO: Delete this once psABI reserves a new relocation type for fall thru
180// jumps.
181static bool isFallThruRelocation(InputSection &is, InputSection *nextIS,
182 Relocation &r) {
183 if (!isRelocationForJmpInsn(R&: r))
184 return false;
185
186 uint64_t addrLoc = is.getOutputSection()->addr + is.outSecOff + r.offset;
187 uint64_t targetOffset = is.getRelocTargetVA(is.getCtx(), r, p: addrLoc);
188
189 // If this jmp is a fall thru, the target offset is the beginning of the
190 // next section.
191 uint64_t nextSectionOffset =
192 nextIS->getOutputSection()->addr + nextIS->outSecOff;
193 return (addrLoc + 4 + targetOffset) == nextSectionOffset;
194}
195
196// Return the jmp instruction opcode that is the inverse of the given
197// opcode. For example, JE inverted is JNE.
198static JmpInsnOpcode invertJmpOpcode(const JmpInsnOpcode opcode) {
199 switch (opcode) {
200 case J_JE_32:
201 return J_JNE_32;
202 case J_JNE_32:
203 return J_JE_32;
204 case J_JG_32:
205 return J_JLE_32;
206 case J_JGE_32:
207 return J_JL_32;
208 case J_JB_32:
209 return J_JAE_32;
210 case J_JBE_32:
211 return J_JA_32;
212 case J_JL_32:
213 return J_JGE_32;
214 case J_JLE_32:
215 return J_JG_32;
216 case J_JA_32:
217 return J_JBE_32;
218 case J_JAE_32:
219 return J_JB_32;
220 default:
221 return J_UNKNOWN;
222 }
223}
224
// Deletes direct jump instruction in input sections that jumps to the
// following section as it is not required. If there are two consecutive jump
// instructions, it checks if they can be flipped and one can be deleted.
// For example:
// .section .text
// a.BB.foo:
//    ...
//    10: jne aa.BB.foo
//    16: jmp bar
// aa.BB.foo:
//    ...
//
// can be converted to:
// a.BB.foo:
//   ...
//   10: je bar  #jne flipped to je and the jmp is deleted.
// aa.BB.foo:
//   ...
bool X86_64::deleteFallThruJmpInsn(InputSection &is,
                                   InputSection *nextIS) const {
  // "jmp rel32" is e9 + 4-byte displacement.
  const unsigned sizeOfDirectJmpInsn = 5;

  // No next section means the jump cannot be a fall-through.
  if (nextIS == nullptr)
    return false;

  if (is.getSize() < sizeOfDirectJmpInsn)
    return false;

  // If this jmp insn can be removed, it is the last insn and the
  // relocation is 4 bytes before the end.
  unsigned rIndex = getRelocationWithOffset(is, offset: is.getSize() - 4);
  if (rIndex == is.relocs().size())
    return false;

  Relocation &r = is.relocs()[rIndex];

  // Check if the relocation corresponds to a direct jmp.
  const uint8_t *secContents = is.content().data();
  // If it is not a direct jmp instruction, there is nothing to do here.
  // (The opcode byte immediately precedes the relocated displacement.)
  if (*(secContents + r.offset - 1) != 0xe9)
    return false;

  if (isFallThruRelocation(is, nextIS, r)) {
    // This is a fall thru and can be deleted. Neutralize the relocation and
    // shrink the section; the dropped tail is filled with NOPs when written.
    r.expr = R_NONE;
    r.offset = 0;
    is.drop_back(num: sizeOfDirectJmpInsn);
    is.nopFiller = true;
    return true;
  }

  // Now, check if flip and delete is possible.
  const unsigned sizeOfJmpCCInsn = 6; // jcc rel32: 0f 8x + 4-byte disp.
  // To flip, there must be at least one JmpCC and one direct jmp.
  if (is.getSize() < sizeOfDirectJmpInsn + sizeOfJmpCCInsn)
    return false;

  unsigned rbIndex =
      getRelocationWithOffset(is, offset: (is.getSize() - sizeOfDirectJmpInsn - 4));
  if (rbIndex == is.relocs().size())
    return false;

  Relocation &rB = is.relocs()[rbIndex];

  const uint8_t *jmpInsnB = secContents + rB.offset - 1;
  JmpInsnOpcode jmpOpcodeB = getJmpInsnType(first: jmpInsnB - 1, second: jmpInsnB);
  if (jmpOpcodeB == J_UNKNOWN)
    return false;

  // Only flip when the conditional jump targets the fall-through block.
  if (!isFallThruRelocation(is, nextIS, r&: rB))
    return false;

  // jmpCC jumps to the fall thru block, the branch can be flipped and the
  // jmp can be deleted.
  JmpInsnOpcode jInvert = invertJmpOpcode(opcode: jmpOpcodeB);
  if (jInvert == J_UNKNOWN)
    return false;
  // Record the opcode rewrite; applyJumpInstrMod performs it at write time.
  is.jumpInstrMod = make<JumpInstrMod>();
  *is.jumpInstrMod = {.offset: rB.offset - 1, .original: jInvert, .size: 4};
  // Move R's values to rB except the offset.
  rB = {.expr: r.expr, .type: r.type, .offset: rB.offset, .addend: r.addend, .sym: r.sym};
  // Cancel R
  r.expr = R_NONE;
  r.offset = 0;
  is.drop_back(num: sizeOfDirectJmpInsn);
  is.nopFiller = true;
  return true;
}
313
// If a GOTPCRELX relaxation would produce a displacement that does not fit in
// 32 bits, undo the relaxation for that relocation by materializing a real GOT
// entry and reverting to R_GOT_PC. Returns true if another pass is needed
// (address assignment changed because new GOT entries were added).
bool X86_64::relaxOnce(int pass) const {
  uint64_t minVA = UINT64_MAX, maxVA = 0;
  for (OutputSection *osec : ctx.outputSections) {
    if (!(osec->flags & SHF_ALLOC))
      continue;
    minVA = std::min(a: minVA, b: osec->addr);
    maxVA = std::max(a: maxVA, b: osec->addr + osec->size);
  }
  // If the max VA is under 2^31, GOTPCRELX relocations cannot overflow. In
  // -pie/-shared, the condition can be relaxed to test the max VA difference as
  // there is no R_RELAX_GOT_PC_NOPIC.
  if (isUInt<31>(x: maxVA) || (isUInt<31>(x: maxVA - minVA) && ctx.arg.isPic))
    return false;

  SmallVector<InputSection *, 0> storage;
  bool changed = false;
  for (OutputSection *osec : ctx.outputSections) {
    if (!(osec->flags & SHF_EXECINSTR))
      continue;
    for (InputSection *sec : getInputSections(os: *osec, storage)) {
      for (Relocation &rel : sec->relocs()) {
        if (rel.expr != R_RELAX_GOT_PC && rel.expr != R_RELAX_GOT_PC_NOPIC)
          continue;
        assert(rel.addend == -4);

        // Compute the value the relaxed sequence would need to encode.
        Relocation rel1 = rel;
        rel1.addend = rel.expr == R_RELAX_GOT_PC_NOPIC ? 0 : -4;
        uint64_t v = sec->getRelocTargetVA(ctx, r: rel1,
                                           p: sec->getOutputSection()->addr +
                                               sec->outSecOff + rel.offset);
        if (isInt<32>(x: v))
          continue;
        // Out of range: give the symbol a GOT entry and keep the GOT load.
        if (rel.sym->auxIdx == 0) {
          rel.sym->allocateAux(ctx);
          addGotEntry(ctx, sym&: *rel.sym);
          changed = true;
        }
        rel.expr = R_GOT_PC;
      }
    }
  }
  return changed;
}
357
// Only needed to support relocations used by relocateNonAlloc and relocateEh.
// Full relocation scanning for alloc sections goes through scanSectionImpl,
// which classifies types itself.
RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
                           const uint8_t *loc) const {
  switch (type) {
  case R_X86_64_8:
  case R_X86_64_16:
  case R_X86_64_32:
  case R_X86_64_32S:
  case R_X86_64_64:
    return R_ABS;
  case R_X86_64_SIZE32:
  case R_X86_64_SIZE64:
    return R_SIZE;
  case R_X86_64_DTPOFF32:
  case R_X86_64_DTPOFF64:
    return R_DTPREL;
  case R_X86_64_PC8:
  case R_X86_64_PC16:
  case R_X86_64_PC32:
  case R_X86_64_PC64:
    return R_PC;
  case R_X86_64_GOTOFF64:
    return R_GOTPLTREL;
  case R_X86_64_GOTPC32:
  case R_X86_64_GOTPC64:
    return R_GOTPLTONLY_PC;
  case R_X86_64_NONE:
    return R_NONE;
  default:
    Err(ctx) << getErrorLoc(ctx, loc) << "unknown relocation (" << type.v
             << ") against symbol " << &s;
    return R_NONE;
  }
}
392
// Write the reserved first .got.plt entry.
void X86_64::writeGotPltHeader(uint8_t *buf) const {
  // The first entry holds the link-time address of _DYNAMIC. It is documented
  // in the psABI and glibc before Aug 2021 used the entry to compute run-time
  // load address of the shared object (note that this is relevant for linking
  // ld.so, not any other program).
  write64le(P: buf, V: ctx.mainPart->dynamic->getVA());
}
400
void X86_64::writeGotPlt(uint8_t *buf, const Symbol &s) const {
  // See comments in X86::writeGotPlt. The +6 skips the 6-byte
  // "jmpq *got(%rip)" at the start of the PLT entry (see writePlt), so the
  // first call lands on the pushq that begins lazy resolution.
  write64le(P: buf, V: s.getPltVA(ctx) + 6);
}
405
void X86_64::writeIgotPlt(uint8_t *buf, const Symbol &s) const {
  // An x86 entry is the address of the ifunc resolver function (for -z rel).
  // With explicit addends (RELA) the dynamic relocation carries the value, so
  // nothing needs to be pre-written here.
  if (ctx.arg.writeAddends)
    write64le(P: buf, V: s.getVA(ctx));
}
411
// Write the 16-byte lazy-binding PLT header. It pushes the second .got.plt
// word (link map) and jumps through the third (resolver), both filled in by
// the dynamic loader.
void X86_64::writePltHeader(uint8_t *buf) const {
  const uint8_t pltData[] = {
      0xff, 0x35, 0, 0, 0, 0, // pushq GOTPLT+8(%rip)
      0xff, 0x25, 0, 0, 0, 0, // jmp *GOTPLT+16(%rip)
      0x0f, 0x1f, 0x40, 0x00, // nop
  };
  memcpy(dest: buf, src: pltData, n: sizeof(pltData));
  uint64_t gotPlt = ctx.in.gotPlt->getVA();
  uint64_t plt = ctx.in.ibtPlt ? ctx.in.ibtPlt->getVA() : ctx.in.plt->getVA();
  // Displacements are relative to the end of each 6-byte instruction, hence
  // (gotPlt + 8) - (plt + 6) and (gotPlt + 16) - (plt + 12).
  write32le(P: buf + 2, V: gotPlt - plt + 2); // GOTPLT+8
  write32le(P: buf + 8, V: gotPlt - plt + 4); // GOTPLT+16
}
424
// Write one 16-byte lazy PLT entry: an indirect jump through the symbol's
// .got.plt slot, followed by the lazy-resolution stub (push index, jump to
// the PLT header).
void X86_64::writePlt(uint8_t *buf, const Symbol &sym,
                      uint64_t pltEntryAddr) const {
  const uint8_t inst[] = {
      0xff, 0x25, 0, 0, 0, 0, // jmpq *got(%rip)
      0x68, 0, 0, 0, 0,       // pushq <relocation index>
      0xe9, 0, 0, 0, 0,       // jmpq plt[0]
  };
  memcpy(dest: buf, src: inst, n: sizeof(inst));

  // Each displacement is relative to the end of its own instruction
  // (pltEntryAddr + 6 and pltEntryAddr + 16 respectively).
  write32le(P: buf + 2, V: sym.getGotPltVA(ctx) - pltEntryAddr - 6);
  write32le(P: buf + 7, V: sym.getPltIdx(ctx));
  write32le(P: buf + 12, V: ctx.in.plt->getVA() - pltEntryAddr - 16);
}
438
439RelType X86_64::getDynRel(RelType type) const {
440 if (type == symbolicRel || type == R_X86_64_SIZE32 || type == R_X86_64_SIZE64)
441 return type;
442 return R_X86_64_NONE;
443}
444
// Scan the relocations of `sec`, classifying each x86-64 relocation type and
// feeding it to RelocScan. Types that only need a RelExpr fall through to
// rs.process(); types with dedicated handling (PC/PLT fast paths, TLS
// sequences) call a handler and `continue`.
template <class ELFT, class RelTy>
void X86_64::scanSectionImpl(InputSectionBase &sec, Relocs<RelTy> rels) {
  RelocScan rs(ctx, &sec);
  sec.relocations.reserve(N: rels.size());

  for (auto it = rels.begin(); it != rels.end(); ++it) {
    const RelTy &rel = *it;
    uint32_t symIdx = rel.getSymbol(false);
    Symbol &sym = sec.getFile<ELFT>()->getSymbol(symIdx);
    uint64_t offset = rel.r_offset;
    RelType type = rel.getType(false);
    if (sym.isUndefined() && symIdx != 0 &&
        rs.maybeReportUndefined(sym&: cast<Undefined>(Val&: sym), offset))
      continue;
    int64_t addend = rs.getAddend<ELFT>(rel, type);
    RelExpr expr;
    // Relocation types that only need a RelExpr set `expr` and break out of
    // the switch to reach rs.process(). Types that need special handling
    // (fast-path helpers, TLS) call a handler and use `continue`.
    switch (type) {
    case R_X86_64_NONE:
      continue;

    // Absolute relocations:
    case R_X86_64_8:
    case R_X86_64_16:
    case R_X86_64_32:
    case R_X86_64_32S:
    case R_X86_64_64:
      expr = R_ABS;
      break;

    // PC-relative relocations:
    case R_X86_64_PC8:
    case R_X86_64_PC16:
    case R_X86_64_PC32:
    case R_X86_64_PC64:
      rs.processR_PC(type, offset, addend, sym);
      continue;

    // GOT-generating relocations:
    case R_X86_64_GOTPC32:
    case R_X86_64_GOTPC64:
      ctx.in.gotPlt->hasGotPltOffRel.store(i: true, m: std::memory_order_relaxed);
      expr = R_GOTPLTONLY_PC;
      break;
    case R_X86_64_GOTOFF64:
      ctx.in.gotPlt->hasGotPltOffRel.store(i: true, m: std::memory_order_relaxed);
      expr = R_GOTPLTREL;
      break;
    case R_X86_64_GOT32:
    case R_X86_64_GOT64:
      ctx.in.gotPlt->hasGotPltOffRel.store(i: true, m: std::memory_order_relaxed);
      expr = R_GOTPLT;
      break;
    case R_X86_64_PLTOFF64:
      ctx.in.gotPlt->hasGotPltOffRel.store(i: true, m: std::memory_order_relaxed);
      expr = R_PLT_GOTPLT;
      break;
    case R_X86_64_GOTPCREL:
    case R_X86_64_GOTPCRELX:
    case R_X86_64_REX_GOTPCRELX:
    case R_X86_64_CODE_4_GOTPCRELX:
      expr = R_GOT_PC;
      break;

    // PLT-generating relocation:
    case R_X86_64_PLT32:
      rs.processR_PLT_PC(type, offset, addend, sym);
      continue;

    // TLS relocations:
    case R_X86_64_TPOFF32:
    case R_X86_64_TPOFF64:
      if (rs.checkTlsLe(offset, sym, type))
        continue;
      expr = R_TPREL;
      break;
    case R_X86_64_GOTTPOFF:
    case R_X86_64_CODE_4_GOTTPOFF:
    case R_X86_64_CODE_6_GOTTPOFF:
      rs.handleTlsIe(ieExpr: R_GOT_PC, type, offset, addend, sym);
      continue;
    case R_X86_64_TLSGD:
      // The handler returns true when the paired relocation on the following
      // __tls_get_addr call must be skipped as well.
      if (rs.handleTlsGd(sharedExpr: R_TLSGD_PC, ieExpr: R_GOT_PC, leExpr: R_TPREL, type, offset, addend,
                         sym))
        ++it;
      continue;
    case R_X86_64_TLSLD:
      if (rs.handleTlsLd(sharedExpr: R_TLSLD_PC, type, offset, addend, sym))
        ++it;
      continue;
    case R_X86_64_DTPOFF32:
    case R_X86_64_DTPOFF64:
      sec.addReloc(
          r: {.expr: ctx.arg.shared ? R_DTPREL : R_TPREL, .type: type, .offset: offset, .addend: addend, .sym: &sym});
      continue;
    case R_X86_64_TLSDESC_CALL:
      // For executables, TLSDESC is optimized to IE or LE. Use R_TPREL as the
      // rewrites for this relocation are identical.
      if (!ctx.arg.shared)
        sec.addReloc(r: {.expr: R_TPREL, .type: type, .offset: offset, .addend: addend, .sym: &sym});
      continue;
    case R_X86_64_GOTPC32_TLSDESC:
    case R_X86_64_CODE_4_GOTPC32_TLSDESC:
      rs.handleTlsDesc(sharedExpr: R_TLSDESC_PC, ieExpr: R_GOT_PC, type, offset, addend, sym);
      continue;

    // Misc relocations:
    case R_X86_64_SIZE32:
    case R_X86_64_SIZE64:
      expr = R_SIZE;
      break;

    default:
      Err(ctx) << getErrorLoc(ctx, loc: sec.content().data() + offset)
               << "unknown relocation (" << type.v << ") against symbol "
               << &sym;
      continue;
    }
    rs.process(expr, type, offset, sym, addend);
  }

  // Branch-to-branch optimization relies on relocations being sorted by
  // offset; the TLS handlers above may have appended out of order.
  if (ctx.arg.branchToBranch)
    llvm::stable_sort(sec.relocs(),
                      [](auto &l, auto &r) { return l.offset < r.offset; });
}
572
573void X86_64::scanSection(InputSectionBase &sec) {
574 if (ctx.arg.is64)
575 elf::scanSection1<X86_64, ELF64LE>(target&: *this, sec);
576 else // ilp32
577 elf::scanSection1<X86_64, ELF32LE>(target&: *this, sec);
578}
579
// Rewrite a general-dynamic TLS code sequence into the local-exec form.
void X86_64::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel,
                            uint64_t val) const {
  if (rel.type == R_X86_64_TLSGD) {
    // Convert
    //   .byte 0x66
    //   leaq x@tlsgd(%rip), %rdi
    //   .word 0x6666
    //   rex64
    //   call __tls_get_addr@plt
    // to the following two instructions.
    const uint8_t inst[] = {
        0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00,
        0x00, 0x00,                   // mov %fs:0x0,%rax
        0x48, 0x8d, 0x80, 0, 0, 0, 0, // lea x@tpoff,%rax
    };
    memcpy(dest: loc - 4, src: inst, n: sizeof(inst));

    // The original code used a pc relative relocation and so we have to
    // compensate for the -4 it had in the addend.
    write32le(P: loc + 8, V: val + 4);
  } else if (rel.type == R_X86_64_GOTPC32_TLSDESC ||
             rel.type == R_X86_64_CODE_4_GOTPC32_TLSDESC) {
    // Convert leaq x@tlsdesc(%rip), %REG to movq $x@tpoff, %REG.
    // The checks validate REX (allowing the REX.R variant), the 0x8d (lea)
    // opcode, and a RIP-relative ModRM byte.
    if ((loc[-3] & 0xfb) != 0x48 || loc[-2] != 0x8d ||
        (loc[-1] & 0xc7) != 0x05) {
      Err(ctx) << getErrorLoc(ctx, loc: (rel.type == R_X86_64_GOTPC32_TLSDESC)
                                          ? loc - 3
                                          : loc - 4)
               << "R_X86_64_GOTPC32_TLSDESC/R_X86_64_CODE_4_GOTPC32_TLSDESC "
                  "must be used in leaq x@tlsdesc(%rip), %REG";
      return;
    }
    if (rel.type == R_X86_64_GOTPC32_TLSDESC) {
      loc[-3] = 0x48 | ((loc[-3] >> 2) & 1);
    } else {
      // REX2 prefix: move the R bits into the B positions.
      loc[-3] = (loc[-3] & ~0x44) | ((loc[-3] & 0x44) >> 2);
    }
    loc[-2] = 0xc7; // mov imm32, r/m64
    loc[-1] = 0xc0 | ((loc[-1] >> 3) & 7);

    write32le(P: loc, V: val + 4);
  } else {
    // Convert call *x@tlsdesc(%REG) to xchg ax, ax.
    assert(rel.type == R_X86_64_TLSDESC_CALL);
    loc[0] = 0x66;
    loc[1] = 0x90;
  }
}
628
// Rewrite a general-dynamic TLS code sequence into the initial-exec form.
// R_X86_64_TLSDESC_CALL needs no rewrite here and falls through untouched.
void X86_64::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel,
                            uint64_t val) const {
  if (rel.type == R_X86_64_TLSGD) {
    // Convert
    //   .byte 0x66
    //   leaq x@tlsgd(%rip), %rdi
    //   .word 0x6666
    //   rex64
    //   call __tls_get_addr@plt
    // to the following two instructions.
    const uint8_t inst[] = {
        0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00,
        0x00, 0x00,                   // mov %fs:0x0,%rax
        0x48, 0x03, 0x05, 0, 0, 0, 0, // addq x@gottpoff(%rip),%rax
    };
    memcpy(dest: loc - 4, src: inst, n: sizeof(inst));

    // Both code sequences are PC relatives, but since we are moving the
    // constant forward by 8 bytes we have to subtract the value by 8.
    write32le(P: loc + 8, V: val - 8);
  } else if (rel.type == R_X86_64_GOTPC32_TLSDESC ||
             rel.type == R_X86_64_CODE_4_GOTPC32_TLSDESC) {
    // Convert leaq x@tlsdesc(%rip), %REG to movq x@gottpoff(%rip), %REG.
    if ((loc[-3] & 0xfb) != 0x48 || loc[-2] != 0x8d ||
        (loc[-1] & 0xc7) != 0x05) {
      Err(ctx) << getErrorLoc(ctx, loc: (rel.type == R_X86_64_GOTPC32_TLSDESC)
                                          ? loc - 3
                                          : loc - 4)
               << "R_X86_64_GOTPC32_TLSDESC/R_X86_64_CODE_4_GOTPC32_TLSDESC "
                  "must be used in leaq x@tlsdesc(%rip), %REG";
      return;
    }
    // Only the opcode changes (lea -> mov); the RIP-relative operand stays.
    loc[-2] = 0x8b;
    write32le(P: loc, V: val);
  }
}
665
// In some conditions,
// R_X86_64_GOTTPOFF/R_X86_64_CODE_4_GOTTPOFF/R_X86_64_CODE_6_GOTTPOFF
// relocation can be optimized to R_X86_64_TPOFF32 so that it does not use GOT.
void X86_64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel,
                            uint64_t val) const {
  uint8_t *inst = loc - 3;
  uint8_t reg = loc[-1] >> 3; // reg field of the ModRM byte.
  uint8_t *regSlot = loc - 1;

  if (rel.type == R_X86_64_GOTTPOFF) {
    // Note that ADD with RSP or R12 is converted to ADD instead of LEA
    // because LEA with these registers needs 4 bytes to encode and thus
    // wouldn't fit the space.

    if (memcmp(s1: inst, s2: "\x48\x03\x25", n: 3) == 0) {
      // "addq foo@gottpoff(%rip),%rsp" -> "addq $foo,%rsp"
      memcpy(dest: inst, src: "\x48\x81\xc4", n: 3);
    } else if (memcmp(s1: inst, s2: "\x4c\x03\x25", n: 3) == 0) {
      // "addq foo@gottpoff(%rip),%r12" -> "addq $foo,%r12"
      memcpy(dest: inst, src: "\x49\x81\xc4", n: 3);
    } else if (memcmp(s1: inst, s2: "\x4c\x03", n: 2) == 0) {
      // "addq foo@gottpoff(%rip),%r[8-15]" -> "leaq foo(%r[8-15]),%r[8-15]"
      memcpy(dest: inst, src: "\x4d\x8d", n: 2);
      *regSlot = 0x80 | (reg << 3) | reg;
    } else if (memcmp(s1: inst, s2: "\x48\x03", n: 2) == 0) {
      // "addq foo@gottpoff(%rip),%reg -> "leaq foo(%reg),%reg"
      memcpy(dest: inst, src: "\x48\x8d", n: 2);
      *regSlot = 0x80 | (reg << 3) | reg;
    } else if (memcmp(s1: inst, s2: "\x4c\x8b", n: 2) == 0) {
      // "movq foo@gottpoff(%rip),%r[8-15]" -> "movq $foo,%r[8-15]"
      memcpy(dest: inst, src: "\x49\xc7", n: 2);
      *regSlot = 0xc0 | reg;
    } else if (memcmp(s1: inst, s2: "\x48\x8b", n: 2) == 0) {
      // "movq foo@gottpoff(%rip),%reg" -> "movq $foo,%reg"
      memcpy(dest: inst, src: "\x48\xc7", n: 2);
      *regSlot = 0xc0 | reg;
    } else {
      Err(ctx)
          << getErrorLoc(ctx, loc: loc - 3)
          << "R_X86_64_GOTTPOFF must be used in MOVQ or ADDQ instructions only";
    }
  } else if (rel.type == R_X86_64_CODE_4_GOTTPOFF) {
    // REX2-prefixed (APX r16-r31) variants.
    if (loc[-4] != 0xd5) {
      Err(ctx) << getErrorLoc(ctx, loc: loc - 4)
               << "invalid prefix with R_X86_64_CODE_4_GOTTPOFF!";
      return;
    }
    // Move the R bits of the REX2 payload into the B positions, since the
    // register moves from the memory operand to the r/m field.
    const uint8_t rex = loc[-3];
    loc[-3] = (rex & ~0x44) | (rex & 0x44) >> 2;
    *regSlot = 0xc0 | reg;

    if (loc[-2] == 0x8b) {
      // "movq foo@gottpoff(%rip),%r[16-31]" -> "movq $foo,%r[16-31]"
      loc[-2] = 0xc7;
    } else if (loc[-2] == 0x03) {
      // "addq foo@gottpoff(%rip),%r[16-31]" -> "addq $foo,%r[16-31]"
      loc[-2] = 0x81;
    } else {
      Err(ctx) << getErrorLoc(ctx, loc: loc - 4)
               << "R_X86_64_CODE_4_GOTTPOFF must be used in MOVQ or ADDQ "
                  "instructions only";
    }
  } else if (rel.type == R_X86_64_CODE_6_GOTTPOFF) {
    // EVEX-prefixed (NDD/NF) variants.
    if (loc[-6] != 0x62) {
      Err(ctx) << getErrorLoc(ctx, loc: loc - 6)
               << "invalid prefix with R_X86_64_CODE_6_GOTTPOFF!";
      return;
    }
    // Check bits are satisfied:
    //   loc[-5]: X==1 (inverted polarity), (loc[-5] & 0x7) == 0x4
    //   loc[-4]: W==1, X2==1 (inverted polarity), pp==0b00(NP)
    //   loc[-3]: NF==1 or ND==1
    //   loc[-2]: opcode==0x1 or opcode==0x3
    //   loc[-1]: Mod==0b00, RM==0b101
    if (((loc[-5] & 0x47) == 0x44) && ((loc[-4] & 0x87) == 0x84) &&
        ((loc[-3] & 0x14) != 0) && (loc[-2] == 0x1 || loc[-2] == 0x3) &&
        ((loc[-1] & 0xc7) == 0x5)) {
      // "addq %reg1, foo@GOTTPOFF(%rip), %reg2" -> "addq $foo, %reg1, %reg2"
      // "addq foo@GOTTPOFF(%rip), %reg1, %reg2" -> "addq $foo, %reg1, %reg2"
      // "{nf} addq %reg1, foo@GOTTPOFF(%rip), %reg2"
      //   -> "{nf} addq $foo, %reg1, %reg2"
      // "{nf} addq name@GOTTPOFF(%rip), %reg1, %reg2"
      //   -> "{nf} addq $foo, %reg1, %reg2"
      // "{nf} addq name@GOTTPOFF(%rip), %reg" -> "{nf} addq $foo, %reg"
      loc[-2] = 0x81;
      // Move R bits to B bits in EVEX payloads and ModRM byte.
      // NOTE(review): both writes use the stale evexPayload0 snapshot — if a
      // register ever needed both the bit-7 and bit-4 rewrites, the second
      // assignment would discard the first. Verify such encodings cannot
      // reach this point.
      const uint8_t evexPayload0 = loc[-5];
      if ((evexPayload0 & (1 << 7)) == 0)
        loc[-5] = (evexPayload0 | (1 << 7)) & ~(1 << 5);
      if ((evexPayload0 & (1 << 4)) == 0)
        loc[-5] = evexPayload0 | (1 << 4) | (1 << 3);
      *regSlot = 0xc0 | reg;
    } else {
      Err(ctx) << getErrorLoc(ctx, loc: loc - 6)
               << "R_X86_64_CODE_6_GOTTPOFF must be used in ADDQ instructions "
                  "with NDD/NF/NDD+NF only";
    }
  } else {
    llvm_unreachable("Unsupported relocation type!");
  }

  // The original code used a PC relative relocation.
  // Need to compensate for the -4 it had in the addend.
  write32le(P: loc, V: val + 4);
}
771
// Rewrite a local-dynamic TLS code sequence into the local-exec form. The
// leaq/call pair is replaced by a padded "mov %fs:0,%rax".
void X86_64::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel,
                            uint64_t val) const {
  const uint8_t inst[] = {
      0x66, 0x66,                                           // .word 0x6666
      0x66,                                                 // .byte 0x66
      0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:0,%rax
  };

  if (loc[4] == 0xe8) {
    // Convert
    //   leaq bar@tlsld(%rip), %rdi           # 48 8d 3d <Loc>
    //   callq __tls_get_addr@PLT             # e8 <disp32>
    //   leaq bar@dtpoff(%rax), %rcx
    // to
    //   .word 0x6666
    //   .byte 0x66
    //   mov %fs:0,%rax
    //   leaq bar@tpoff(%rax), %rcx
    memcpy(dest: loc - 3, src: inst, n: sizeof(inst));
    return;
  }

  if (loc[4] == 0xff && loc[5] == 0x15) {
    // Convert
    //   leaq x@tlsld(%rip),%rdi               # 48 8d 3d <Loc>
    //   call *__tls_get_addr@GOTPCREL(%rip)   # ff 15 <disp32>
    // to
    //   .long 0x66666666
    //   movq %fs:0,%rax
    // See "Table 11.9: LD -> LE Code Transition (LP64)" in
    // https://raw.githubusercontent.com/wiki/hjl-tools/x86-psABI/x86-64-psABI-1.0.pdf
    // The call form is one byte longer, so one extra 0x66 pad is needed.
    loc[-3] = 0x66;
    memcpy(dest: loc - 2, src: inst, n: sizeof(inst));
    return;
  }

  ErrAlways(ctx)
      << getErrorLoc(ctx, loc: loc - 3)
      << "expected R_X86_64_PLT32 or R_X86_64_GOTPCRELX after R_X86_64_TLSLD";
}
812
813// A JumpInstrMod at a specific offset indicates that the jump instruction
814// opcode at that offset must be modified. This is specifically used to relax
815// jump instructions with basic block sections. This function looks at the
816// JumpMod and effects the change.
817void X86_64::applyJumpInstrMod(uint8_t *loc, JumpModType type,
818 unsigned size) const {
819 switch (type) {
820 case J_JMP_32:
821 if (size == 4)
822 *loc = 0xe9;
823 else
824 *loc = 0xeb;
825 break;
826 case J_JE_32:
827 if (size == 4) {
828 loc[-1] = 0x0f;
829 *loc = 0x84;
830 } else
831 *loc = 0x74;
832 break;
833 case J_JNE_32:
834 if (size == 4) {
835 loc[-1] = 0x0f;
836 *loc = 0x85;
837 } else
838 *loc = 0x75;
839 break;
840 case J_JG_32:
841 if (size == 4) {
842 loc[-1] = 0x0f;
843 *loc = 0x8f;
844 } else
845 *loc = 0x7f;
846 break;
847 case J_JGE_32:
848 if (size == 4) {
849 loc[-1] = 0x0f;
850 *loc = 0x8d;
851 } else
852 *loc = 0x7d;
853 break;
854 case J_JB_32:
855 if (size == 4) {
856 loc[-1] = 0x0f;
857 *loc = 0x82;
858 } else
859 *loc = 0x72;
860 break;
861 case J_JBE_32:
862 if (size == 4) {
863 loc[-1] = 0x0f;
864 *loc = 0x86;
865 } else
866 *loc = 0x76;
867 break;
868 case J_JL_32:
869 if (size == 4) {
870 loc[-1] = 0x0f;
871 *loc = 0x8c;
872 } else
873 *loc = 0x7c;
874 break;
875 case J_JLE_32:
876 if (size == 4) {
877 loc[-1] = 0x0f;
878 *loc = 0x8e;
879 } else
880 *loc = 0x7e;
881 break;
882 case J_JA_32:
883 if (size == 4) {
884 loc[-1] = 0x0f;
885 *loc = 0x87;
886 } else
887 *loc = 0x77;
888 break;
889 case J_JAE_32:
890 if (size == 4) {
891 loc[-1] = 0x0f;
892 *loc = 0x83;
893 } else
894 *loc = 0x73;
895 break;
896 case J_UNKNOWN:
897 llvm_unreachable("Unknown Jump Relocation");
898 }
899}
900
// Read the addend stored in the relocated field itself (used for REL-format
// objects and for -z rel output). The width read matches the field width of
// each relocation type.
int64_t X86_64::getImplicitAddend(const uint8_t *buf, RelType type) const {
  switch (type) {
  case R_X86_64_8:
  case R_X86_64_PC8:
    return SignExtend64<8>(x: *buf);
  case R_X86_64_16:
  case R_X86_64_PC16:
    return SignExtend64<16>(x: read16le(P: buf));
  case R_X86_64_32:
  case R_X86_64_32S:
  case R_X86_64_TPOFF32:
  case R_X86_64_GOT32:
  case R_X86_64_GOTPC32:
  case R_X86_64_GOTPC32_TLSDESC:
  case R_X86_64_GOTPCREL:
  case R_X86_64_GOTPCRELX:
  case R_X86_64_REX_GOTPCRELX:
  case R_X86_64_CODE_4_GOTPCRELX:
  case R_X86_64_PC32:
  case R_X86_64_GOTTPOFF:
  case R_X86_64_CODE_4_GOTTPOFF:
  case R_X86_64_CODE_6_GOTTPOFF:
  case R_X86_64_PLT32:
  case R_X86_64_TLSGD:
  case R_X86_64_TLSLD:
  case R_X86_64_DTPOFF32:
  case R_X86_64_SIZE32:
    return SignExtend64<32>(x: read32le(P: buf));
  case R_X86_64_64:
  case R_X86_64_TPOFF64:
  case R_X86_64_DTPOFF64:
  case R_X86_64_DTPMOD64:
  case R_X86_64_PC64:
  case R_X86_64_SIZE64:
  case R_X86_64_GLOB_DAT:
  case R_X86_64_GOT64:
  case R_X86_64_GOTOFF64:
  case R_X86_64_GOTPC64:
  case R_X86_64_PLTOFF64:
  case R_X86_64_IRELATIVE:
  case R_X86_64_RELATIVE:
    return read64le(P: buf);
  case R_X86_64_TLSDESC:
    // TLSDESC entries are two words; the addend lives in the second.
    return read64le(P: buf + 8);
  case R_X86_64_JUMP_SLOT:
  case R_X86_64_NONE:
    // These relocations are defined as not having an implicit addend.
    return 0;
  default:
    InternalErr(ctx, buf) << "cannot read addend for relocation " << type;
    return 0;
  }
}
954
// Forward declaration; being `static`, relaxGot must be defined later in this
// translation unit. It rewrites relaxable GOTPCRELX sequences in place.
static void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val);
956
// Apply a single relocation at `loc` with the already-computed value `val`.
// For GOT/TLS relocation types, rel.expr selects between writing the value
// as-is (R_GOT_PC) and rewriting the instruction sequence via the relaxation
// helpers above.
void X86_64::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
  switch (rel.type) {
  case R_X86_64_8:
    checkIntUInt(ctx, loc, v: val, n: 8, rel);
    *loc = val;
    break;
  case R_X86_64_PC8:
    checkInt(ctx, loc, v: val, n: 8, rel);
    *loc = val;
    break;
  case R_X86_64_16:
    checkIntUInt(ctx, loc, v: val, n: 16, rel);
    write16le(P: loc, V: val);
    break;
  case R_X86_64_PC16:
    checkInt(ctx, loc, v: val, n: 16, rel);
    write16le(P: loc, V: val);
    break;
  case R_X86_64_32:
    checkUInt(ctx, loc, v: val, n: 32, rel);
    write32le(P: loc, V: val);
    break;
  case R_X86_64_32S:
  case R_X86_64_GOT32:
  case R_X86_64_GOTPC32:
  case R_X86_64_GOTPCREL:
  case R_X86_64_PC32:
  case R_X86_64_PLT32:
  case R_X86_64_DTPOFF32:
  case R_X86_64_SIZE32:
    checkInt(ctx, loc, v: val, n: 32, rel);
    write32le(P: loc, V: val);
    break;
  case R_X86_64_64:
  case R_X86_64_TPOFF64:
  case R_X86_64_DTPOFF64:
  case R_X86_64_PC64:
  case R_X86_64_SIZE64:
  case R_X86_64_GOT64:
  case R_X86_64_GOTOFF64:
  case R_X86_64_GOTPC64:
  case R_X86_64_PLTOFF64:
    write64le(P: loc, V: val);
    break;
  case R_X86_64_GOTPCRELX:
  case R_X86_64_REX_GOTPCRELX:
  case R_X86_64_CODE_4_GOTPCRELX:
    // R_GOT_PC means relaxation was suppressed (see relaxOnce/adjustGotPcExpr);
    // any other expr marks a relax candidate to be rewritten.
    if (rel.expr != R_GOT_PC) {
      relaxGot(loc, rel, val);
    } else {
      checkInt(ctx, loc, v: val, n: 32, rel);
      write32le(P: loc, V: val);
    }
    break;
  case R_X86_64_GOTPC32_TLSDESC:
  case R_X86_64_CODE_4_GOTPC32_TLSDESC:
  case R_X86_64_TLSDESC_CALL:
  case R_X86_64_TLSGD:
    // GD -> LE (R_TPREL), GD -> IE (R_GOT_PC), or keep the dynamic form.
    if (rel.expr == R_TPREL) {
      relaxTlsGdToLe(loc, rel, val);
    } else if (rel.expr == R_GOT_PC) {
      relaxTlsGdToIe(loc, rel, val);
    } else {
      checkInt(ctx, loc, v: val, n: 32, rel);
      write32le(P: loc, V: val);
    }
    break;
  case R_X86_64_TLSLD:
    if (rel.expr == R_TPREL) {
      relaxTlsLdToLe(loc, rel, val);
    } else {
      checkInt(ctx, loc, v: val, n: 32, rel);
      write32le(P: loc, V: val);
    }
    break;
  case R_X86_64_GOTTPOFF:
  case R_X86_64_CODE_4_GOTTPOFF:
  case R_X86_64_CODE_6_GOTTPOFF:
    if (rel.expr == R_TPREL) {
      relaxTlsIeToLe(loc, rel, val);
    } else {
      checkInt(ctx, loc, v: val, n: 32, rel);
      write32le(P: loc, V: val);
    }
    break;
  case R_X86_64_TPOFF32:
    checkInt(ctx, loc, v: val, n: 32, rel);
    write32le(P: loc, V: val);
    break;

  case R_X86_64_TLSDESC:
    // The addend is stored in the second 64-bit word.
    write64le(P: loc + 8, V: val);
    break;
  default:
    llvm_unreachable("unknown relocation");
  }
}
1055
// Decide whether a GOTPCRELX-style relocation may be relaxed, i.e. whether
// the GOT-indirect instruction can be rewritten to reference the symbol
// directly. Returns R_RELAX_GOT_PC / R_RELAX_GOT_PC_NOPIC when relaxation is
// possible and R_GOT_PC otherwise. `loc` points at the relocated field, so
// loc[-2] and loc[-1] are the instruction's opcode and ModR/M bytes.
RelExpr X86_64::adjustGotPcExpr(RelType type, int64_t addend,
                                const uint8_t *loc) const {
  // Only R_X86_64_[REX_]|[CODE_4_]GOTPCRELX can be relaxed. GNU as may emit
  // GOTPCRELX with addend != -4. Such an instruction does not load the full GOT
  // entry, so we cannot relax the relocation. E.g. movl x@GOTPCREL+4(%rip),
  // %rax (addend=0) loads the high 32 bits of the GOT entry.
  if (!ctx.arg.relax || addend != -4 ||
      (type != R_X86_64_GOTPCRELX && type != R_X86_64_REX_GOTPCRELX &&
       type != R_X86_64_CODE_4_GOTPCRELX))
    return R_GOT_PC;
  const uint8_t op = loc[-2];
  const uint8_t modRm = loc[-1];

  // FIXME: When PIC is disabled and foo is defined locally in the
  // lower 32 bit address space, memory operand in mov can be converted into
  // immediate operand. Otherwise, mov must be changed to lea. We support only
  // latter relaxation at this moment.
  if (op == 0x8b)
    return R_RELAX_GOT_PC;

  // Relax call and jmp.
  if (op == 0xff && (modRm == 0x15 || modRm == 0x25))
    return R_RELAX_GOT_PC;

  // We don't support test/binop instructions without a REX/REX2 prefix.
  if (type == R_X86_64_GOTPCRELX)
    return R_GOT_PC;

  // Relaxation of test, adc, add, and, cmp, or, sbb, sub, xor.
  // If PIC then no relaxation is available.
  return ctx.arg.isPic ? R_GOT_PC : R_RELAX_GOT_PC_NOPIC;
}
1088
// A subset of relaxations can only be applied for no-PIC. This method
// handles such relaxations. Instruction encoding information was taken from:
// "Intel 64 and IA-32 Architectures Software Developer's Manual V2"
// (http://www.intel.com/content/dam/www/public/us/en/documents/manuals/
// 64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf)
static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op, uint8_t modRm,
                          bool isRex2) {
  const uint8_t rex = loc[-3];
  // Convert "test %reg, foo@GOTPCREL(%rip)" to "test $foo, %reg".
  if (op == 0x85) {
    // See "TEST-Logical Compare" (4-428 Vol. 2B),
    // TEST r/m64, r64 uses "full" ModR / M byte (no opcode extension).

    // ModR/M byte has form XX YYY ZZZ, where
    // YYY is MODRM.reg(register 2), ZZZ is MODRM.rm(register 1).
    // XX has different meanings:
    // 00: The operand's memory address is in reg1.
    // 01: The operand's memory address is reg1 + a byte-sized displacement.
    // 10: The operand's memory address is reg1 + a word-sized displacement.
    // 11: The operand is reg1 itself.
    // If an instruction requires only one operand, the unused reg2 field
    // holds extra opcode bits rather than a register code
    // 0xC0 == 11 000 000 binary.
    // 0x38 == 00 111 000 binary.
    // We transfer reg2 to reg1 here as operand.
    // See "2.1.3 ModR/M and SIB Bytes" (Vol. 2A 2-3).
    loc[-1] = 0xc0 | (modRm & 0x38) >> 3; // ModR/M byte.

    // Change opcode from TEST r/m64, r64 to TEST r/m64, imm32
    // See "TEST-Logical Compare" (4-428 Vol. 2B).
    loc[-2] = 0xf7;

    // Move R bit to the B bit in REX/REX2 byte.
    // REX byte is encoded as 0100WRXB, where
    // 0100 is 4bit fixed pattern.
    // REX.W When 1, a 64-bit operand size is used. Otherwise, when 0, the
    // default operand size is used (which is 32-bit for most but not all
    // instructions).
    // REX.R This 1-bit value is an extension to the MODRM.reg field.
    // REX.X This 1-bit value is an extension to the SIB.index field.
    // REX.B This 1-bit value is an extension to the MODRM.rm field or the
    // SIB.base field.
    // See "2.2.1.2 More on REX Prefix Fields " (2-8 Vol. 2A).
    //
    // REX2 prefix is encoded as 0xd5|M|R2|X2|B2|WRXB, where
    // 0xd5 is 1byte fixed pattern.
    // REX2's [W,R,X,B] have the same meanings as REX's.
    // REX2.M encodes the map id.
    // R2/X2/B2 provide the fifth and most significant bits of the R/X/B
    // register identifiers, each of which can now address all 32 GPRs.
    if (isRex2)
      loc[-3] = (rex & ~0x44) | (rex & 0x44) >> 2;
    else
      loc[-3] = (rex & ~0x4) | (rex & 0x4) >> 2;
    write32le(P: loc, V: val);
    return;
  }

  // If we are here then we need to relax the adc, add, and, cmp, or, sbb, sub
  // or xor operations.

  // Convert "binop foo@GOTPCREL(%rip), %reg" to "binop $foo, %reg".
  // Logic is close to that of the test instruction above, but we also
  // write the opcode extension here, see below for details.
  loc[-1] = 0xc0 | (modRm & 0x38) >> 3 | (op & 0x3c); // ModR/M byte.

  // Primary opcode is 0x81, opcode extension is one of:
  // 000b = ADD, 001b is OR, 010b is ADC, 011b is SBB,
  // 100b is AND, 101b is SUB, 110b is XOR, 111b is CMP.
  // This value is written to MODRM.reg in the line above.
  // See "3.2 INSTRUCTIONS (A-M)" (Vol. 2A 3-15),
  // "INSTRUCTION SET REFERENCE, N-Z" (Vol. 2B 4-1) for
  // descriptions about each operation.
  loc[-2] = 0x81;
  if (isRex2)
    loc[-3] = (rex & ~0x44) | (rex & 0x44) >> 2;
  else
    loc[-3] = (rex & ~0x4) | (rex & 0x4) >> 2;
  write32le(P: loc, V: val);
}
1169
// Rewrite a GOT-indirect instruction (mov/call/jmp/test/binop on a GOT
// entry) so it references the symbol directly, per the psABI GOTPCRELX
// relaxations. adjustGotPcExpr has already verified the instruction pattern
// is one we can handle. `loc` points at the 32-bit displacement field, so
// loc[-2]/loc[-1] are the opcode and ModR/M bytes.
static void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val) {
  assert(isInt<32>(val) &&
         "GOTPCRELX should not have been relaxed if it overflows");
  const uint8_t op = loc[-2];
  const uint8_t modRm = loc[-1];

  // Convert "mov foo@GOTPCREL(%rip),%reg" to "lea foo(%rip),%reg".
  if (op == 0x8b) {
    loc[-2] = 0x8d;
    write32le(P: loc, V: val);
    return;
  }

  if (op != 0xff) {
    // We are relaxing a rip relative to an absolute, so compensate
    // for the old -4 addend.
    assert(!rel.sym->file->ctx.arg.isPic);
    relaxGotNoPic(loc, val: val + 4, op, modRm,
                  isRex2: rel.type == R_X86_64_CODE_4_GOTPCRELX);
    return;
  }

  // Convert call/jmp instructions.
  if (modRm == 0x15) {
    // ABI says we can convert "call *foo@GOTPCREL(%rip)" to "nop; call foo".
    // Instead we convert to "addr32 call foo" where addr32 is an instruction
    // prefix. That makes the result a single instruction.
    loc[-2] = 0x67; // addr32 prefix
    loc[-1] = 0xe8; // call
    write32le(P: loc, V: val);
    return;
  }

  // Convert "jmp *foo@GOTPCREL(%rip)" to "jmp foo; nop".
  // jmp doesn't return, so it is fine to use nop here, it is just a stub.
  // The direct jmp rel32 is one byte shorter than the original 6-byte
  // indirect jmp, so the displacement moves back one byte (written at
  // loc - 1, adjusted by +1) and the trailing byte becomes a nop.
  assert(modRm == 0x25);
  loc[-2] = 0xe9; // jmp
  loc[3] = 0x90;  // nop
  write32le(P: loc - 1, V: val + 1);
}
1210
// A split-stack prologue starts by checking the amount of stack remaining
// in one of two ways:
// A) Comparing of the stack pointer to a field in the tcb.
// B) Or a load of a stack pointer offset with an lea to r10 or r11.
// Returns true if the prologue at `loc` was recognized and patched.
bool X86_64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
                                              uint8_t stOther) const {
  if (!ctx.arg.is64) {
    ErrAlways(ctx) << "target doesn't support split stacks";
    return false;
  }

  // Both rewrites below touch up to 8 bytes; bail out if the section does
  // not have that much room left.
  if (loc + 8 >= end)
    return false;

  // Replace "cmp %fs:0x70,%rsp" and subsequent branch
  // with "stc, nopl 0x0(%rax,%rax,1)"
  if (memcmp(s1: loc, s2: "\x64\x48\x3b\x24\x25", n: 5) == 0) {
    memcpy(dest: loc, src: "\xf9\x0f\x1f\x84\x00\x00\x00\x00", n: 8);
    return true;
  }

  // Adjust "lea X(%rsp),%rYY" to lea "(X - 0x4000)(%rsp),%rYY" where rYY could
  // be r10 or r11. The lea instruction feeds a subsequent compare which checks
  // if there is X available stack space. Making X larger effectively reserves
  // that much additional space. The stack grows downward so subtract the value.
  if (memcmp(s1: loc, s2: "\x4c\x8d\x94\x24", n: 4) == 0 ||
      memcmp(s1: loc, s2: "\x4c\x8d\x9c\x24", n: 4) == 0) {
    // The offset bytes are encoded four bytes after the start of the
    // instruction.
    write32le(P: loc + 4, V: read32le(P: loc + 4) - 0x4000);
    return true;
  }
  return false;
}
1245
// Apply all scanned relocations of `sec` to the output buffer `buf`, then
// apply any deferred jump-instruction modification recorded by
// deleteFallThruJmpInsn.
void X86_64::relocateAlloc(InputSection &sec, uint8_t *buf) const {
  // Virtual address of the section's first byte; rel.offset is relative to it.
  uint64_t secAddr = sec.getOutputSection()->addr + sec.outSecOff;
  for (const Relocation &rel : sec.relocs()) {
    if (rel.expr == R_NONE) // See deleteFallThruJmpInsn
      continue;
    uint8_t *loc = buf + rel.offset;
    const uint64_t val = sec.getRelocTargetVA(ctx, r: rel, p: secAddr + rel.offset);
    relocate(loc, rel, val);
  }
  if (sec.jumpInstrMod) {
    applyJumpInstrMod(loc: buf + sec.jumpInstrMod->offset,
                      type: sec.jumpInstrMod->original, size: sec.jumpInstrMod->size);
  }
}
1260
1261static std::optional<uint64_t> getControlTransferAddend(InputSection &is,
1262 Relocation &r) {
1263 // Identify a control transfer relocation for the branch-to-branch
1264 // optimization. A "control transfer relocation" usually means a CALL or JMP
1265 // target but it also includes relative vtable relocations for example.
1266 //
1267 // We require the relocation type to be PLT32. With a relocation type of PLT32
1268 // the value may be assumed to be used for branching directly to the symbol
1269 // and the addend is only used to produce the relocated value (hence the
1270 // effective addend is always 0). This is because if a PLT is needed the
1271 // addend will be added to the address of the PLT, and it doesn't make sense
1272 // to branch into the middle of a PLT. For example, relative vtable
1273 // relocations use PLT32 and 0 or a positive value as the addend but still are
1274 // used to branch to the symbol.
1275 //
1276 // STT_SECTION symbols are a special case on x86 because the LLVM assembler
1277 // uses them for branches to local symbols which are assembled as referring to
1278 // the section symbol with the addend equal to the symbol value - 4.
1279 if (r.type == R_X86_64_PLT32) {
1280 if (r.sym->isSection())
1281 return r.addend + 4;
1282 return 0;
1283 }
1284 return std::nullopt;
1285}
1286
// For the branch-to-branch optimization: if the instruction at `offset` in
// `is` is a direct JMP rel32 (opcode 0xe9), return the relocation on its
// displacement field together with the effective addend (addend + 4).
// Otherwise return {nullptr, 0}.
static std::pair<Relocation *, uint64_t>
getBranchInfoAtTarget(InputSection &is, uint64_t offset) {
  auto content = is.contentMaybeDecompress();
  if (content.size() > offset && content[offset] == 0xe9) { // JMP immediate
    // Relocations are sorted by offset; binary-search for one on the
    // displacement field, which starts one byte after the opcode.
    auto *i = llvm::partition_point(
        Range&: is.relocations, P: [&](Relocation &r) { return r.offset < offset + 1; });
    // Unlike with getControlTransferAddend() it is valid to accept a PC32
    // relocation here because we know that this is actually a JMP and not some
    // other reference, so the interpretation is that we add 4 to the addend and
    // use that as the effective addend.
    if (i != is.relocations.end() && i->offset == offset + 1 &&
        (i->type == R_X86_64_PC32 || i->type == R_X86_64_PLT32)) {
      return {i, i->addend + 4};
    }
  }
  return {nullptr, 0};
}
1304
1305static void redirectControlTransferRelocations(Relocation &r1,
1306 const Relocation &r2) {
1307 // The isSection() check handles the STT_SECTION case described above.
1308 // In that case the original addend is irrelevant because it referred to an
1309 // offset within the original target section so we overwrite it.
1310 //
1311 // The +4 is here to compensate for r2.addend which will likely be -4,
1312 // but may also be addend-4 in case of a PC32 branch to symbol+addend.
1313 if (r1.sym->isSection())
1314 r1.addend = r2.addend;
1315 else
1316 r1.addend += r2.addend + 4;
1317 r1.expr = r2.expr;
1318 r1.sym = r2.sym;
1319}
1320
// Run the generic branch-to-branch optimization with the x86-64-specific
// hooks defined above.
void X86_64::applyBranchToBranchOpt() const {
  applyBranchToBranchOptImpl(ctx, getControlTransferAddend,
                             getBranchInfoAtTarget,
                             redirectControlTransferRelocations);
}
1326
// If Intel Indirect Branch Tracking is enabled, we have to emit special PLT
// entries containing endbr64 instructions. A PLT entry will be split into two
// parts, one in .plt.sec (writePlt), and the other in .plt (writeIBTPlt).
namespace {
class IntelIBT : public X86_64 {
public:
  // .plt.sec entries have no header, so pltHeaderSize is 0.
  IntelIBT(Ctx &ctx) : X86_64(ctx) { pltHeaderSize = 0; };
  void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
  void writePlt(uint8_t *buf, const Symbol &sym,
                uint64_t pltEntryAddr) const override;
  void writeIBTPlt(uint8_t *buf, size_t numEntries) const override;

  // Size of the lazy-binding header emitted at the start of .plt.
  static const unsigned IBTPltHeaderSize = 16;
};
} // namespace
1342
1343void IntelIBT::writeGotPlt(uint8_t *buf, const Symbol &s) const {
1344 uint64_t va = ctx.in.ibtPlt->getVA() + IBTPltHeaderSize +
1345 s.getPltIdx(ctx) * pltEntrySize;
1346 write64le(P: buf, V: va);
1347}
1348
// Write a .plt.sec entry: endbr64 followed by an indirect jump through the
// symbol's .got.plt slot, padded to 16 bytes with a nop.
void IntelIBT::writePlt(uint8_t *buf, const Symbol &sym,
                        uint64_t pltEntryAddr) const {
  const uint8_t Inst[] = {
      0xf3, 0x0f, 0x1e, 0xfa,       // endbr64
      0xff, 0x25, 0,    0,    0, 0, // jmpq *got(%rip)
      0x66, 0x0f, 0x1f, 0x44, 0, 0, // nop
  };
  memcpy(dest: buf, src: Inst, n: sizeof(Inst));
  // RIP-relative displacement: the jmpq's field is at offset 6 and the
  // instruction ends at offset 10.
  write32le(P: buf + 6, V: sym.getGotPltVA(ctx) - pltEntryAddr - 10);
}
1359
// Write the .plt section: the standard lazy-binding header followed by one
// endbr64-prefixed lazy-resolution entry per PLT symbol.
void IntelIBT::writeIBTPlt(uint8_t *buf, size_t numEntries) const {
  writePltHeader(buf);
  buf += IBTPltHeaderSize;

  const uint8_t inst[] = {
      0xf3, 0x0f, 0x1e, 0xfa, // endbr64
      0x68, 0,    0,    0, 0, // pushq <relocation index>
      0xe9, 0,    0,    0, 0, // jmpq plt[0]
      0x66, 0x90,             // nop
  };

  for (size_t i = 0; i < numEntries; ++i) {
    memcpy(dest: buf, src: inst, n: sizeof(inst));
    write32le(P: buf + 5, V: i);
    // jmpq displacement back to the start of .plt: the jmp ends
    // IBTPltHeaderSize + i entries + 14 bytes into the section. pltHeaderSize
    // is 0 for IntelIBT (see the constructor), and 30 = IBTPltHeaderSize + 14.
    write32le(P: buf + 10, V: -pltHeaderSize - sizeof(inst) * i - 30);
    buf += sizeof(inst);
  }
}
1378
// These nonstandard PLT entries are to mitigate Spectre v2 security
// vulnerability. In order to mitigate Spectre v2, we want to avoid indirect
// branch instructions such as `jmp *GOTPLT(%rip)`. So, in the following PLT
// entries, we use a CALL followed by MOV and RET to do the same thing as an
// indirect jump. That instruction sequence is so-called "retpoline".
//
// We have two types of retpoline PLTs as a size optimization. If `-z now`
// is specified, all dynamic symbols are resolved at load-time. Thus, when
// that option is given, we can omit code for symbol lazy resolution.
namespace {
// Retpoline PLT with lazy-binding support.
class Retpoline : public X86_64 {
public:
  Retpoline(Ctx &);
  void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
  void writePltHeader(uint8_t *buf) const override;
  void writePlt(uint8_t *buf, const Symbol &sym,
                uint64_t pltEntryAddr) const override;
};

// Retpoline PLT for -z now: no lazy resolution, so .got.plt slots need no
// initial value and entries are smaller.
class RetpolineZNow : public X86_64 {
public:
  RetpolineZNow(Ctx &);
  void writeGotPlt(uint8_t *buf, const Symbol &s) const override {}
  void writePltHeader(uint8_t *buf) const override;
  void writePlt(uint8_t *buf, const Symbol &sym,
                uint64_t pltEntryAddr) const override;
};
} // namespace
1407
1408Retpoline::Retpoline(Ctx &ctx) : X86_64(ctx) {
1409 pltHeaderSize = 48;
1410 pltEntrySize = 32;
1411 ipltEntrySize = 32;
1412}
1413
1414void Retpoline::writeGotPlt(uint8_t *buf, const Symbol &s) const {
1415 write64le(P: buf, V: s.getPltVA(ctx) + 17);
1416}
1417
// Write the 48-byte retpoline PLT header: push the second .got.plt word,
// load the third into %r11, then enter the retpoline sequence that
// "returns" through %r11 without executing an indirect branch.
void Retpoline::writePltHeader(uint8_t *buf) const {
  const uint8_t insn[] = {
      0xff, 0x35, 0,    0,    0,    0,          // 0:    pushq GOTPLT+8(%rip)
      0x4c, 0x8b, 0x1d, 0,    0,    0,    0,    // 6:    mov GOTPLT+16(%rip), %r11
      0xe8, 0x0e, 0x00, 0x00, 0x00,             // d:    callq next
      0xf3, 0x90,                               // 12: loop: pause
      0x0f, 0xae, 0xe8,                         // 14:   lfence
      0xeb, 0xf9,                               // 17:   jmp loop
      0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 19:   int3; .align 16
      0x4c, 0x89, 0x1c, 0x24,                   // 20: next: mov %r11, (%rsp)
      0xc3,                                     // 24:   ret
      0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 25:   int3; padding
      0xcc, 0xcc, 0xcc, 0xcc,                   // 2c:   int3; padding
  };
  memcpy(dest: buf, src: insn, n: sizeof(insn));

  uint64_t gotPlt = ctx.in.gotPlt->getVA();
  uint64_t plt = ctx.in.plt->getVA();
  // RIP-relative displacements: the pushq ends at offset 6 and targets
  // GOTPLT+8; the mov ends at offset 0xd and targets GOTPLT+16.
  write32le(P: buf + 2, V: gotPlt - plt - 6 + 8);
  write32le(P: buf + 9, V: gotPlt - plt - 13 + 16);
}
1439
// Write one 32-byte retpoline PLT entry. Layout within the entry:
//   0x00: load this symbol's .got.plt slot into %r11
//   0x07: call the thunk at plt+0x20, which "returns" through %r11
//   0x0c: on fall-through, jmp to the pause/lfence loop at plt+0x12
//   0x11: lazy-resolution stub: pushq <reloc index>; jmp plt+0
void Retpoline::writePlt(uint8_t *buf, const Symbol &sym,
                         uint64_t pltEntryAddr) const {
  const uint8_t insn[] = {
      0x4c, 0x8b, 0x1d, 0, 0, 0, 0,  // 0:  mov foo@GOTPLT(%rip), %r11
      0xe8, 0,    0,    0, 0,        // 7:  callq plt+0x20
      0xe9, 0,    0,    0, 0,        // c:  jmp plt+0x12
      0x68, 0,    0,    0, 0,        // 11: pushq <relocation index>
      0xe9, 0,    0,    0, 0,        // 16: jmp plt+0
      0xcc, 0xcc, 0xcc, 0xcc, 0xcc,  // 1b: int3; padding
  };
  memcpy(dest: buf, src: insn, n: sizeof(insn));

  // Offset of this entry from the start of .plt; each displacement below is
  // target - (end of the instruction holding the field).
  uint64_t off = pltEntryAddr - ctx.in.plt->getVA();

  write32le(P: buf + 3, V: sym.getGotPltVA(ctx) - pltEntryAddr - 7);
  write32le(P: buf + 8, V: -off - 12 + 32);  // callq plt+0x20
  write32le(P: buf + 13, V: -off - 17 + 18); // jmp plt+0x12
  write32le(P: buf + 18, V: sym.getPltIdx(ctx));
  write32le(P: buf + 23, V: -off - 27);      // jmp plt+0
}
1460
1461RetpolineZNow::RetpolineZNow(Ctx &ctx) : X86_64(ctx) {
1462 pltHeaderSize = 32;
1463 pltEntrySize = 16;
1464 ipltEntrySize = 16;
1465}
1466
// Write the 32-byte -z now retpoline header: just the shared thunk that
// each entry calls to "return" through %r11 without an indirect branch.
void RetpolineZNow::writePltHeader(uint8_t *buf) const {
  const uint8_t insn[] = {
      0xe8, 0x0b, 0x00, 0x00, 0x00, // 0:    call next
      0xf3, 0x90,                   // 5:  loop: pause
      0x0f, 0xae, 0xe8,             // 7:    lfence
      0xeb, 0xf9,                   // a:    jmp loop
      0xcc, 0xcc, 0xcc, 0xcc,       // c:    int3; .align 16
      0x4c, 0x89, 0x1c, 0x24,       // 10: next: mov %r11, (%rsp)
      0xc3,                         // 14:   ret
      0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 15:   int3; padding
      0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 1a:   int3; padding
      0xcc,                         // 1f:   int3; padding
  };
  memcpy(dest: buf, src: insn, n: sizeof(insn));
}
1482
// Write a 16-byte -z now retpoline entry: load the (already-resolved)
// .got.plt slot into %r11 and jump to the thunk in the header.
void RetpolineZNow::writePlt(uint8_t *buf, const Symbol &sym,
                             uint64_t pltEntryAddr) const {
  const uint8_t insn[] = {
      0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // mov foo@GOTPLT(%rip), %r11
      0xe9, 0,    0,    0, 0,       // jmp plt+0
      0xcc, 0xcc, 0xcc, 0xcc,       // int3; padding
  };
  memcpy(dest: buf, src: insn, n: sizeof(insn));

  // Displacements are relative to the end of each instruction (offsets 7
  // and 12 within the entry).
  write32le(P: buf + 3, V: sym.getGotPltVA(ctx) - pltEntryAddr - 7);
  write32le(P: buf + 8, V: ctx.in.plt->getVA() - pltEntryAddr - 12);
}
1495
1496void elf::setX86_64TargetInfo(Ctx &ctx) {
1497 if (ctx.arg.zRetpolineplt) {
1498 if (ctx.arg.zNow)
1499 ctx.target.reset(p: new RetpolineZNow(ctx));
1500 else
1501 ctx.target.reset(p: new Retpoline(ctx));
1502 return;
1503 }
1504
1505 if (ctx.arg.andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT)
1506 ctx.target.reset(p: new IntelIBT(ctx));
1507 else
1508 ctx.target.reset(p: new X86_64(ctx));
1509}
1510