//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// Provide a pass which mitigates speculative execution attacks which operate
/// by speculating incorrectly past some predicate (a type check, bounds check,
/// or other condition) to reach a load with invalid inputs and leak the data
/// accessed by that load using a side channel out of the speculative domain.
///
/// For details on the attacks, see the first variant in both the Project Zero
/// writeup and the Spectre paper:
/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
/// https://spectreattack.com/spectre.pdf
///
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <iterator>
#include <optional>
#include <utility>

using namespace llvm;

#define PASS_KEY "x86-slh"
#define DEBUG_TYPE PASS_KEY

STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
STATISTIC(NumAddrRegsHardened,
          "Number of address-mode registers hardened");
STATISTIC(NumPostLoadRegsHardened,
          "Number of post-load register values hardened");
STATISTIC(NumCallsOrJumpsHardened,
          "Number of calls or jumps requiring extra hardening");
STATISTIC(NumInstsInserted, "Number of instructions inserted");
STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");

static cl::opt<bool> EnableSpeculativeLoadHardening(
    "x86-speculative-load-hardening",
    cl::desc("Force enable speculative load hardening"), cl::init(false),
    cl::Hidden);

static cl::opt<bool> HardenEdgesWithLFENCE(
    PASS_KEY "-lfence",
    cl::desc(
        "Use LFENCE along each conditional edge to harden against speculative "
        "loads rather than conditional movs and poisoned pointers."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> EnablePostLoadHardening(
    PASS_KEY "-post-load",
    cl::desc("Harden the value loaded *after* it is loaded by "
             "flushing the loaded bits to 1. This is hard to do "
             "in general but can be done easily for GPRs."),
    cl::init(true), cl::Hidden);

static cl::opt<bool> FenceCallAndRet(
    PASS_KEY "-fence-call-and-ret",
    cl::desc("Use a full speculation fence to harden both call and ret edges "
             "rather than a lighter weight mitigation."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> HardenInterprocedurally(
    PASS_KEY "-ip",
    cl::desc("Harden interprocedurally by passing our state in and out of "
             "functions in the high bits of the stack pointer."),
    cl::init(true), cl::Hidden);

static cl::opt<bool>
    HardenLoads(PASS_KEY "-loads",
                cl::desc("Sanitize loads from memory. When disabled, no "
                         "significant security is provided."),
                cl::init(true), cl::Hidden);

static cl::opt<bool> HardenIndirectCallsAndJumps(
    PASS_KEY "-indirect",
    cl::desc("Harden indirect calls and jumps against using speculatively "
             "stored attacker-controlled addresses. This is designed to "
             "mitigate Spectre v1.2 style attacks."),
    cl::init(true), cl::Hidden);
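
// All of these flags are consumed by the normal LLVM command-line machinery,
// so (illustratively) forcing the pass on and selecting the LFENCE-based
// variant from `llc` looks like:
//   llc -x86-speculative-load-hardening -x86-slh-lfence foo.ll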

namespace {

class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
public:
  X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override {
    return "X86 speculative load hardening";
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  /// The information about a block's conditional terminators needed to trace
  /// our predicate state through the exiting edges.
  struct BlockCondInfo {
    MachineBasicBlock *MBB;

    // We mostly have one conditional branch, and in extremely rare cases have
    // two. Three and more are so rare as to be unimportant for compile time.
    SmallVector<MachineInstr *, 2> CondBrs;

    MachineInstr *UncondBr;
  };

  /// Manages the predicate state traced through the program.
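  ///
  /// The key invariant (established in runOnMachineFunction below) is that
  /// along any correctly speculated path the state register holds all-zeros,
  /// while along any misspeculated path it holds the all-ones poison value.
  /// OR-ing the state into a pointer or a loaded value therefore only changes
  /// data that is observed under misspeculation.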
  struct PredState {
    Register InitialReg;
    Register PoisonReg;

    const TargetRegisterClass *RC;
    MachineSSAUpdater SSA;

    PredState(MachineFunction &MF, const TargetRegisterClass *RC)
        : RC(RC), SSA(MF) {}
  };

  const X86Subtarget *Subtarget = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const X86InstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;

  std::optional<PredState> PS;

  void hardenEdgesWithLFENCE(MachineFunction &MF);

  SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);

  SmallVector<MachineInstr *, 16>
  tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);

  void unfoldCallAndJumpLoads(MachineFunction &MF);

  SmallVector<MachineInstr *, 16>
  tracePredStateThroughIndirectBranches(MachineFunction &MF);

  void tracePredStateThroughBlocksAndHarden(MachineFunction &MF);

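  /// Helpers to preserve EFLAGS around inserted hardening code: when the
  /// flags are live at an insertion point, they are saved into a
  /// general-purpose register and restored afterwards so that the cmov-based
  /// checks don't clobber an in-flight comparison.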
  Register saveEFLAGS(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator InsertPt,
                      const DebugLoc &Loc);
  void restoreEFLAGS(MachineBasicBlock &MBB,
                     MachineBasicBlock::iterator InsertPt, const DebugLoc &Loc,
                     Register Reg);

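  /// Helpers for the interprocedural scheme: the all-zeros/all-ones predicate
  /// state is merged into the high bits of the stack pointer before calls and
  /// returns and re-extracted on the other side. Roughly (the bodies appear
  /// later in this file), merging shifts the state up into RSP's high bits
  /// and ORs it in, and extraction arithmetic-shifts RSP to re-smear the
  /// poison across a full register.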
  void mergePredStateIntoSP(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator InsertPt,
                            const DebugLoc &Loc, Register PredStateReg);
  Register extractPredStateFromSP(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator InsertPt,
                                  const DebugLoc &Loc);

  void
  hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
                 MachineOperand &IndexMO,
                 SmallDenseMap<Register, Register, 32> &AddrRegToHardenedReg);
  MachineInstr *
  sinkPostLoadHardenedInst(MachineInstr &MI,
                           SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
  bool canHardenRegister(Register Reg);
  Register hardenValueInRegister(Register Reg, MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator InsertPt,
                                 const DebugLoc &Loc);
  Register hardenPostLoad(MachineInstr &MI);
  void hardenReturnInstr(MachineInstr &MI);
  void tracePredStateThroughCall(MachineInstr &MI);
  void hardenIndirectCallOrJumpInstr(
      MachineInstr &MI,
      SmallDenseMap<Register, Register, 32> &AddrRegToHardenedReg);
};

} // end anonymous namespace

char X86SpeculativeLoadHardeningPass::ID = 0;

void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
    AnalysisUsage &AU) const {
  MachineFunctionPass::getAnalysisUsage(AU);
}

static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
                                    MachineBasicBlock &Succ, int SuccCount,
                                    MachineInstr *Br, MachineInstr *&UncondBr,
                                    const X86InstrInfo &TII) {
  assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");

  MachineFunction &MF = *MBB.getParent();

  MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();

  // We have to insert the new block immediately after the current one as we
  // don't know what layout-successor relationships the successor has and we
  // may not be able to (and generally don't want to) try to fix those up.
  MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);

  // Update the branch instruction if necessary.
  if (Br) {
    assert(Br->getOperand(0).getMBB() == &Succ &&
           "Didn't start with the right target!");
    Br->getOperand(0).setMBB(&NewMBB);

    // If this successor was reached through a branch rather than fallthrough,
    // we might have *broken* fallthrough and so need to inject a new
    // unconditional branch.
    if (!UncondBr) {
      MachineBasicBlock &OldLayoutSucc =
          *std::next(MachineFunction::iterator(&NewMBB));
      assert(MBB.isSuccessor(&OldLayoutSucc) &&
             "Without an unconditional branch, the old layout successor should "
             "be an actual successor!");
      auto BrBuilder =
          BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
      // Update the unconditional branch now that we've added one.
      UncondBr = &*BrBuilder;
    }

    // Insert unconditional "jump Succ" instruction in the new block if
    // necessary.
    if (!NewMBB.isLayoutSuccessor(&Succ)) {
      SmallVector<MachineOperand, 4> Cond;
      TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
    }
  } else {
    assert(!UncondBr &&
           "Cannot have a branchless successor and an unconditional branch!");
    assert(NewMBB.isLayoutSuccessor(&Succ) &&
           "A non-branch successor must have been a layout successor before "
           "and now is a layout successor of the new block.");
  }

  // If this is the only edge to the successor, we can just replace it in the
  // CFG. Otherwise we need to add a new entry in the CFG for the new
  // successor.
  if (SuccCount == 1) {
    MBB.replaceSuccessor(&Succ, &NewMBB);
  } else {
    MBB.splitSuccessor(&Succ, &NewMBB);
  }

  // Hook up the edge from the new basic block to the old successor in the CFG.
  NewMBB.addSuccessor(&Succ);

  // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
  for (MachineInstr &MI : Succ) {
    if (!MI.isPHI())
      break;
    for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
         OpIdx += 2) {
      MachineOperand &OpV = MI.getOperand(OpIdx);
      MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
      assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
      if (OpMBB.getMBB() != &MBB)
        continue;

      // If this is the last edge to the successor, just replace MBB in the
      // PHI.
      if (SuccCount == 1) {
        OpMBB.setMBB(&NewMBB);
        break;
      }

      // Otherwise, append a new pair of operands for the new incoming edge.
      MI.addOperand(MF, OpV);
      MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
      break;
    }
  }

  // Inherit live-ins from the successor.
  for (auto &LI : Succ.liveins())
    NewMBB.addLiveIn(LI);

  LLVM_DEBUG(dbgs() << "  Split edge from '" << MBB.getName() << "' to '"
                    << Succ.getName() << "'.\n");
  return NewMBB;
}

/// Remove duplicate PHI operands, leaving each PHI in a canonical and
/// predictable form.
///
/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
/// isn't what you might expect. We may have multiple entries in PHI nodes for
/// a single predecessor. This makes CFG-updating extremely complex, so here we
/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
/// one entry per predecessor, regardless of how many edges there are.
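///
/// For example, after this runs, a PHI such as (in MIR notation, a sketch):
///   %x = PHI %a, %bb.1, %a, %bb.1, %b, %bb.2
/// becomes:
///   %x = PHI %a, %bb.1, %b, %bb.2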
static void canonicalizePHIOperands(MachineFunction &MF) {
  SmallPtrSet<MachineBasicBlock *, 4> Preds;
  SmallVector<int, 4> DupIndices;
  for (auto &MBB : MF)
    for (auto &MI : MBB) {
      if (!MI.isPHI())
        break;

      // First we scan the operands of the PHI looking for duplicate entries
      // for a particular predecessor. We retain the operand index of each
      // duplicate entry found.
      for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
           OpIdx += 2)
        if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
          DupIndices.push_back(OpIdx);

      // Now walk the duplicate indices, removing both the block and value.
      // Note that these are stored as a vector, making this element-wise
      // removal potentially quadratic.
      //
      // FIXME: It is really frustrating that we have to use a quadratic
      // removal algorithm here. There should be a better way, but the use-def
      // updates required make that impossible using the public API.
      //
      // Note that we have to process these backwards so that we don't
      // invalidate other indices with each removal.
      while (!DupIndices.empty()) {
        int OpIdx = DupIndices.pop_back_val();
        // Remove both the block and value operand, again in reverse order to
        // preserve indices.
        MI.removeOperand(OpIdx + 1);
        MI.removeOperand(OpIdx);
      }

      Preds.clear();
    }
}

/// Helper to scan a function for loads vulnerable to misspeculation that we
/// want to harden.
///
/// We use this to avoid making changes to functions where there is nothing we
/// need to do to harden against misspeculation.
static bool hasVulnerableLoad(MachineFunction &MF) {
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // Loads within this basic block after an LFENCE are not at risk of
      // speculatively executing with invalid predicates from prior control
      // flow. So break out of this block but continue scanning the function.
      if (MI.getOpcode() == X86::LFENCE)
        break;

      // Looking for loads only.
      if (!MI.mayLoad())
        continue;

      // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
      if (MI.getOpcode() == X86::MFENCE)
        continue;

      // We found a load.
      return true;
    }
  }

  // No loads found.
  return false;
}

bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
    MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << " **********\n");

  // Only run if this pass is forced enabled or we detect the relevant function
  // attribute requesting SLH.
  if (!EnableSpeculativeLoadHardening &&
      !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
    return false;

  Subtarget = &MF.getSubtarget<X86Subtarget>();
  MRI = &MF.getRegInfo();
  TII = Subtarget->getInstrInfo();
  TRI = Subtarget->getRegisterInfo();

  // FIXME: Support for 32-bit.
  PS.emplace(MF, &X86::GR64_NOSPRegClass);

  if (MF.begin() == MF.end())
    // Nothing to do for a degenerate empty function...
    return false;

  // We support an alternative hardening technique based on a debug flag.
  if (HardenEdgesWithLFENCE) {
    hardenEdgesWithLFENCE(MF);
    return true;
  }

  // Create a dummy debug loc to use for all the generated code here.
  DebugLoc Loc;

  MachineBasicBlock &Entry = *MF.begin();
  auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());

  // Do a quick scan to see if we have any checkable loads.
  bool HasVulnerableLoad = hasVulnerableLoad(MF);

  // See if we have any conditional branching blocks that we will need to trace
  // predicate state through.
  SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);

  // If we have no interesting conditions or loads, nothing to do here.
  if (!HasVulnerableLoad && Infos.empty())
    return true;

  // The poison value is required to be an all-ones value for many aspects of
  // this mitigation.
  const int PoisonVal = -1;
  PS->PoisonReg = MRI->createVirtualRegister(PS->RC);
  BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS->PoisonReg)
      .addImm(PoisonVal);
  ++NumInstsInserted;

  // If we have loads being hardened and we've asked for call and ret edges to
  // get a full fence-based mitigation, inject that fence.
  if (HasVulnerableLoad && FenceCallAndRet) {
    // We need to insert an LFENCE at the start of the function to suspend any
    // incoming misspeculation from the caller. This helps two-fold: the caller
    // may not have been protected as this code has been, and this code gets to
    // not take any specific action to protect across calls.
    // FIXME: We could skip this for functions which unconditionally return
    // a constant.
    BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
    ++NumInstsInserted;
    ++NumLFENCEsInserted;
  }

  // If we guarded the entry with an LFENCE and have no conditionals to protect
  // in blocks, then we're done.
  if (FenceCallAndRet && Infos.empty())
    // We may have changed the function's code at this point to insert fences.
    return true;

  // Establish the function's initial predicate state.
  if (HardenInterprocedurally && !FenceCallAndRet) {
    // Set up the predicate state by extracting it from the incoming stack
    // pointer so we pick up any misspeculation in our caller.
    PS->InitialReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
  } else {
    // Otherwise, just build the predicate state itself by zeroing a register
    // as we don't need any initial state.
    PS->InitialReg = MRI->createVirtualRegister(PS->RC);
    Register PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
    auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
                         PredStateSubReg);
    ++NumInstsInserted;
    MachineOperand *ZeroEFLAGSDefOp =
        ZeroI->findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
    assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
           "Must have an implicit def of EFLAGS!");
    ZeroEFLAGSDefOp->setIsDead(true);
    BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
            PS->InitialReg)
        .addImm(0)
        .addReg(PredStateSubReg)
        .addImm(X86::sub_32bit);
  }
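
  // At this point the entry block begins with, roughly (a sketch of the
  // non-interprocedural path above; the register allocator picks the actual
  // registers later):
  //   movq $-1, %poison    # all-ones poison value
  //   xorl %state, %state  # all-zeros initial predicate state
  // with the 32-bit zero widened to 64 bits via SUBREG_TO_REG.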

  // We're going to need to trace predicate state throughout the function's
  // CFG. Prepare for this by setting up our initial state of PHIs with unique
  // predecessor entries and all the initial predicate state.
  canonicalizePHIOperands(MF);

  // Track the updated values in an SSA updater to rewrite into SSA form at the
  // end.
  PS->SSA.Initialize(PS->InitialReg);
  PS->SSA.AddAvailableValue(&Entry, PS->InitialReg);

  // Trace through the CFG.
  auto CMovs = tracePredStateThroughCFG(MF, Infos);

  // We may also enter basic blocks in this function via exception handling
  // control flow. Here, if we are hardening interprocedurally, we need to
  // re-capture the predicate state from the throwing code. In the Itanium ABI,
  // the throw will always look like a call to __cxa_throw and will have the
  // predicate state in the stack pointer, so extract fresh predicate state
  // from the stack pointer and make it available in SSA.
  // FIXME: Handle non-Itanium ABI EH models.
  if (HardenInterprocedurally) {
    for (MachineBasicBlock &MBB : MF) {
      assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
      assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
      assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
      if (!MBB.isEHPad())
        continue;
      PS->SSA.AddAvailableValue(
          &MBB,
          extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
    }
  }

  if (HardenIndirectCallsAndJumps) {
    // If we are going to harden calls and jumps we need to unfold their memory
    // operands.
    unfoldCallAndJumpLoads(MF);

    // Then we trace predicate state through the indirect branches.
    auto IndirectBrCMovs = tracePredStateThroughIndirectBranches(MF);
    CMovs.append(IndirectBrCMovs.begin(), IndirectBrCMovs.end());
  }

  // Now that we have the predicate state available at the start of each block
  // in the CFG, trace it through each block, hardening vulnerable instructions
  // as we go.
  tracePredStateThroughBlocksAndHarden(MF);

  // Now rewrite all the uses of the pred state using the SSA updater to insert
  // PHIs connecting the state between blocks along the CFG edges.
  for (MachineInstr *CMovI : CMovs)
    for (MachineOperand &Op : CMovI->operands()) {
      if (!Op.isReg() || Op.getReg() != PS->InitialReg)
        continue;

      PS->SSA.RewriteUse(Op);
    }

  LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
             dbgs() << "\n"; MF.verify(this));
  return true;
}

/// Implements the naive hardening approach of putting an LFENCE after every
/// potentially mis-predicted control flow construct.
///
/// We include this as an alternative mostly for the purpose of comparison. The
/// performance impact of this is expected to be extremely severe and not
/// practical for any real-world users.
void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
    MachineFunction &MF) {
  // First, we scan the function looking for blocks that are reached along
  // edges that we might want to harden.
  SmallSetVector<MachineBasicBlock *, 8> Blocks;
  for (MachineBasicBlock &MBB : MF) {
    // If there are no or only one successor, nothing to do here.
    if (MBB.succ_size() <= 1)
      continue;

    // Skip blocks unless their terminators start with a branch. Other
    // terminators don't seem interesting for guarding against misspeculation.
    auto TermIt = MBB.getFirstTerminator();
    if (TermIt == MBB.end() || !TermIt->isBranch())
      continue;

    // Add all the non-EH-pad successors to the blocks we want to harden. We
    // skip EH pads because there isn't really a condition of interest on
    // entering.
    for (MachineBasicBlock *SuccMBB : MBB.successors())
      if (!SuccMBB->isEHPad())
        Blocks.insert(SuccMBB);
  }

  for (MachineBasicBlock *MBB : Blocks) {
    auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
    BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
    ++NumInstsInserted;
    ++NumLFENCEsInserted;
  }
}

SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
  SmallVector<BlockCondInfo, 16> Infos;

  // Walk the function and build up a summary for each block's conditions that
  // we need to trace through.
  for (MachineBasicBlock &MBB : MF) {
    // If there are no or only one successor, nothing to do here.
    if (MBB.succ_size() <= 1)
      continue;

    // We want to reliably handle any conditional branch terminators in the
    // MBB, so we manually analyze the branch. We can handle all of the
    // permutations here, including ones that analyzeBranch cannot.
    //
    // The approach is to walk backwards across the terminators, resetting at
    // any unconditional non-indirect branch, and track all conditional edges
    // to basic blocks as well as the fallthrough or unconditional successor
    // edge. For each conditional edge, we track the target and the opposite
    // condition code in order to inject a "no-op" cmov into that successor
    // that will harden the predicate. For the fallthrough/unconditional
    // edge, we inject a separate cmov for each conditional branch with
    // matching condition codes. This effectively implements an "and" of the
    // condition flags, even if there isn't a single condition flag that would
    // directly implement that. We don't bother trying to optimize either of
    // these cases because if such an optimization is possible, LLVM should
    // have optimized the conditional *branches* in that way already to reduce
    // instruction count. This late, we simply assume the minimal number of
    // branch instructions is being emitted and use that to guide our cmov
    // insertion.
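    //
    // As a concrete sketch, a block ending in:
    //   testq %rdi, %rdi
    //   jne .LbbTrue     # conditional edge
    //   jmp .LbbFalse    # unconditional edge
    // gets a CMOVE of the poison value into the predicate state at the start
    // of .LbbTrue (poisoning exactly when the flags say the branch should
    // *not* have been taken) and a matching CMOVNE at the start of .LbbFalse.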

    BlockCondInfo Info = {&MBB, {}, nullptr};

    // Now walk backwards through the terminators and build up successors they
    // reach and the conditions.
    for (MachineInstr &MI : llvm::reverse(MBB)) {
      // Once we've handled all the terminators, we're done.
      if (!MI.isTerminator())
        break;

      // If we see a non-branch terminator, we can't handle anything so bail.
      if (!MI.isBranch()) {
        Info.CondBrs.clear();
        break;
      }

      // If we see an unconditional branch, reset our state, clear any
      // fallthrough, and set this as the "else" successor.
      if (MI.getOpcode() == X86::JMP_1) {
        Info.CondBrs.clear();
        Info.UncondBr = &MI;
        continue;
      }

      // If we get an invalid condition, we have an indirect branch or some
      // other unanalyzable "fallthrough" case. We model this as a nullptr for
      // the destination so we can still guard any conditional successors.
      // Consider code sequences like:
      // ```
      //   jCC L1
      //   jmpq *%rax
      // ```
      // We still want to harden the edge to `L1`.
      if (X86::getCondFromBranch(MI) == X86::COND_INVALID) {
        Info.CondBrs.clear();
        Info.UncondBr = &MI;
        continue;
      }

      // We have a vanilla conditional branch, add it to our list.
      Info.CondBrs.push_back(&MI);
    }
    if (Info.CondBrs.empty()) {
      ++NumBranchesUntraced;
      LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
                 MBB.dump());
      continue;
    }

    Infos.push_back(Info);
  }

  return Infos;
}

/// Trace the predicate state through the CFG, instrumenting each conditional
/// branch such that misspeculation through an edge will poison the predicate
/// state.
///
/// Returns the list of inserted CMov instructions so that they can have their
/// uses of the predicate state rewritten into proper SSA form once it is
/// complete.
SmallVector<MachineInstr *, 16>
X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
    MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
  // Collect the inserted cmov instructions so we can rewrite their uses of the
  // predicate state into SSA form.
  SmallVector<MachineInstr *, 16> CMovs;

  // Now walk all of the basic blocks looking for ones that end in conditional
  // jumps where we need to update this register along each edge.
  for (const BlockCondInfo &Info : Infos) {
    MachineBasicBlock &MBB = *Info.MBB;
    const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
    MachineInstr *UncondBr = Info.UncondBr;

    LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
                      << "\n");
    ++NumCondBranchesTraced;

    // Compute the non-conditional successor as either the target of any
    // unconditional branch or the layout successor.
    MachineBasicBlock *UncondSucc =
        UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
                        ? UncondBr->getOperand(0).getMBB()
                        : nullptr)
                 : &*std::next(MachineFunction::iterator(&MBB));

    // Count how many edges there are to any given successor.
    SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
    if (UncondSucc)
      ++SuccCounts[UncondSucc];
    for (auto *CondBr : CondBrs)
      ++SuccCounts[CondBr->getOperand(0).getMBB()];

    // A lambda to insert cmov instructions into a block checking all of the
    // condition codes in a sequence.
    auto BuildCheckingBlockForSuccAndConds =
        [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
            MachineInstr *Br, MachineInstr *&UncondBr,
            ArrayRef<X86::CondCode> Conds) {
          // First, we split the edge to insert the checking block into a safe
          // location.
          auto &CheckingMBB =
              (SuccCount == 1 && Succ.pred_size() == 1)
                  ? Succ
                  : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);

          bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
          if (!LiveEFLAGS)
            CheckingMBB.addLiveIn(X86::EFLAGS);

          // Now insert the cmovs to implement the checks.
          auto InsertPt = CheckingMBB.begin();
          assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
                 "Should never have a PHI in the initial checking block as it "
                 "always has a single predecessor!");

          // We will wire each cmov to each other, but need to start with the
          // incoming pred state.
          Register CurStateReg = PS->InitialReg;

          for (X86::CondCode Cond : Conds) {
            int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
            auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);

            Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
            // Note that we intentionally use an empty debug location so that
            // this picks up the preceding location.
            auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
                                 TII->get(CMovOp), UpdatedStateReg)
                             .addReg(CurStateReg)
                             .addReg(PS->PoisonReg)
                             .addImm(Cond);
            // If this is the last cmov and the EFLAGS weren't originally
            // live-in, mark them as killed.
            if (!LiveEFLAGS && Cond == Conds.back())
              CMovI->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)
                  ->setIsKill(true);

            ++NumInstsInserted;
            LLVM_DEBUG(dbgs() << "  Inserting cmov: "; CMovI->dump();
                       dbgs() << "\n");

            // The first one of the cmovs will be using the top level
            // `PredStateReg` and need to get rewritten into SSA form.
            if (CurStateReg == PS->InitialReg)
              CMovs.push_back(&*CMovI);

            // The next cmov should start from this one's def.
            CurStateReg = UpdatedStateReg;
          }

          // And put the last one into the available values for SSA form of our
          // predicate state.
          PS->SSA.AddAvailableValue(&CheckingMBB, CurStateReg);
        };

    std::vector<X86::CondCode> UncondCodeSeq;
    for (auto *CondBr : CondBrs) {
      MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
      int &SuccCount = SuccCounts[&Succ];

      X86::CondCode Cond = X86::getCondFromBranch(*CondBr);
      X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
      UncondCodeSeq.push_back(Cond);

      BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
                                        {InvCond});

      // Decrement the successor count now that we've split one of the edges.
      // We need to keep the count of edges to the successor accurate in order
      // to know above when to *replace* the successor in the CFG vs. just
      // adding the new successor.
      --SuccCount;
    }

    // Since we may have split edges and changed the number of successors,
    // normalize the probabilities. This avoids doing it each time we split an
    // edge.
    MBB.normalizeSuccProbs();

    // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
    // need to intersect the other condition codes. We can do this by just
    // doing a cmov for each one.
    if (!UncondSucc)
      // If we have no fallthrough to protect (perhaps it is an indirect jump?)
      // just skip this and continue.
      continue;

    assert(SuccCounts[UncondSucc] == 1 &&
           "We should never have more than one edge to the unconditional "
           "successor at this point because every other edge must have been "
           "split above!");

    // Sort and unique the codes to minimize them.
    llvm::sort(UncondCodeSeq);
    UncondCodeSeq.erase(llvm::unique(UncondCodeSeq), UncondCodeSeq.end());

    // Build a checking version of the successor.
    BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
                                      UncondBr, UncondBr, UncondCodeSeq);
  }

  return CMovs;
}

/// Compute the register class for the unfolded load.
///
/// FIXME: This should probably live in X86InstrInfo, potentially by adding
/// a way to unfold into a newly created vreg rather than requiring a register
/// input.
static const TargetRegisterClass *
getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII,
                           unsigned Opcode) {
  unsigned Index;
  unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
      Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index);
  const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
  return TII.getRegClass(MCID, Index, &TII.getRegisterInfo(), MF);
}

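/// Unfold any call or jump that loads its target from memory into an explicit
/// load followed by a register-indirect call or jump so that the loaded
/// target can be hardened like any other loaded value. Illustratively (the
/// actual register is a fresh virtual register), `callq *8(%rax)` becomes:
///   movq 8(%rax), %r11
///   callq *%r11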
void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
    MachineFunction &MF) {
  for (MachineBasicBlock &MBB : MF)
    // We use make_early_inc_range here so we can remove instructions if needed
    // without disturbing the iteration.
    for (MachineInstr &MI : llvm::make_early_inc_range(MBB.instrs())) {
      // Must either be a call or a branch.
      if (!MI.isCall() && !MI.isBranch())
        continue;
      // We only care about loading variants of these instructions.
      if (!MI.mayLoad())
        continue;

      switch (MI.getOpcode()) {
      default: {
        LLVM_DEBUG(
            dbgs() << "ERROR: Found an unexpected loading branch or call "
                      "instruction:\n";
            MI.dump(); dbgs() << "\n");
        report_fatal_error("Unexpected loading branch or call!");
      }

      case X86::FARCALL16m:
      case X86::FARCALL32m:
      case X86::FARCALL64m:
      case X86::FARJMP16m:
      case X86::FARJMP32m:
      case X86::FARJMP64m:
        // We cannot mitigate far jumps or calls, but we also don't expect them
        // to be vulnerable to Spectre v1.2 style attacks.
        continue;

      case X86::CALL16m:
      case X86::CALL16m_NT:
      case X86::CALL32m:
      case X86::CALL32m_NT:
      case X86::CALL64m:
      case X86::CALL64m_NT:
      case X86::JMP16m:
      case X86::JMP16m_NT:
      case X86::JMP32m:
      case X86::JMP32m_NT:
      case X86::JMP64m:
      case X86::JMP64m_NT:
      case X86::TAILJMPm64:
      case X86::TAILJMPm64_REX:
      case X86::TAILJMPm:
      case X86::TCRETURNmi64:
      case X86::TCRETURNmi: {
        // Use the generic unfold logic now that we know we're dealing with
        // expected instructions.
        // FIXME: We don't have test coverage for all of these!
        auto *UnfoldedRC = getRegClassForUnfoldedLoad(MF, *TII, MI.getOpcode());
        if (!UnfoldedRC) {
          LLVM_DEBUG(dbgs()
                         << "ERROR: Unable to unfold load from instruction:\n";
                     MI.dump(); dbgs() << "\n");
          report_fatal_error("Unable to unfold load!");
        }
        Register Reg = MRI->createVirtualRegister(UnfoldedRC);
        SmallVector<MachineInstr *, 2> NewMIs;
        // If we were able to compute an unfolded reg class, any failure here
        // is just a programming error so just assert.
        bool Unfolded =
            TII->unfoldMemoryOperand(MF, MI, Reg, /*UnfoldLoad*/ true,
                                     /*UnfoldStore*/ false, NewMIs);
        (void)Unfolded;
        assert(Unfolded &&
               "Computed unfolded register class but failed to unfold");
        // Now stitch the new instructions into place and erase the old one.
        for (auto *NewMI : NewMIs)
          MBB.insert(MI.getIterator(), NewMI);

        // Update the call info.
        if (MI.isCandidateForAdditionalCallInfo())
          MF.eraseAdditionalCallInfo(&MI);

        MI.eraseFromParent();
        LLVM_DEBUG({
          dbgs() << "Unfolded load successfully into:\n";
          for (auto *NewMI : NewMIs) {
            NewMI->dump();
            dbgs() << "\n";
          }
        });
        continue;
      }
      }
      llvm_unreachable("Escaped switch with default!");
    }
}

/// Trace the predicate state through indirect branches, instrumenting them to
/// poison the state if a target is reached that does not match the expected
/// target.
///
/// This is designed to mitigate Spectre variant 1 attacks where an indirect
/// branch is trained to predict a particular target and then mispredicts that
/// target in a way that can leak data. Despite using an indirect branch, this
/// is really a variant 1 style attack: it does not steer execution to an
/// arbitrary or attacker-controlled address, and it does not require any
/// special code executing next to the victim. This attack can also be
/// mitigated through retpolines, but those require either replacing indirect
/// branches with conditional direct branches or lowering them through a device
/// that blocks speculation. This mitigation can replace these retpoline-style
/// mitigations for jump tables and other indirect branches within a function
/// when variant 2 isn't a risk while allowing limited speculation. Indirect
/// calls, however, cannot be mitigated through this technique without changing
/// the ABI in a fundamental way.
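///
/// Concretely (a sketch for the small, non-PIC code model), every block that
/// may be reached by an indirect branch is instrumented to begin with:
///   cmpq $<this block's address>, %target  # target address flowed in below
///   cmovneq %poison, %state                # poison if we arrived wrongly
/// where %target holds the value consumed by the predecessor's `jmpq *%reg`.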
SmallVector<MachineInstr *, 16>
X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
    MachineFunction &MF) {
  // We use the SSAUpdater to insert PHI nodes for the target addresses of
  // indirect branches. We don't actually need the full power of the SSA
  // updater in this particular case as we always have immediately available
  // values, but this avoids us having to re-implement the PHI construction
  // logic.
  MachineSSAUpdater TargetAddrSSA(MF);
  TargetAddrSSA.Initialize(MRI->createVirtualRegister(&X86::GR64RegClass));

  // Track which blocks were terminated with an indirect branch.
  SmallPtrSet<MachineBasicBlock *, 4> IndirectTerminatedMBBs;

  // We need to know what blocks end up reached via indirect branches. We
  // expect this to be a subset of those whose address is taken and so track it
  // directly via the CFG.
  SmallPtrSet<MachineBasicBlock *, 4> IndirectTargetMBBs;

  // Walk all the blocks which end in an indirect branch and make the
  // target address available.
  for (MachineBasicBlock &MBB : MF) {
    // Find the last terminator.
    auto MII = MBB.instr_rbegin();
    while (MII != MBB.instr_rend() && MII->isDebugInstr())
      ++MII;
    if (MII == MBB.instr_rend())
      continue;
    MachineInstr &TI = *MII;
    if (!TI.isTerminator() || !TI.isBranch())
      // No terminator or non-branch terminator.
      continue;

    Register TargetReg;

    switch (TI.getOpcode()) {
    default:
      // Direct branch or conditional branch (leading to fallthrough).
      continue;

    case X86::FARJMP16m:
    case X86::FARJMP32m:
    case X86::FARJMP64m:
      // We cannot mitigate far jumps or calls, but we also don't expect them
      // to be vulnerable to Spectre v1.2 or v2 (self trained) style attacks.
      continue;

    case X86::JMP16m:
    case X86::JMP16m_NT:
    case X86::JMP32m:
    case X86::JMP32m_NT:
    case X86::JMP64m:
    case X86::JMP64m_NT:
      // Mostly as documentation.
      report_fatal_error("Memory operand jumps should have been unfolded!");

    case X86::JMP16r:
      report_fatal_error(
          "Support for 16-bit indirect branches is not implemented.");
    case X86::JMP32r:
      report_fatal_error(
          "Support for 32-bit indirect branches is not implemented.");

    case X86::JMP64r:
      TargetReg = TI.getOperand(0).getReg();
    }

    // We have definitely found an indirect branch. Verify that there are no
    // preceding conditional branches as we don't yet support that.
    if (llvm::any_of(MBB.terminators(), [&](MachineInstr &OtherTI) {
          return !OtherTI.isDebugInstr() && &OtherTI != &TI;
        })) {
      LLVM_DEBUG({
        dbgs() << "ERROR: Found other terminators in a block with an indirect "
                  "branch! This is not yet supported! Terminator sequence:\n";
        for (MachineInstr &MI : MBB.terminators()) {
          MI.dump();
          dbgs() << '\n';
        }
      });
      report_fatal_error("Unimplemented terminator sequence!");
    }

    // Make the target register an available value for this block.
    TargetAddrSSA.AddAvailableValue(&MBB, TargetReg);
    IndirectTerminatedMBBs.insert(&MBB);

    // Add all the successors to our target candidates.
    IndirectTargetMBBs.insert_range(MBB.successors());
  }

  // Keep track of the cmov instructions we insert so we can return them.
  SmallVector<MachineInstr *, 16> CMovs;

  // If we didn't find any indirect branches with targets, nothing to do here.
  if (IndirectTargetMBBs.empty())
    return CMovs;

  // We found indirect branches and targets that need to be instrumented to
  // harden loads within them. Walk the blocks of the function (to get a stable
  // ordering) and instrument each target of an indirect branch.
  for (MachineBasicBlock &MBB : MF) {
    // Skip the blocks that aren't candidate targets.
    if (!IndirectTargetMBBs.count(&MBB))
      continue;

    // We don't expect EH pads to ever be reached via an indirect branch. If
    // this is desired for some reason, we could simply skip them here rather
    // than asserting.
    assert(!MBB.isEHPad() &&
           "Unexpected EH pad as target of an indirect branch!");

    // We should never end up threading EFLAGS into a block to harden
    // conditional jumps as there would be an additional successor via the
    // indirect branch. As a consequence, all such edges would be split before
    // reaching here, and the inserted block will handle the EFLAGS-based
    // hardening.
    assert(!MBB.isLiveIn(X86::EFLAGS) &&
           "Cannot check within a block that already has live-in EFLAGS!");

    // We can't handle having non-indirect edges into this block unless this is
    // the only successor and we can synthesize the necessary target address.
    for (MachineBasicBlock *Pred : MBB.predecessors()) {
      // If we've already handled this by extracting the target directly,
      // nothing to do.
      if (IndirectTerminatedMBBs.count(Pred))
        continue;

      // Otherwise, we have to be the only successor. We generally expect this
      // to be true as conditional branches should have had a critical edge
      // split already. We don't however need to worry about EH pad successors
      // as they'll happily ignore the target and their hardening strategy is
      // resilient to all ways in which they could be reached speculatively.
      if (!llvm::all_of(Pred->successors(), [&](MachineBasicBlock *Succ) {
            return Succ->isEHPad() || Succ == &MBB;
          })) {
        LLVM_DEBUG({
          dbgs() << "ERROR: Found conditional entry to target of indirect "
                    "branch!\n";
          Pred->dump();
          MBB.dump();
        });
        report_fatal_error("Cannot harden a conditional entry to a target of "
                           "an indirect branch!");
      }

      // Now we need to compute the address of this block and install it as a
      // synthetic target in the predecessor. We do this at the bottom of the
      // predecessor.
      auto InsertPt = Pred->getFirstTerminator();
      Register TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass);
      if (MF.getTarget().getCodeModel() == CodeModel::Small &&
          !Subtarget->isPositionIndependent()) {
        // Directly materialize it into an immediate.
        auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(),
                             TII->get(X86::MOV64ri32), TargetReg)
                         .addMBB(&MBB);
        ++NumInstsInserted;
        (void)AddrI;
        LLVM_DEBUG(dbgs() << "  Inserting mov: "; AddrI->dump();
                   dbgs() << "\n");
      } else {
        auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(), TII->get(X86::LEA64r),
                             TargetReg)
                         .addReg(/*Base*/ X86::RIP)
                         .addImm(/*Scale*/ 1)
                         .addReg(/*Index*/ 0)
                         .addMBB(&MBB)
                         .addReg(/*Segment*/ 0);
        ++NumInstsInserted;
        (void)AddrI;
        LLVM_DEBUG(dbgs() << "  Inserting lea: "; AddrI->dump();
                   dbgs() << "\n");
      }
      // And make this available.
      TargetAddrSSA.AddAvailableValue(Pred, TargetReg);
    }

    // Materialize the needed SSA value of the target. Note that we need the
    // middle of the block as this block might at the bottom have an indirect
    // branch back to itself. We can do this here because at this point, every
    // predecessor of this block has an available value. This is basically just
    // automating the construction of a PHI node for this target.
    Register TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB);

    // Insert a comparison of the incoming target register with this block's
    // address. This also requires us to mark the block as having its address
    // taken explicitly.
    MBB.setMachineBlockAddressTaken();
    auto InsertPt = MBB.SkipPHIsLabelsAndDebug(MBB.begin());
    if (MF.getTarget().getCodeModel() == CodeModel::Small &&
        !Subtarget->isPositionIndependent()) {
      // Check directly against a relocated immediate when we can.
      auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64ri32))
                        .addReg(TargetReg, RegState::Kill)
                        .addMBB(&MBB);
      ++NumInstsInserted;
      (void)CheckI;
      LLVM_DEBUG(dbgs() << "  Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
    } else {
      // Otherwise compute the address into a register first.
      Register AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass);
      auto AddrI =
          BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg)
              .addReg(/*Base*/ X86::RIP)
              .addImm(/*Scale*/ 1)
              .addReg(/*Index*/ 0)
              .addMBB(&MBB)
              .addReg(/*Segment*/ 0);
      ++NumInstsInserted;
      (void)AddrI;
      LLVM_DEBUG(dbgs() << "  Inserting lea: "; AddrI->dump(); dbgs() << "\n");
      auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64rr))
                        .addReg(TargetReg, RegState::Kill)
                        .addReg(AddrReg, RegState::Kill);
      ++NumInstsInserted;
      (void)CheckI;
      LLVM_DEBUG(dbgs() << "  Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
    }

    // Now cmov over the predicate if the comparison wasn't equal.
    int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
    auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
    Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
    auto CMovI =
        BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg)
            .addReg(PS->InitialReg)
            .addReg(PS->PoisonReg)
            .addImm(X86::COND_NE);
    CMovI->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)
        ->setIsKill(true);
    ++NumInstsInserted;
    LLVM_DEBUG(dbgs() << "  Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
    CMovs.push_back(&*CMovI);

    // And put the new value into the available values for SSA form of our
    // predicate state.
    PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
  }

  // Return all the newly inserted cmov instructions of the predicate state.
  return CMovs;
}

// Returns true if the MI has EFLAGS as a register def operand and it's live;
// otherwise returns false.
static bool isEFLAGSDefLive(const MachineInstr &MI) {
  if (const MachineOperand *DefOp =
          MI.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr)) {
    return !DefOp->isDead();
  }
  return false;
}

static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const TargetRegisterInfo &TRI) {
  // Check if EFLAGS are alive by seeing if there is a def of them or they
  // live-in, and then seeing if that def is in turn used.
  for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
    if (MachineOperand *DefOp =
            MI.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr)) {
      // If the def is dead, then EFLAGS is not live.
      if (DefOp->isDead())
        return false;

      // Otherwise we've def'ed it, and it is live.
      return true;
    }
    // While at this instruction, also check if we use and kill EFLAGS,
    // which means it isn't live.
    if (MI.killsRegister(X86::EFLAGS, &TRI))
      return false;
  }

  // If we didn't find anything conclusive (neither definitely alive nor
  // definitely dead), return whether it lives into the block.
  return MBB.isLiveIn(X86::EFLAGS);
}
1234 | |
1235 | /// Trace the predicate state through each of the blocks in the function, |
1236 | /// hardening everything necessary along the way. |
1237 | /// |
1238 | /// We call this routine once the initial predicate state has been established |
1239 | /// for each basic block in the function in the SSA updater. This routine traces |
1240 | /// it through the instructions within each basic block, and for non-returning |
1241 | /// blocks informs the SSA updater about the final state that lives out of the |
1242 | /// block. Along the way, it hardens any vulnerable instruction using the |
1243 | /// currently valid predicate state. We have to do these two things together |
1244 | /// because the SSA updater only works across blocks. Within a block, we track |
1245 | /// the current predicate state directly and update it as it changes. |
1246 | /// |
1247 | /// This operates in two passes over each block. First, we analyze the loads in |
1248 | /// the block to determine which strategy will be used to harden them: hardening |
1249 | /// the address or hardening the loaded value when loaded into a register |
1250 | /// amenable to hardening. We have to process these first because the two |
1251 | /// strategies may interact -- later hardening may change what strategy we wish |
1252 | /// to use. We also will analyze data dependencies between loads and avoid |
1253 | /// hardening those loads that are data dependent on a load with a hardened |
1254 | /// address. We also skip hardening loads already behind an LFENCE as that is |
1255 | /// sufficient to harden them against misspeculation. |
1256 | /// |
1257 | /// Second, we actively trace the predicate state through the block, applying |
1258 | /// the hardening steps we determined necessary in the first pass as we go. |
1259 | /// |
1260 | /// These two passes are applied to each basic block. We operate one block at a |
1261 | /// time to simplify reasoning about reachability and sequencing. |
1262 | void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden( |
1263 | MachineFunction &MF) { |
1264 | SmallPtrSet<MachineInstr *, 16> HardenPostLoad; |
1265 | SmallPtrSet<MachineInstr *, 16> HardenLoadAddr; |
1266 | |
1267 | SmallSet<Register, 16> HardenedAddrRegs; |
1268 | |
1269 | SmallDenseMap<Register, Register, 32> AddrRegToHardenedReg; |
1270 | |
1271 | // Track the set of load-dependent registers through the basic block. Because |
1272 | // the values of these registers have an existing data dependency on a loaded |
1273 | // value which we would have checked, we can omit any checks on them. |
1274 | SparseBitVector<> LoadDepRegs; |
1275 | |
1276 | for (MachineBasicBlock &MBB : MF) { |
1277 | // The first pass over the block: collect all the loads which can have their |
1278 | // loaded value hardened and all the loads that instead need their address |
1279 | // hardened. During this walk we propagate load dependence for address |
1280 | // hardened loads and also look for LFENCE to stop hardening wherever |
    // possible. When deciding whether or not to harden the loaded value,
1282 | // we check to see if any registers used in the address will have been |
1283 | // hardened at this point and if so, harden any remaining address registers |
1284 | // as that often successfully re-uses hardened addresses and minimizes |
1285 | // instructions. |
1286 | // |
    // FIXME: We should consider an aggressive mode where we continue to
    // value-harden as many loads as possible even when some address register
    // hardening would be free (due to reuse).
1290 | // |
1291 | // Note that we only need this pass if we are actually hardening loads. |
1292 | if (HardenLoads) |
1293 | for (MachineInstr &MI : MBB) { |
1294 | // We naively assume that all def'ed registers of an instruction have |
1295 | // a data dependency on all of their operands. |
1296 | // FIXME: Do a more careful analysis of x86 to build a conservative |
1297 | // model here. |
1298 | if (llvm::any_of(Range: MI.uses(), P: [&](MachineOperand &Op) { |
1299 | return Op.isReg() && LoadDepRegs.test(Idx: Op.getReg().id()); |
1300 | })) |
1301 | for (MachineOperand &Def : MI.defs()) |
1302 | if (Def.isReg()) |
1303 | LoadDepRegs.set(Def.getReg().id()); |
1304 | |
        // Both Intel and AMD have indicated that they will change the
        // semantics of LFENCE to be a speculation barrier, so if we see an
        // LFENCE, there is no more need to guard things in this block.
1308 | if (MI.getOpcode() == X86::LFENCE) |
1309 | break; |
1310 | |
1311 | // If this instruction cannot load, nothing to do. |
1312 | if (!MI.mayLoad()) |
1313 | continue; |
1314 | |
1315 | // Some instructions which "load" are trivially safe or unimportant. |
1316 | if (MI.getOpcode() == X86::MFENCE) |
1317 | continue; |
1318 | |
1319 | // Extract the memory operand information about this instruction. |
1320 | const int MemRefBeginIdx = X86::getFirstAddrOperandIdx(MI); |
1321 | if (MemRefBeginIdx < 0) { |
1322 | LLVM_DEBUG(dbgs() |
1323 | << "WARNING: unable to harden loading instruction: " ; |
1324 | MI.dump()); |
1325 | continue; |
1326 | } |
1327 | |
1328 | MachineOperand &BaseMO = |
1329 | MI.getOperand(i: MemRefBeginIdx + X86::AddrBaseReg); |
1330 | MachineOperand &IndexMO = |
1331 | MI.getOperand(i: MemRefBeginIdx + X86::AddrIndexReg); |
1332 | |
1333 | // If we have at least one (non-frame-index, non-RIP) register operand, |
1334 | // and neither operand is load-dependent, we need to check the load. |
1335 | Register BaseReg, IndexReg; |
1336 | if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP && |
1337 | BaseMO.getReg().isValid()) |
1338 | BaseReg = BaseMO.getReg(); |
1339 | if (IndexMO.getReg().isValid()) |
1340 | IndexReg = IndexMO.getReg(); |
1341 | |
1342 | if (!BaseReg && !IndexReg) |
1343 | // No register operands! |
1344 | continue; |
1345 | |
1346 | // If any register operand is dependent, this load is dependent and we |
1347 | // needn't check it. |
1348 | // FIXME: Is this true in the case where we are hardening loads after |
1349 | // they complete? Unclear, need to investigate. |
1350 | if ((BaseReg && LoadDepRegs.test(Idx: BaseReg.id())) || |
1351 | (IndexReg && LoadDepRegs.test(Idx: IndexReg.id()))) |
1352 | continue; |
1353 | |
1354 | // If post-load hardening is enabled, this load is compatible with |
1355 | // post-load hardening, and we aren't already going to harden one of the |
1356 | // address registers, queue it up to be hardened post-load. Notably, |
1357 | // even once hardened this won't introduce a useful dependency that |
1358 | // could prune out subsequent loads. |
1359 | if (EnablePostLoadHardening && X86InstrInfo::isDataInvariantLoad(MI) && |
1360 | !isEFLAGSDefLive(MI) && MI.getDesc().getNumDefs() == 1 && |
1361 | MI.getOperand(i: 0).isReg() && |
1362 | canHardenRegister(Reg: MI.getOperand(i: 0).getReg()) && |
1363 | !HardenedAddrRegs.count(V: BaseReg) && |
1364 | !HardenedAddrRegs.count(V: IndexReg)) { |
1365 | HardenPostLoad.insert(Ptr: &MI); |
1366 | HardenedAddrRegs.insert(V: MI.getOperand(i: 0).getReg()); |
1367 | continue; |
1368 | } |
1369 | |
1370 | // Record this instruction for address hardening and record its register |
1371 | // operands as being address-hardened. |
1372 | HardenLoadAddr.insert(Ptr: &MI); |
1373 | if (BaseReg) |
1374 | HardenedAddrRegs.insert(V: BaseReg); |
1375 | if (IndexReg) |
1376 | HardenedAddrRegs.insert(V: IndexReg); |
1377 | |
1378 | for (MachineOperand &Def : MI.defs()) |
1379 | if (Def.isReg()) |
1380 | LoadDepRegs.set(Def.getReg().id()); |
1381 | } |
1382 | |
1383 | // Now re-walk the instructions in the basic block, and apply whichever |
1384 | // hardening strategy we have elected. Note that we do this in a second |
1385 | // pass specifically so that we have the complete set of instructions for |
1386 | // which we will do post-load hardening and can defer it in certain |
1387 | // circumstances. |
1388 | for (MachineInstr &MI : MBB) { |
1389 | if (HardenLoads) { |
        // We cannot require hardening both the def of a load and its address.
1391 | assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) && |
1392 | "Requested to harden both the address and def of a load!" ); |
1393 | |
1394 | // Check if this is a load whose address needs to be hardened. |
1395 | if (HardenLoadAddr.erase(Ptr: &MI)) { |
1396 | const int MemRefBeginIdx = X86::getFirstAddrOperandIdx(MI); |
1397 | assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!" ); |
1398 | |
1399 | MachineOperand &BaseMO = |
1400 | MI.getOperand(i: MemRefBeginIdx + X86::AddrBaseReg); |
1401 | MachineOperand &IndexMO = |
1402 | MI.getOperand(i: MemRefBeginIdx + X86::AddrIndexReg); |
1403 | hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg); |
1404 | continue; |
1405 | } |
1406 | |
        // Test if this instruction is one of our post-load instructions (and
        // remove it from the set if so).
1409 | if (HardenPostLoad.erase(Ptr: &MI)) { |
1410 | assert(!MI.isCall() && "Must not try to post-load harden a call!" ); |
1411 | |
1412 | // If this is a data-invariant load and there is no EFLAGS |
1413 | // interference, we want to try and sink any hardening as far as |
1414 | // possible. |
1415 | if (X86InstrInfo::isDataInvariantLoad(MI) && !isEFLAGSDefLive(MI)) { |
1416 | // Sink the instruction we'll need to harden as far as we can down |
1417 | // the graph. |
1418 | MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenedInstrs&: HardenPostLoad); |
1419 | |
1420 | // If we managed to sink this instruction, update everything so we |
1421 | // harden that instruction when we reach it in the instruction |
1422 | // sequence. |
1423 | if (SunkMI != &MI) { |
1424 | // If in sinking there was no instruction needing to be hardened, |
1425 | // we're done. |
1426 | if (!SunkMI) |
1427 | continue; |
1428 | |
1429 | // Otherwise, add this to the set of defs we harden. |
1430 | HardenPostLoad.insert(Ptr: SunkMI); |
1431 | continue; |
1432 | } |
1433 | } |
1434 | |
1435 | Register HardenedReg = hardenPostLoad(MI); |
1436 | |
1437 | // Mark the resulting hardened register as such so we don't re-harden. |
1438 | AddrRegToHardenedReg[HardenedReg] = HardenedReg; |
1439 | |
1440 | continue; |
1441 | } |
1442 | |
        // Check for an indirect call or branch that may need its input
        // hardened even if we couldn't find the specific load used, or were
        // able to avoid hardening it for some reason. Note that we cannot
        // simply skip the rest of the loop body afterward, as we may still
        // need to handle the call aspect of this instruction.
1448 | if ((MI.isCall() || MI.isBranch()) && HardenIndirectCallsAndJumps) |
1449 | hardenIndirectCallOrJumpInstr(MI, AddrRegToHardenedReg); |
1450 | } |
1451 | |
1452 | // After we finish hardening loads we handle interprocedural hardening if |
1453 | // enabled and relevant for this instruction. |
1454 | if (!HardenInterprocedurally) |
1455 | continue; |
1456 | if (!MI.isCall() && !MI.isReturn()) |
1457 | continue; |
1458 | |
      // If this is a direct return (i.e., not a tail call), just directly
      // harden it.
1461 | if (MI.isReturn() && !MI.isCall()) { |
1462 | hardenReturnInstr(MI); |
1463 | continue; |
1464 | } |
1465 | |
1466 | // Otherwise we have a call. We need to handle transferring the predicate |
1467 | // state into a call and recovering it after the call returns (unless this |
1468 | // is a tail call). |
1469 | assert(MI.isCall() && "Should only reach here for calls!" ); |
1470 | tracePredStateThroughCall(MI); |
1471 | } |
1472 | |
1473 | HardenPostLoad.clear(); |
1474 | HardenLoadAddr.clear(); |
1475 | HardenedAddrRegs.clear(); |
1476 | AddrRegToHardenedReg.clear(); |
1477 | |
1478 | // Currently, we only track data-dependent loads within a basic block. |
1479 | // FIXME: We should see if this is necessary or if we could be more |
1480 | // aggressive here without opening up attack avenues. |
1481 | LoadDepRegs.clear(); |
1482 | } |
1483 | } |
1484 | |
1485 | /// Save EFLAGS into the returned GPR. This can in turn be restored with |
1486 | /// `restoreEFLAGS`. |
1487 | /// |
1488 | /// Note that LLVM can only lower very simple patterns of saved and restored |
1489 | /// EFLAGS registers. The restore should always be within the same basic block |
1490 | /// as the save so that no PHI nodes are inserted. |
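///
/// The save itself is just a pseudo copy that later lowering expands into
/// flag-preserving code, roughly (in MIR, register name illustrative):
///
///   %saved:gr32 = COPY $eflags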
1491 | Register X86SpeculativeLoadHardeningPass::saveEFLAGS( |
1492 | MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, |
1493 | const DebugLoc &Loc) { |
1494 | // FIXME: Hard coding this to a 32-bit register class seems weird, but matches |
1495 | // what instruction selection does. |
1496 | Register Reg = MRI->createVirtualRegister(RegClass: &X86::GR32RegClass); |
1497 | // We directly copy the FLAGS register and rely on later lowering to clean |
1498 | // this up into the appropriate setCC instructions. |
1499 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::COPY), DestReg: Reg).addReg(RegNo: X86::EFLAGS); |
1500 | ++NumInstsInserted; |
1501 | return Reg; |
1502 | } |
1503 | |
1504 | /// Restore EFLAGS from the provided GPR. This should be produced by |
1505 | /// `saveEFLAGS`. |
1506 | /// |
1507 | /// This must be done within the same basic block as the save in order to |
1508 | /// reliably lower. |
1509 | void X86SpeculativeLoadHardeningPass::restoreEFLAGS( |
1510 | MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, |
1511 | const DebugLoc &Loc, Register Reg) { |
1512 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::COPY), DestReg: X86::EFLAGS).addReg(RegNo: Reg); |
1513 | ++NumInstsInserted; |
1514 | } |
1515 | |
1516 | /// Takes the current predicate state (in a register) and merges it into the |
1517 | /// stack pointer. The state is essentially a single bit, but we merge this in |
1518 | /// a way that won't form non-canonical pointers and also will be preserved |
1519 | /// across normal stack adjustments. |
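///
/// The emitted sequence is roughly (register name illustrative):
///
///   shlq $47, %state   # shift the zero/all-ones state into the high 17 bits
///   orq  %state, %rsp  # merge it into RSP without de-canonicalizing pointers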
1520 | void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( |
1521 | MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, |
1522 | const DebugLoc &Loc, Register PredStateReg) { |
1523 | Register TmpReg = MRI->createVirtualRegister(RegClass: PS->RC); |
1524 | // FIXME: This hard codes a shift distance based on the number of bits needed |
1525 | // to stay canonical on 64-bit. We should compute this somehow and support |
1526 | // 32-bit as part of that. |
1527 | auto ShiftI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::SHL64ri), DestReg: TmpReg) |
1528 | .addReg(RegNo: PredStateReg, flags: RegState::Kill) |
1529 | .addImm(Val: 47); |
1530 | ShiftI->addRegisterDead(Reg: X86::EFLAGS, RegInfo: TRI); |
1531 | ++NumInstsInserted; |
1532 | auto OrI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::OR64rr), DestReg: X86::RSP) |
1533 | .addReg(RegNo: X86::RSP) |
1534 | .addReg(RegNo: TmpReg, flags: RegState::Kill); |
1535 | OrI->addRegisterDead(Reg: X86::EFLAGS, RegInfo: TRI); |
1536 | ++NumInstsInserted; |
1537 | } |
1538 | |
1539 | /// Extracts the predicate state stored in the high bits of the stack pointer. |
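///
/// This is roughly the inverse of `mergePredStateIntoSP` (register name
/// illustrative):
///
///   movq %rsp, %state
///   sarq $63, %state   # smear the high (poison) bit across all 64 bits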
Register X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
1541 | MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, |
1542 | const DebugLoc &Loc) { |
1543 | Register PredStateReg = MRI->createVirtualRegister(RegClass: PS->RC); |
1544 | Register TmpReg = MRI->createVirtualRegister(RegClass: PS->RC); |
1545 | |
1546 | // We know that the stack pointer will have any preserved predicate state in |
1547 | // its high bit. We just want to smear this across the other bits. Turns out, |
1548 | // this is exactly what an arithmetic right shift does. |
1549 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: TmpReg) |
1550 | .addReg(RegNo: X86::RSP); |
1551 | auto ShiftI = |
1552 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::SAR64ri), DestReg: PredStateReg) |
1553 | .addReg(RegNo: TmpReg, flags: RegState::Kill) |
1554 | .addImm(Val: TRI->getRegSizeInBits(RC: *PS->RC) - 1); |
1555 | ShiftI->addRegisterDead(Reg: X86::EFLAGS, RegInfo: TRI); |
1556 | ++NumInstsInserted; |
1557 | |
1558 | return PredStateReg; |
1559 | } |
1560 | |
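/// Harden a load by hardening the registers feeding its address.
///
/// In the common case of a 64-bit GPR operand with EFLAGS dead, this merges
/// the predicate state into the register with an `or`, roughly (register
/// names illustrative):
///
///   orq %state, %base     # %base becomes all-ones under misspeculation
///   movq (%base), %dest   # the address no longer leaks attacker-chosen bits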
1561 | void X86SpeculativeLoadHardeningPass::hardenLoadAddr( |
1562 | MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO, |
1563 | SmallDenseMap<Register, Register, 32> &AddrRegToHardenedReg) { |
1564 | MachineBasicBlock &MBB = *MI.getParent(); |
1565 | const DebugLoc &Loc = MI.getDebugLoc(); |
1566 | |
  // Check whether EFLAGS is live at this instruction, as that constrains the
  // hardening instructions we can use below.
1569 | bool EFLAGSLive = isEFLAGSLive(MBB, I: MI.getIterator(), TRI: *TRI); |
1570 | |
1571 | SmallVector<MachineOperand *, 2> HardenOpRegs; |
1572 | |
1573 | if (BaseMO.isFI()) { |
    // A frame index is never a dynamically controllable load, so we skip
    // hardening it; it would only matter if we also covered fixed-address
    // loads.
1576 | LLVM_DEBUG( |
1577 | dbgs() << " Skipping hardening base of explicit stack frame load: " ; |
1578 | MI.dump(); dbgs() << "\n" ); |
1579 | } else if (BaseMO.getReg() == X86::RSP) { |
    // Some idempotent atomic operations are lowered directly to a locked
    // OR with 0 to the top of the stack (or slightly offset from the top),
    // which uses an explicit RSP register as the base.
1583 | assert(IndexMO.getReg() == X86::NoRegister && |
1584 | "Explicit RSP access with dynamic index!" ); |
1585 | LLVM_DEBUG( |
1586 | dbgs() << " Cannot harden base of explicit RSP offset in a load!" ); |
1587 | } else if (BaseMO.getReg() == X86::RIP || |
1588 | BaseMO.getReg() == X86::NoRegister) { |
1589 | // For both RIP-relative addressed loads or absolute loads, we cannot |
1590 | // meaningfully harden them because the address being loaded has no |
1591 | // dynamic component. |
1592 | // |
1593 | // FIXME: When using a segment base (like TLS does) we end up with the |
1594 | // dynamic address being the base plus -1 because we can't mutate the |
1595 | // segment register here. This allows the signed 32-bit offset to point at |
1596 | // valid segment-relative addresses and load them successfully. |
1597 | LLVM_DEBUG( |
1598 | dbgs() << " Cannot harden base of " |
1599 | << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base" ) |
1600 | << " address in a load!" ); |
1601 | } else { |
1602 | assert(BaseMO.isReg() && |
1603 | "Only allowed to have a frame index or register base." ); |
1604 | HardenOpRegs.push_back(Elt: &BaseMO); |
1605 | } |
1606 | |
1607 | if (IndexMO.getReg() != X86::NoRegister && |
1608 | (HardenOpRegs.empty() || |
1609 | HardenOpRegs.front()->getReg() != IndexMO.getReg())) |
1610 | HardenOpRegs.push_back(Elt: &IndexMO); |
1611 | |
1612 | assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) && |
1613 | "Should have exactly one or two registers to harden!" ); |
1614 | assert((HardenOpRegs.size() == 1 || |
1615 | HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) && |
1616 | "Should not have two of the same registers!" ); |
1617 | |
  // Remove any registers that have already been checked.
1619 | llvm::erase_if(C&: HardenOpRegs, P: [&](MachineOperand *Op) { |
1620 | // See if this operand's register has already been checked. |
1621 | auto It = AddrRegToHardenedReg.find(Val: Op->getReg()); |
1622 | if (It == AddrRegToHardenedReg.end()) |
1623 | // Not checked, so retain this one. |
1624 | return false; |
1625 | |
1626 | // Otherwise, we can directly update this operand and remove it. |
1627 | Op->setReg(It->second); |
1628 | return true; |
1629 | }); |
1630 | // If there are none left, we're done. |
1631 | if (HardenOpRegs.empty()) |
1632 | return; |
1633 | |
1634 | // Compute the current predicate state. |
1635 | Register StateReg = PS->SSA.GetValueAtEndOfBlock(BB: &MBB); |
1636 | |
1637 | auto InsertPt = MI.getIterator(); |
1638 | |
  // If EFLAGS is live and we don't have access to instructions that avoid
  // clobbering EFLAGS, we need to save and restore them. This in turn makes
1641 | // the EFLAGS no longer live. |
1642 | Register FlagsReg; |
1643 | if (EFLAGSLive && !Subtarget->hasBMI2()) { |
1644 | EFLAGSLive = false; |
1645 | FlagsReg = saveEFLAGS(MBB, InsertPt, Loc); |
1646 | } |
1647 | |
1648 | for (MachineOperand *Op : HardenOpRegs) { |
1649 | Register OpReg = Op->getReg(); |
1650 | auto *OpRC = MRI->getRegClass(Reg: OpReg); |
1651 | Register TmpReg = MRI->createVirtualRegister(RegClass: OpRC); |
1652 | |
1653 | // If this is a vector register, we'll need somewhat custom logic to handle |
1654 | // hardening it. |
1655 | if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(RC: &X86::VR128RegClass) || |
1656 | OpRC->hasSuperClassEq(RC: &X86::VR256RegClass))) { |
1657 | assert(Subtarget->hasAVX2() && "AVX2-specific register classes!" ); |
1658 | bool Is128Bit = OpRC->hasSuperClassEq(RC: &X86::VR128RegClass); |
1659 | |
1660 | // Move our state into a vector register. |
      // FIXME: We could skip this at the cost of longer encodings with AVX-512
      // but that doesn't seem likely to be worth it.
1663 | Register VStateReg = MRI->createVirtualRegister(RegClass: &X86::VR128RegClass); |
1664 | auto MovI = |
1665 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::VMOV64toPQIrr), DestReg: VStateReg) |
1666 | .addReg(RegNo: StateReg); |
1667 | (void)MovI; |
1668 | ++NumInstsInserted; |
1669 | LLVM_DEBUG(dbgs() << " Inserting mov: " ; MovI->dump(); dbgs() << "\n" ); |
1670 | |
1671 | // Broadcast it across the vector register. |
1672 | Register VBStateReg = MRI->createVirtualRegister(RegClass: OpRC); |
1673 | auto BroadcastI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, |
1674 | MCID: TII->get(Opcode: Is128Bit ? X86::VPBROADCASTQrr |
1675 | : X86::VPBROADCASTQYrr), |
1676 | DestReg: VBStateReg) |
1677 | .addReg(RegNo: VStateReg); |
1678 | (void)BroadcastI; |
1679 | ++NumInstsInserted; |
1680 | LLVM_DEBUG(dbgs() << " Inserting broadcast: " ; BroadcastI->dump(); |
1681 | dbgs() << "\n" ); |
1682 | |
1683 | // Merge our potential poison state into the value with a vector or. |
1684 | auto OrI = |
1685 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, |
1686 | MCID: TII->get(Opcode: Is128Bit ? X86::VPORrr : X86::VPORYrr), DestReg: TmpReg) |
1687 | .addReg(RegNo: VBStateReg) |
1688 | .addReg(RegNo: OpReg); |
1689 | (void)OrI; |
1690 | ++NumInstsInserted; |
1691 | LLVM_DEBUG(dbgs() << " Inserting or: " ; OrI->dump(); dbgs() << "\n" ); |
1692 | } else if (OpRC->hasSuperClassEq(RC: &X86::VR128XRegClass) || |
1693 | OpRC->hasSuperClassEq(RC: &X86::VR256XRegClass) || |
1694 | OpRC->hasSuperClassEq(RC: &X86::VR512RegClass)) { |
1695 | assert(Subtarget->hasAVX512() && "AVX512-specific register classes!" ); |
1696 | bool Is128Bit = OpRC->hasSuperClassEq(RC: &X86::VR128XRegClass); |
1697 | bool Is256Bit = OpRC->hasSuperClassEq(RC: &X86::VR256XRegClass); |
1698 | if (Is128Bit || Is256Bit) |
1699 | assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!" ); |
1700 | |
1701 | // Broadcast our state into a vector register. |
1702 | Register VStateReg = MRI->createVirtualRegister(RegClass: OpRC); |
1703 | unsigned BroadcastOp = Is128Bit ? X86::VPBROADCASTQrZ128rr |
1704 | : Is256Bit ? X86::VPBROADCASTQrZ256rr |
1705 | : X86::VPBROADCASTQrZrr; |
1706 | auto BroadcastI = |
1707 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: BroadcastOp), DestReg: VStateReg) |
1708 | .addReg(RegNo: StateReg); |
1709 | (void)BroadcastI; |
1710 | ++NumInstsInserted; |
1711 | LLVM_DEBUG(dbgs() << " Inserting broadcast: " ; BroadcastI->dump(); |
1712 | dbgs() << "\n" ); |
1713 | |
1714 | // Merge our potential poison state into the value with a vector or. |
1715 | unsigned OrOp = Is128Bit ? X86::VPORQZ128rr |
1716 | : Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr; |
1717 | auto OrI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: OrOp), DestReg: TmpReg) |
1718 | .addReg(RegNo: VStateReg) |
1719 | .addReg(RegNo: OpReg); |
1720 | (void)OrI; |
1721 | ++NumInstsInserted; |
1722 | LLVM_DEBUG(dbgs() << " Inserting or: " ; OrI->dump(); dbgs() << "\n" ); |
1723 | } else { |
1724 | // FIXME: Need to support GR32 here for 32-bit code. |
1725 | assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) && |
1726 | "Not a supported register class for address hardening!" ); |
1727 | |
1728 | if (!EFLAGSLive) { |
1729 | // Merge our potential poison state into the value with an or. |
1730 | auto OrI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::OR64rr), DestReg: TmpReg) |
1731 | .addReg(RegNo: StateReg) |
1732 | .addReg(RegNo: OpReg); |
1733 | OrI->addRegisterDead(Reg: X86::EFLAGS, RegInfo: TRI); |
1734 | ++NumInstsInserted; |
1735 | LLVM_DEBUG(dbgs() << " Inserting or: " ; OrI->dump(); dbgs() << "\n" ); |
1736 | } else { |
        // We need to avoid touching EFLAGS, so shift out all but the least
        // significant bit using SHRX, which doesn't update flags: the shift
        // count is zero when not misspeculating and 63 when the state is
        // all-ones.
1739 | auto ShiftI = |
1740 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::SHRX64rr), DestReg: TmpReg) |
1741 | .addReg(RegNo: OpReg) |
1742 | .addReg(RegNo: StateReg); |
1743 | (void)ShiftI; |
1744 | ++NumInstsInserted; |
1745 | LLVM_DEBUG(dbgs() << " Inserting shrx: " ; ShiftI->dump(); |
1746 | dbgs() << "\n" ); |
1747 | } |
1748 | } |
1749 | |
1750 | // Record this register as checked and update the operand. |
1751 | assert(!AddrRegToHardenedReg.count(Op->getReg()) && |
1752 | "Should not have checked this register yet!" ); |
1753 | AddrRegToHardenedReg[Op->getReg()] = TmpReg; |
1754 | Op->setReg(TmpReg); |
1755 | ++NumAddrRegsHardened; |
1756 | } |
1757 | |
1758 | // And restore the flags if needed. |
1759 | if (FlagsReg) |
1760 | restoreEFLAGS(MBB, InsertPt, Loc, Reg: FlagsReg); |
1761 | } |
1762 | |
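/// Sink the hardening of a loaded value to the single data-invariant
/// instruction that uses it, so that we harden the final value once rather
/// than every intermediate step. For example (illustrative):
///
///   %v = MOV64rm ...        # the load
///   %w = ADD64rr %v, ...    # sole use, data invariant
///
/// Here we can harden %w instead of %v.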
1763 | MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( |
1764 | MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) { |
1765 | assert(X86InstrInfo::isDataInvariantLoad(InitialMI) && |
1766 | "Cannot get here with a non-invariant load!" ); |
1767 | assert(!isEFLAGSDefLive(InitialMI) && |
1768 | "Cannot get here with a data invariant load " |
1769 | "that interferes with EFLAGS!" ); |
1770 | |
1771 | // See if we can sink hardening the loaded value. |
1772 | auto SinkCheckToSingleUse = |
1773 | [&](MachineInstr &MI) -> std::optional<MachineInstr *> { |
1774 | Register DefReg = MI.getOperand(i: 0).getReg(); |
1775 | |
    // We need to find a single use to which we can sink the check. We can
    // primarily do this because many uses may already end up checked on their
    // own.
1779 | MachineInstr *SingleUseMI = nullptr; |
1780 | for (MachineInstr &UseMI : MRI->use_instructions(Reg: DefReg)) { |
      // If we're already going to harden this use, it is data invariant, does
      // not interfere with EFLAGS, and is within our block.
1783 | if (HardenedInstrs.count(Ptr: &UseMI)) { |
1784 | if (!X86InstrInfo::isDataInvariantLoad(MI&: UseMI) || isEFLAGSDefLive(MI: UseMI)) { |
1785 | // If we've already decided to harden a non-load, we must have sunk |
1786 | // some other post-load hardened instruction to it and it must itself |
1787 | // be data-invariant. |
1788 | assert(X86InstrInfo::isDataInvariant(UseMI) && |
1789 | "Data variant instruction being hardened!" ); |
1790 | continue; |
1791 | } |
1792 | |
1793 | // Otherwise, this is a load and the load component can't be data |
1794 | // invariant so check how this register is being used. |
1795 | const int MemRefBeginIdx = X86::getFirstAddrOperandIdx(MI: UseMI); |
1796 | assert(MemRefBeginIdx >= 0 && |
1797 | "Should always have mem references here!" ); |
1798 | |
1799 | MachineOperand &BaseMO = |
1800 | UseMI.getOperand(i: MemRefBeginIdx + X86::AddrBaseReg); |
1801 | MachineOperand &IndexMO = |
1802 | UseMI.getOperand(i: MemRefBeginIdx + X86::AddrIndexReg); |
1803 | if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) || |
1804 | (IndexMO.isReg() && IndexMO.getReg() == DefReg)) |
        // The load uses the register as part of its address, making it not
        // invariant.
1807 | return {}; |
1808 | |
1809 | continue; |
1810 | } |
1811 | |
1812 | if (SingleUseMI) |
1813 | // We already have a single use, this would make two. Bail. |
1814 | return {}; |
1815 | |
1816 | // If this single use isn't data invariant, isn't in this block, or has |
1817 | // interfering EFLAGS, we can't sink the hardening to it. |
    if (!X86InstrInfo::isDataInvariant(MI&: UseMI) ||
        UseMI.getParent() != MI.getParent() || isEFLAGSDefLive(MI: UseMI))
1820 | return {}; |
1821 | |
1822 | // If this instruction defines multiple registers bail as we won't harden |
1823 | // all of them. |
1824 | if (UseMI.getDesc().getNumDefs() > 1) |
1825 | return {}; |
1826 | |
    // If this register isn't a virtual register, we can't sanely walk its
    // uses, so just bail. Also check that its register class is one of the
    // ones we can harden.
1830 | Register UseDefReg = UseMI.getOperand(i: 0).getReg(); |
1831 | if (!canHardenRegister(Reg: UseDefReg)) |
1832 | return {}; |
1833 | |
1834 | SingleUseMI = &UseMI; |
1835 | } |
1836 | |
1837 | // If SingleUseMI is still null, there is no use that needs its own |
1838 | // checking. Otherwise, it is the single use that needs checking. |
1839 | return {SingleUseMI}; |
1840 | }; |
1841 | |
1842 | MachineInstr *MI = &InitialMI; |
1843 | while (std::optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) { |
1844 | // Update which MI we're checking now. |
1845 | MI = *SingleUse; |
1846 | if (!MI) |
1847 | break; |
1848 | } |
1849 | |
1850 | return MI; |
1851 | } |
1852 | |
1853 | bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) { |
1854 | // We only support hardening virtual registers. |
1855 | if (!Reg.isVirtual()) |
1856 | return false; |
1857 | |
1858 | auto *RC = MRI->getRegClass(Reg); |
1859 | int RegBytes = TRI->getRegSizeInBits(RC: *RC) / 8; |
1860 | if (RegBytes > 8) |
1861 | // We don't support post-load hardening of vectors. |
1862 | return false; |
1863 | |
1864 | unsigned RegIdx = Log2_32(Value: RegBytes); |
1865 | assert(RegIdx < 4 && "Unsupported register size" ); |
1866 | |
1867 | // If this register class is explicitly constrained to a class that doesn't |
1868 | // require REX prefix, we may not be able to satisfy that constraint when |
1869 | // emitting the hardening instructions, so bail out here. |
1870 | // FIXME: This seems like a pretty lame hack. The way this comes up is when we |
1871 | // end up both with a NOREX and REX-only register as operands to the hardening |
1872 | // instructions. It would be better to fix that code to handle this situation |
1873 | // rather than hack around it in this way. |
1874 | const TargetRegisterClass *NOREXRegClasses[] = { |
1875 | &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass, |
1876 | &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass}; |
1877 | if (RC == NOREXRegClasses[RegIdx]) |
1878 | return false; |
1879 | |
1880 | const TargetRegisterClass *GPRRegClasses[] = { |
1881 | &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass, |
1882 | &X86::GR64RegClass}; |
1883 | return RC->hasSuperClassEq(RC: GPRRegClasses[RegIdx]); |
1884 | } |
1885 | |
1886 | /// Harden a value in a register. |
1887 | /// |
1888 | /// This is the low-level logic to fully harden a value sitting in a register |
1889 | /// against leaking during speculative execution. |
1890 | /// |
1891 | /// Unlike hardening an address that is used by a load, this routine is required |
1892 | /// to hide *all* incoming bits in the register. |
1893 | /// |
1894 | /// `Reg` must be a virtual register. Currently, it is required to be a GPR no |
1895 | /// larger than the predicate state register. FIXME: We should support vector |
1896 | /// registers here by broadcasting the predicate state. |
1897 | /// |
1898 | /// The new, hardened virtual register is returned. It will have the same |
1899 | /// register class as `Reg`. |
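///
/// For a 64-bit GPR with EFLAGS dead this is a single instruction, roughly
/// (register names illustrative):
///
///   orq %state, %val   # forces %val to all-ones when misspeculating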
1900 | Register X86SpeculativeLoadHardeningPass::hardenValueInRegister( |
1901 | Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, |
1902 | const DebugLoc &Loc) { |
1903 | assert(canHardenRegister(Reg) && "Cannot harden this register!" ); |
1904 | |
1905 | auto *RC = MRI->getRegClass(Reg); |
1906 | int Bytes = TRI->getRegSizeInBits(RC: *RC) / 8; |
1907 | Register StateReg = PS->SSA.GetValueAtEndOfBlock(BB: &MBB); |
1908 | assert((Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8) && |
1909 | "Unknown register size" ); |
1910 | |
1911 | // FIXME: Need to teach this about 32-bit mode. |
1912 | if (Bytes != 8) { |
1913 | unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit}; |
1914 | unsigned SubRegImm = SubRegImms[Log2_32(Value: Bytes)]; |
1915 | Register NarrowStateReg = MRI->createVirtualRegister(RegClass: RC); |
1916 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NarrowStateReg) |
1917 | .addReg(RegNo: StateReg, flags: 0, SubReg: SubRegImm); |
1918 | StateReg = NarrowStateReg; |
1919 | } |
1920 | |
1921 | Register FlagsReg; |
1922 | if (isEFLAGSLive(MBB, I: InsertPt, TRI: *TRI)) |
1923 | FlagsReg = saveEFLAGS(MBB, InsertPt, Loc); |
1924 | |
1925 | Register NewReg = MRI->createVirtualRegister(RegClass: RC); |
1926 | unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr}; |
1927 | unsigned OrOpCode = OrOpCodes[Log2_32(Value: Bytes)]; |
1928 | auto OrI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: OrOpCode), DestReg: NewReg) |
1929 | .addReg(RegNo: StateReg) |
1930 | .addReg(RegNo: Reg); |
1931 | OrI->addRegisterDead(Reg: X86::EFLAGS, RegInfo: TRI); |
1932 | ++NumInstsInserted; |
1933 | LLVM_DEBUG(dbgs() << " Inserting or: " ; OrI->dump(); dbgs() << "\n" ); |
1934 | |
1935 | if (FlagsReg) |
1936 | restoreEFLAGS(MBB, InsertPt, Loc, Reg: FlagsReg); |
1937 | |
1938 | return NewReg; |
1939 | } |
1940 | |
1941 | /// Harden a load by hardening the loaded value in the defined register. |
1942 | /// |
1943 | /// We can harden a non-leaking load into a register without touching the |
1944 | /// address by just hiding all of the loaded bits during misspeculation. We use |
/// an `or` instruction to do this because we set up our poison value as all
/// ones: the goal is just to keep the loaded bits from being exposed to
/// speculative execution, and coercing them all to one is sufficient.
1948 | /// |
1949 | /// Returns the newly hardened register. |
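///
/// Schematically, with virtual register names illustrative:
///
///   %val = MOV64rm ...                  # original load
///
/// becomes
///
///   %unhardened = MOV64rm ...
///   %val = OR64rr %state, %unhardened   # all uses now see the hardened value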
1950 | Register X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) { |
1951 | MachineBasicBlock &MBB = *MI.getParent(); |
1952 | const DebugLoc &Loc = MI.getDebugLoc(); |
1953 | |
1954 | auto &DefOp = MI.getOperand(i: 0); |
1955 | Register OldDefReg = DefOp.getReg(); |
1956 | auto *DefRC = MRI->getRegClass(Reg: OldDefReg); |
1957 | |
1958 | // Because we want to completely replace the uses of this def'ed value with |
1959 | // the hardened value, create a dedicated new register that will only be used |
1960 | // to communicate the unhardened value to the hardening. |
1961 | Register UnhardenedReg = MRI->createVirtualRegister(RegClass: DefRC); |
1962 | DefOp.setReg(UnhardenedReg); |
1963 | |
1964 | // Now harden this register's value, getting a hardened reg that is safe to |
1965 | // use. Note that we insert the instructions to compute this *after* the |
1966 | // defining instruction, not before it. |
1967 | Register HardenedReg = hardenValueInRegister( |
1968 | Reg: UnhardenedReg, MBB, InsertPt: std::next(x: MI.getIterator()), Loc); |
1969 | |
1970 | // Finally, replace the old register (which now only has the uses of the |
1971 | // original def) with the hardened register. |
1972 | MRI->replaceRegWith(/*FromReg*/ OldDefReg, /*ToReg*/ HardenedReg); |
1973 | |
1974 | ++NumPostLoadRegsHardened; |
1975 | return HardenedReg; |
1976 | } |
1977 | |
1978 | /// Harden a return instruction. |
1979 | /// |
/// Returns implicitly perform a load which we need to harden. Without
/// hardening this load, an attacker may speculatively write over the return
/// address to steer speculation of the return to an attacker-controlled
/// address. This is
1983 | /// called Spectre v1.1 or Bounds Check Bypass Store (BCBS) and is described in |
1984 | /// this paper: |
1985 | /// https://people.csail.mit.edu/vlk/spectre11.pdf |
1986 | /// |
1987 | /// We can harden this by introducing an LFENCE that will delay any load of the |
1988 | /// return address until prior instructions have retired (and thus are not being |
1989 | /// speculated), or we can harden the address used by the implicit load: the |
1990 | /// stack pointer. |
1991 | /// |
1992 | /// If we are not using an LFENCE, hardening the stack pointer has an additional |
1993 | /// benefit: it allows us to pass the predicate state accumulated in this |
1994 | /// function back to the caller. In the absence of a BCBS attack on the return, |
1995 | /// the caller will typically be resumed and speculatively executed due to the |
/// Return Stack Buffer (RSB) prediction, which is very accurate and has a high
1997 | /// priority. It is possible that some code from the caller will be executed |
1998 | /// speculatively even during a BCBS-attacked return until the steering takes |
1999 | /// effect. Whenever this happens, the caller can recover the (poisoned) |
2000 | /// predicate state from the stack pointer and continue to harden loads. |
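///
/// Concretely, a hardened return ends up looking roughly like:
///
///   shlq $47, %state
///   orq  %state, %rsp
///   retq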
2001 | void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) { |
2002 | MachineBasicBlock &MBB = *MI.getParent(); |
2003 | const DebugLoc &Loc = MI.getDebugLoc(); |
2004 | auto InsertPt = MI.getIterator(); |
2005 | |
2006 | if (FenceCallAndRet) |
2007 | // No need to fence here as we'll fence at the return site itself. That |
2008 | // handles more cases than we can handle here. |
2009 | return; |
2010 | |
2011 | // Take our predicate state, shift it to the high 17 bits (so that we keep |
2012 | // pointers canonical) and merge it into RSP. This will allow the caller to |
2013 | // extract it when we return (speculatively). |
2014 | mergePredStateIntoSP(MBB, InsertPt, Loc, PredStateReg: PS->SSA.GetValueAtEndOfBlock(BB: &MBB)); |
2015 | } |
2016 | |
2017 | /// Trace the predicate state through a call. |
2018 | /// |
2019 | /// There are several layers of this needed to handle the full complexity of |
2020 | /// calls. |
2021 | /// |
2022 | /// First, we need to send the predicate state into the called function. We do |
2023 | /// this by merging it into the high bits of the stack pointer. |
2024 | /// |
2025 | /// For tail calls, this is all we need to do. |
2026 | /// |
2027 | /// For calls where we might return and resume the control flow, we need to |
2028 | /// extract the predicate state from the high bits of the stack pointer after |
2029 | /// control returns from the called function. |
2030 | /// |
2031 | /// We also need to verify that we intended to return to this location in the |
2032 | /// code. An attacker might arrange for the processor to mispredict the return |
2033 | /// to this valid but incorrect return address in the program rather than the |
2034 | /// correct one. See the paper on this attack, called "ret2spec" by the |
2035 | /// researchers, here: |
2036 | /// https://christian-rossow.de/publications/ret2spec-ccs2018.pdf |
2037 | /// |
2038 | /// The way we verify that we returned to the correct location is by preserving |
2039 | /// the expected return address across the call. One technique involves taking |
/// advantage of the red zone to load the return address from `-8(%rsp)`, where
/// it remains after the RET instruction popped `%rsp` past it. Alternatively,
/// we can
2042 | /// directly save the address into a register that will be preserved across the |
2043 | /// call. We compare this intended return address against the address |
2044 | /// immediately following the call (the observed return address). If these |
2045 | /// mismatch, we have detected misspeculation and can poison our predicate |
2046 | /// state. |
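///
/// For the common case (red zone available, small code model) the code around
/// a call ends up looking roughly like this (registers and label
/// illustrative):
///
///   shlq $47, %state
///   orq  %state, %rsp
///   callq target
/// .Lslh_ret_addr0:
///   movq -8(%rsp), %expected    # reload the return address from the red zone
///   movq %rsp, %state
///   sarq $63, %state            # extract the predicate state
///   cmpq $.Lslh_ret_addr0, %expected
///   cmovneq %poison, %state     # poison the state on a mispredicted return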
2047 | void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall( |
2048 | MachineInstr &MI) { |
2049 | MachineBasicBlock &MBB = *MI.getParent(); |
2050 | MachineFunction &MF = *MBB.getParent(); |
2051 | auto InsertPt = MI.getIterator(); |
2052 | const DebugLoc &Loc = MI.getDebugLoc(); |
2053 | |
2054 | if (FenceCallAndRet) { |
2055 | if (MI.isReturn()) |
2056 | // Tail call, we don't return to this function. |
2057 | // FIXME: We should also handle noreturn calls. |
2058 | return; |
2059 | |
2060 | // We don't need to fence before the call because the function should fence |
2061 | // in its entry. However, we do need to fence after the call returns. |
2062 | // Fencing before the return doesn't correctly handle cases where the return |
2063 | // itself is mispredicted. |
2064 | BuildMI(BB&: MBB, I: std::next(x: InsertPt), MIMD: Loc, MCID: TII->get(Opcode: X86::LFENCE)); |
2065 | ++NumInstsInserted; |
2066 | ++NumLFENCEsInserted; |
2067 | return; |
2068 | } |
2069 | |
2070 | // First, we transfer the predicate state into the called function by merging |
2071 | // it into the stack pointer. This will kill the current def of the state. |
2072 | Register StateReg = PS->SSA.GetValueAtEndOfBlock(BB: &MBB); |
2073 | mergePredStateIntoSP(MBB, InsertPt, Loc, PredStateReg: StateReg); |
2074 | |
2075 | // If this call is also a return, it is a tail call and we don't need anything |
2076 | // else to handle it so just return. Also, if there are no further |
2077 | // instructions and no successors, this call does not return so we can also |
2078 | // bail. |
2079 | if (MI.isReturn() || (std::next(x: InsertPt) == MBB.end() && MBB.succ_empty())) |
2080 | return; |
2081 | |
2082 | // Create a symbol to track the return address and attach it to the call |
  // machine instruction. We will lower extra symbols attached to call
  // instructions as a label immediately following the call.
2085 | MCSymbol *RetSymbol = |
2086 | MF.getContext().createTempSymbol(Name: "slh_ret_addr" , |
2087 | /*AlwaysAddSuffix*/ true); |
2088 | MI.setPostInstrSymbol(MF, Symbol: RetSymbol); |
2089 | |
2090 | const TargetRegisterClass *AddrRC = &X86::GR64RegClass; |
2091 | Register ExpectedRetAddrReg; |
2092 | |
  // If we have no red zone or if the function returns twice (possibly without
  // using the `ret` instruction, as with setjmp), we need to save the expected
2095 | // return address prior to the call. |
2096 | if (!Subtarget->getFrameLowering()->has128ByteRedZone(MF) || |
2097 | MF.exposesReturnsTwice()) { |
2098 | // If we don't have red zones, we need to compute the expected return |
2099 | // address prior to the call and store it in a register that lives across |
2100 | // the call. |
2101 | // |
2102 | // In some ways, this is doubly satisfying as a mitigation because it will |
2103 | // also successfully detect stack smashing bugs in some cases (typically, |
2104 | // when a callee-saved register is used and the callee doesn't push it onto |
2105 | // the stack). But that isn't our primary goal, so we only use it as |
2106 | // a fallback. |
2107 | // |
2108 | // FIXME: It isn't clear that this is reliable in the face of |
2109 | // rematerialization in the register allocator. We somehow need to force |
2110 | // that to not occur for this particular instruction, and instead to spill |
2111 | // or otherwise preserve the value computed *prior* to the call. |
2112 | // |
2113 | // FIXME: It is even less clear why MachineCSE can't just fold this when we |
2114 | // end up having to use identical instructions both before and after the |
2115 | // call to feed the comparison. |
2116 | ExpectedRetAddrReg = MRI->createVirtualRegister(RegClass: AddrRC); |
2117 | if (MF.getTarget().getCodeModel() == CodeModel::Small && |
2118 | !Subtarget->isPositionIndependent()) { |
2119 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::MOV64ri32), DestReg: ExpectedRetAddrReg) |
2120 | .addSym(Sym: RetSymbol); |
2121 | } else { |
2122 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::LEA64r), DestReg: ExpectedRetAddrReg) |
2123 | .addReg(/*Base*/ RegNo: X86::RIP) |
2124 | .addImm(/*Scale*/ Val: 1) |
2125 | .addReg(/*Index*/ RegNo: 0) |
2126 | .addSym(Sym: RetSymbol) |
2127 | .addReg(/*Segment*/ RegNo: 0); |
2128 | } |
2129 | } |
2130 | |
2131 | // Step past the call to handle when it returns. |
2132 | ++InsertPt; |
2133 | |
2134 | // If we didn't pre-compute the expected return address into a register, then |
2135 | // red zones are enabled and the return address is still available on the |
  // stack immediately after the call. As the very first instruction after the
  // call, we load it into a register.
2138 | if (!ExpectedRetAddrReg) { |
2139 | ExpectedRetAddrReg = MRI->createVirtualRegister(RegClass: AddrRC); |
2140 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::MOV64rm), DestReg: ExpectedRetAddrReg) |
2141 | .addReg(/*Base*/ RegNo: X86::RSP) |
2142 | .addImm(/*Scale*/ Val: 1) |
2143 | .addReg(/*Index*/ RegNo: 0) |
        .addImm(/*Displacement*/ Val: -8) // The RET popped the return address,
                                     // so it now sits 8 bytes below %rsp.
2146 | .addReg(/*Segment*/ RegNo: 0); |
2147 | } |
2148 | |
2149 | // Now we extract the callee's predicate state from the stack pointer. |
2150 | Register NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc); |
2151 | |
2152 | // Test the expected return address against our actual address. If we can |
2153 | // form this basic block's address as an immediate, this is easy. Otherwise |
2154 | // we compute it. |
2155 | if (MF.getTarget().getCodeModel() == CodeModel::Small && |
2156 | !Subtarget->isPositionIndependent()) { |
2157 | // FIXME: Could we fold this with the load? It would require careful EFLAGS |
2158 | // management. |
2159 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::CMP64ri32)) |
2160 | .addReg(RegNo: ExpectedRetAddrReg, flags: RegState::Kill) |
2161 | .addSym(Sym: RetSymbol); |
2162 | } else { |
2163 | Register ActualRetAddrReg = MRI->createVirtualRegister(RegClass: AddrRC); |
2164 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::LEA64r), DestReg: ActualRetAddrReg) |
2165 | .addReg(/*Base*/ RegNo: X86::RIP) |
2166 | .addImm(/*Scale*/ Val: 1) |
2167 | .addReg(/*Index*/ RegNo: 0) |
2168 | .addSym(Sym: RetSymbol) |
2169 | .addReg(/*Segment*/ RegNo: 0); |
2170 | BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::CMP64rr)) |
2171 | .addReg(RegNo: ExpectedRetAddrReg, flags: RegState::Kill) |
2172 | .addReg(RegNo: ActualRetAddrReg, flags: RegState::Kill); |
2173 | } |
2174 | |
2175 | // Now conditionally update the predicate state we just extracted if we ended |
2176 | // up at a different return address than expected. |
2177 | int PredStateSizeInBytes = TRI->getRegSizeInBits(RC: *PS->RC) / 8; |
2178 | auto CMovOp = X86::getCMovOpcode(RegBytes: PredStateSizeInBytes); |
2179 | |
2180 | Register UpdatedStateReg = MRI->createVirtualRegister(RegClass: PS->RC); |
2181 | auto CMovI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: CMovOp), DestReg: UpdatedStateReg) |
2182 | .addReg(RegNo: NewStateReg, flags: RegState::Kill) |
2183 | .addReg(RegNo: PS->PoisonReg) |
2184 | .addImm(Val: X86::COND_NE); |
2185 | CMovI->findRegisterUseOperand(Reg: X86::EFLAGS, /*TRI=*/nullptr)->setIsKill(true); |
2186 | ++NumInstsInserted; |
2187 | LLVM_DEBUG(dbgs() << " Inserting cmov: " ; CMovI->dump(); dbgs() << "\n" ); |
2188 | |
2189 | PS->SSA.AddAvailableValue(BB: &MBB, V: UpdatedStateReg); |
2190 | } |
2191 | |
2192 | /// An attacker may speculatively store over a value that is then speculatively |
2193 | /// loaded and used as the target of an indirect call or jump instruction. This |
2194 | /// is called Spectre v1.2 or Bounds Check Bypass Store (BCBS) and is described |
2195 | /// in this paper: |
2196 | /// https://people.csail.mit.edu/vlk/spectre11.pdf |
2197 | /// |
2198 | /// When this happens, the speculative execution of the call or jump will end up |
2199 | /// being steered to this attacker controlled address. While most such loads |
2200 | /// will be adequately hardened already, we want to ensure that they are |
2201 | /// definitively treated as needing post-load hardening. While address hardening |
2202 | /// is sufficient to prevent secret data from leaking to the attacker, it may |
2203 | /// not be sufficient to prevent an attacker from steering speculative |
/// execution. We forcibly unfolded all relevant loads above and so will always
/// have an opportunity to post-load harden here; we just need to scan for
/// cases not already flagged and add them.
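///
/// Schematically, an indirect jump through a register becomes, roughly
/// (register names illustrative):
///
///   orq  %state, %target   # all-ones target under misspeculation
///   jmpq *%target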
2207 | void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr( |
2208 | MachineInstr &MI, |
2209 | SmallDenseMap<Register, Register, 32> &AddrRegToHardenedReg) { |
2210 | switch (MI.getOpcode()) { |
2211 | case X86::FARCALL16m: |
2212 | case X86::FARCALL32m: |
2213 | case X86::FARCALL64m: |
2214 | case X86::FARJMP16m: |
2215 | case X86::FARJMP32m: |
2216 | case X86::FARJMP64m: |
2217 | // We don't need to harden either far calls or far jumps as they are |
2218 | // safe from Spectre. |
2219 | return; |
2220 | |
2221 | default: |
2222 | break; |
2223 | } |
2224 | |
2225 | // We should never see a loading instruction at this point, as those should |
2226 | // have been unfolded. |
2227 | assert(!MI.mayLoad() && "Found a lingering loading instruction!" ); |
2228 | |
2229 | // If the first operand isn't a register, this is a branch or call |
  // instruction with an immediate operand, which doesn't need to be hardened.
2231 | if (!MI.getOperand(i: 0).isReg()) |
2232 | return; |
2233 | |
2234 | // For all of these, the target register is the first operand of the |
2235 | // instruction. |
2236 | auto &TargetOp = MI.getOperand(i: 0); |
2237 | Register OldTargetReg = TargetOp.getReg(); |
2238 | |
2239 | // Try to lookup a hardened version of this register. We retain a reference |
2240 | // here as we want to update the map to track any newly computed hardened |
2241 | // register. |
2242 | Register &HardenedTargetReg = AddrRegToHardenedReg[OldTargetReg]; |
2243 | |
2244 | // If we don't have a hardened register yet, compute one. Otherwise, just use |
2245 | // the already hardened register. |
2246 | // |
2247 | // FIXME: It is a little suspect that we use partially hardened registers that |
2248 | // only feed addresses. The complexity of partial hardening with SHRX |
2249 | // continues to pile up. Should definitively measure its value and consider |
2250 | // eliminating it. |
2251 | if (!HardenedTargetReg) |
2252 | HardenedTargetReg = hardenValueInRegister( |
2253 | Reg: OldTargetReg, MBB&: *MI.getParent(), InsertPt: MI.getIterator(), Loc: MI.getDebugLoc()); |
2254 | |
2255 | // Set the target operand to the hardened register. |
2256 | TargetOp.setReg(HardenedTargetReg); |
2257 | |
2258 | ++NumCallsOrJumpsHardened; |
2259 | } |
2260 | |
2261 | INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, PASS_KEY, |
2262 | "X86 speculative load hardener" , false, false) |
2263 | INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, PASS_KEY, |
2264 | "X86 speculative load hardener" , false, false) |
2265 | |
2266 | FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() { |
2267 | return new X86SpeculativeLoadHardeningPass(); |
2268 | } |
2269 | |