X86SpeculativeLoadHardening.cpp source code [llvm_projects/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp]

1	//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	/// \file
9	///
10	/// Provide a pass which mitigates speculative execution attacks which operate
11	/// by speculating incorrectly past some predicate (a type check, bounds check,
12	/// or other condition) to reach a load with invalid inputs and leak the data
13	/// accessed by that load using a side channel out of the speculative domain.
14	///
15	/// For details on the attacks, see the first variant in both the Project Zero
16	/// writeup and the Spectre paper:
17	/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
18	/// https://spectreattack.com/spectre.pdf
19	///
20	//===----------------------------------------------------------------------===//
21
22	#include "X86.h"
23	#include "X86InstrBuilder.h"
24	#include "X86InstrInfo.h"
25	#include "X86Subtarget.h"
26	#include "llvm/ADT/ArrayRef.h"
27	#include "llvm/ADT/DenseMap.h"
28	#include "llvm/ADT/STLExtras.h"
29	#include "llvm/ADT/SmallPtrSet.h"
30	#include "llvm/ADT/SmallSet.h"
31	#include "llvm/ADT/SmallVector.h"
32	#include "llvm/ADT/SparseBitVector.h"
33	#include "llvm/ADT/Statistic.h"
34	#include "llvm/CodeGen/MachineBasicBlock.h"
35	#include "llvm/CodeGen/MachineConstantPool.h"
36	#include "llvm/CodeGen/MachineFunction.h"
37	#include "llvm/CodeGen/MachineFunctionPass.h"
38	#include "llvm/CodeGen/MachineInstr.h"
39	#include "llvm/CodeGen/MachineInstrBuilder.h"
40	#include "llvm/CodeGen/MachineModuleInfo.h"
41	#include "llvm/CodeGen/MachineOperand.h"
42	#include "llvm/CodeGen/MachineRegisterInfo.h"
43	#include "llvm/CodeGen/MachineSSAUpdater.h"
44	#include "llvm/CodeGen/TargetInstrInfo.h"
45	#include "llvm/CodeGen/TargetRegisterInfo.h"
46	#include "llvm/CodeGen/TargetSchedule.h"
47	#include "llvm/CodeGen/TargetSubtargetInfo.h"
48	#include "llvm/IR/DebugLoc.h"
49	#include "llvm/MC/MCSchedule.h"
50	#include "llvm/Pass.h"
51	#include "llvm/Support/CommandLine.h"
52	#include "llvm/Support/Debug.h"
53	#include "llvm/Support/raw_ostream.h"
54	#include "llvm/Target/TargetMachine.h"
55	#include <algorithm>
56	#include <cassert>
57	#include <iterator>
58	#include <optional>
59	#include <utility>
60
61	using namespace llvm;
62
63	#define PASS_KEY "x86-slh"
64	#define DEBUG_TYPE PASS_KEY
65
66	STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
67	STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
68	STATISTIC(NumAddrRegsHardened,
69	"Number of address mode used registers hardaned");
70	STATISTIC(NumPostLoadRegsHardened,
71	"Number of post-load register values hardened");
72	STATISTIC(NumCallsOrJumpsHardened,
73	"Number of calls or jumps requiring extra hardening");
74	STATISTIC(NumInstsInserted, "Number of instructions inserted");
75	STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
76
77	static cl::opt<bool> EnableSpeculativeLoadHardening(
78	"x86-speculative-load-hardening",
79	cl::desc ("Force enable speculative load hardening"), cl::init(Val: false),
80	cl::Hidden);
81
82	static cl::opt<bool> HardenEdgesWithLFENCE(
83	PASS_KEY "-lfence",
84	cl::desc (
85	"Use LFENCE along each conditional edge to harden against speculative "
86	"loads rather than conditional movs and poisoned pointers."),
87	cl::init(Val: false), cl::Hidden);
88
89	static cl::opt<bool> EnablePostLoadHardening(
90	PASS_KEY "-post-load",
91	cl::desc ("Harden the value loaded after it is loaded by "
92	"flushing the loaded bits to 1. This is hard to do "
93	"in general but can be done easily for GPRs."),
94	cl::init(Val: true), cl::Hidden);
95
96	static cl::opt<bool> FenceCallAndRet(
97	PASS_KEY "-fence-call-and-ret",
98	cl::desc ("Use a full speculation fence to harden both call and ret edges "
99	"rather than a lighter weight mitigation."),
100	cl::init(Val: false), cl::Hidden);
101
102	static cl::opt<bool> HardenInterprocedurally(
103	PASS_KEY "-ip",
104	cl::desc ("Harden interprocedurally by passing our state in and out of "
105	"functions in the high bits of the stack pointer."),
106	cl::init(Val: true), cl::Hidden);
107
108	static cl::opt<bool>
109	HardenLoads(PASS_KEY "-loads",
110	cl::desc ("Sanitize loads from memory. When disable, no "
111	"significant security is provided."),
112	cl::init(Val: true), cl::Hidden);
113
114	static cl::opt<bool> HardenIndirectCallsAndJumps(
115	PASS_KEY "-indirect",
116	cl::desc ("Harden indirect calls and jumps against using speculatively "
117	"stored attacker controlled addresses. This is designed to "
118	"mitigate Spectre v1.2 style attacks."),
119	cl::init(Val: true), cl::Hidden);
120
121	namespace {
122
123	class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
124	public:
125	X86SpeculativeLoadHardeningPass() : MachineFunctionPass (ID) { }
126
127	StringRef getPassName() const override {
128	return "X86 speculative load hardening";
129	}
130	bool runOnMachineFunction(MachineFunction &MF) override;
131	void getAnalysisUsage(AnalysisUsage &AU) const override;
132
133	/// Pass identification, replacement for typeid.
134	static char ID;
135
136	private:
137	/// The information about a block's conditional terminators needed to trace
138	/// our predicate state through the exiting edges.
139	struct BlockCondInfo {
140	MachineBasicBlock *MBB;
141
142	// We mostly have one conditional branch, and in extremely rare cases have
143	// two. Three and more are so rare as to be unimportant for compile time.
144	SmallVector<MachineInstr *, `2`> CondBrs;
145
146	MachineInstr *UncondBr;
147	};
148
149	/// Manages the predicate state traced through the program.
150	struct PredState {
151	unsigned InitialReg = `0`;
152	unsigned PoisonReg = `0`;
153
154	const TargetRegisterClass *RC;
155	MachineSSAUpdater SSA;
156
157	PredState(MachineFunction &MF, const TargetRegisterClass *RC)
158	: RC(RC), SSA (MF) {}
159	};
160
161	const X86Subtarget Subtarget = nullptr*;
162	MachineRegisterInfo MRI = nullptr*;
163	const X86InstrInfo TII = nullptr*;
164	const TargetRegisterInfo TRI = nullptr*;
165
166	std::optional<PredState> PS;
167
168	void hardenEdgesWithLFENCE(MachineFunction &MF);
169
170	SmallVector<BlockCondInfo, `16`> collectBlockCondInfo(MachineFunction &MF);
171
172	SmallVector<MachineInstr *, `16`>
173	tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
174
175	void unfoldCallAndJumpLoads(MachineFunction &MF);
176
177	SmallVector<MachineInstr *, `16`>
178	tracePredStateThroughIndirectBranches(MachineFunction &MF);
179
180	void tracePredStateThroughBlocksAndHarden(MachineFunction &MF);
181
182	unsigned saveEFLAGS(MachineBasicBlock &MBB,
183	MachineBasicBlock::iterator InsertPt,
184	const DebugLoc &Loc);
185	void restoreEFLAGS(MachineBasicBlock &MBB,
186	MachineBasicBlock::iterator InsertPt, const DebugLoc &Loc,
187	Register Reg);
188
189	void mergePredStateIntoSP(MachineBasicBlock &MBB,
190	MachineBasicBlock::iterator InsertPt,
191	const DebugLoc &Loc, unsigned PredStateReg);
192	unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
193	MachineBasicBlock::iterator InsertPt,
194	const DebugLoc &Loc);
195
196	void
197	hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
198	MachineOperand &IndexMO,
199	SmallDenseMap<unsigned, unsigned, `32`> &AddrRegToHardenedReg);
200	MachineInstr *
201	sinkPostLoadHardenedInst(MachineInstr &MI,
202	SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
203	bool canHardenRegister(Register Reg);
204	unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB,
205	MachineBasicBlock::iterator InsertPt,
206	const DebugLoc &Loc);
207	unsigned hardenPostLoad(MachineInstr &MI);
208	void hardenReturnInstr(MachineInstr &MI);
209	void tracePredStateThroughCall(MachineInstr &MI);
210	void hardenIndirectCallOrJumpInstr(
211	MachineInstr &MI,
212	SmallDenseMap<unsigned, unsigned, `32`> &AddrRegToHardenedReg);
213	};
214
215	} // end anonymous namespace
216
217	char X86SpeculativeLoadHardeningPass::ID = `0`;
218
219	void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
220	AnalysisUsage &AU) const {
221	MachineFunctionPass::getAnalysisUsage(AU);
222	}
223
224	static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
225	MachineBasicBlock &Succ, int SuccCount,
226	MachineInstr Br, MachineInstr &UncondBr,
227	const X86InstrInfo &TII) {
228	assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
229
230	MachineFunction &MF = *MBB.getParent();
231
232	MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
233
234	// We have to insert the new block immediately after the current one as we
235	// don't know what layout-successor relationships the successor has and we
236	// may not be able to (and generally don't want to) try to fix those up.
237	MF.insert(MBBI: std::next(x: MachineFunction::iterator (&MBB)), MBB: &NewMBB);
238
239	// Update the branch instruction if necessary.
240	if (Br) {
241	assert(Br->getOperand(`0`).getMBB() == &Succ &&
242	"Didn't start with the right target!");
243	Br->getOperand(i: `0`).setMBB(&NewMBB);
244
245	// If this successor was reached through a branch rather than fallthrough,
246	// we might have broken* fallthrough and so need to inject a new*
247	// unconditional branch.
248	if (!UncondBr) {
249	MachineBasicBlock &OldLayoutSucc =
250	*std::next(x: MachineFunction::iterator (&NewMBB));
251	assert(MBB.isSuccessor(&OldLayoutSucc) &&
252	"Without an unconditional branch, the old layout successor should "
253	"be an actual successor!");
254	auto BrBuilder =
255	BuildMI(BB: &MBB, MIMD: DebugLoc (), MCID: TII.get(Opcode: X86::JMP_1)).addMBB(MBB: &OldLayoutSucc);
256	// Update the unconditional branch now that we've added one.
257	UncondBr = &*BrBuilder;
258	}
259
260	// Insert unconditional "jump Succ" instruction in the new block if
261	// necessary.
262	if (!NewMBB.isLayoutSuccessor(MBB: &Succ)) {
263	SmallVector<MachineOperand, `4`> Cond;
264	TII.insertBranch(MBB&: NewMBB, TBB: &Succ, FBB: nullptr, Cond, DL: Br->getDebugLoc());
265	}
266	} else {
267	assert(!UncondBr &&
268	"Cannot have a branchless successor and an unconditional branch!");
269	assert(NewMBB.isLayoutSuccessor(&Succ) &&
270	"A non-branch successor must have been a layout successor before "
271	"and now is a layout successor of the new block.");
272	}
273
274	// If this is the only edge to the successor, we can just replace it in the
275	// CFG. Otherwise we need to add a new entry in the CFG for the new
276	// successor.
277	if (SuccCount == `1`) {
278	MBB.replaceSuccessor(Old: &Succ, New: &NewMBB);
279	} else {
280	MBB.splitSuccessor(Old: &Succ, New: &NewMBB);
281	}
282
283	// Hook up the edge from the new basic block to the old successor in the CFG.
284	NewMBB.addSuccessor(Succ: &Succ);
285
286	// Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
287	for (MachineInstr &MI : Succ) {
288	if (!MI.isPHI())
289	break;
290	for (int OpIdx = `1`, NumOps = MI.getNumOperands(); OpIdx < NumOps;
291	OpIdx += `2`) {
292	MachineOperand &OpV = MI.getOperand(i: OpIdx);
293	MachineOperand &OpMBB = MI.getOperand(i: OpIdx + `1`);
294	assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
295	if (OpMBB.getMBB() != &MBB)
296	continue;
297
298	// If this is the last edge to the succesor, just replace MBB in the PHI
299	if (SuccCount == `1`) {
300	OpMBB.setMBB(&NewMBB);
301	break;
302	}
303
304	// Otherwise, append a new pair of operands for the new incoming edge.
305	MI.addOperand(MF, Op: OpV);
306	MI.addOperand(MF, Op: MachineOperand::CreateMBB(MBB: &NewMBB));
307	break;
308	}
309	}
310
311	// Inherit live-ins from the successor
312	for (auto &LI : Succ.liveins())
313	NewMBB.addLiveIn(RegMaskPair: LI);
314
315	LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '"
316	<< Succ.getName() << "'.\n");
317	return NewMBB;
318	}
319
320	/// Removing duplicate PHI operands to leave the PHI in a canonical and
321	/// predictable form.
322	///
323	/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
324	/// isn't what you might expect. We may have multiple entries in PHI nodes for
325	/// a single predecessor. This makes CFG-updating extremely complex, so here we
326	/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
327	/// one entry per predecessor, regardless of how many edges there are.
328	static void canonicalizePHIOperands(MachineFunction &MF) {
329	SmallPtrSet<MachineBasicBlock *, `4`> Preds;
330	SmallVector<int, `4`> DupIndices;
331	for (auto &MBB : MF)
332	for (auto &MI : MBB) {
333	if (!MI.isPHI())
334	break;
335
336	// First we scan the operands of the PHI looking for duplicate entries
337	// a particular predecessor. We retain the operand index of each duplicate
338	// entry found.
339	for (int OpIdx = `1`, NumOps = MI.getNumOperands(); OpIdx < NumOps;
340	OpIdx += `2`)
341	if (!Preds.insert(Ptr: MI.getOperand(i: OpIdx + `1`).getMBB()).second)
342	DupIndices.push_back(Elt: OpIdx);
343
344	// Now walk the duplicate indices, removing both the block and value. Note
345	// that these are stored as a vector making this element-wise removal
346	// :w
347	// potentially quadratic.
348	//
349	// FIXME: It is really frustrating that we have to use a quadratic
350	// removal algorithm here. There should be a better way, but the use-def
351	// updates required make that impossible using the public API.
352	//
353	// Note that we have to process these backwards so that we don't
354	// invalidate other indices with each removal.
355	while (!DupIndices.empty()) {
356	int OpIdx = DupIndices.pop_back_val();
357	// Remove both the block and value operand, again in reverse order to
358	// preserve indices.
359	MI.removeOperand(OpNo: OpIdx + `1`);
360	MI.removeOperand(OpNo: OpIdx);
361	}
362
363	Preds.clear();
364	}
365	}
366
367	/// Helper to scan a function for loads vulnerable to misspeculation that we
368	/// want to harden.
369	///
370	/// We use this to avoid making changes to functions where there is nothing we
371	/// need to do to harden against misspeculation.
372	static bool hasVulnerableLoad(MachineFunction &MF) {
373	for (MachineBasicBlock &MBB : MF) {
374	for (MachineInstr &MI : MBB) {
375	// Loads within this basic block after an LFENCE are not at risk of
376	// speculatively executing with invalid predicates from prior control
377	// flow. So break out of this block but continue scanning the function.
378	if (MI.getOpcode() == X86::LFENCE)
379	break;
380
381	// Looking for loads only.
382	if (!MI.mayLoad())
383	continue;
384
385	// An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
386	if (MI.getOpcode() == X86::MFENCE)
387	continue;
388
389	// We found a load.
390	return true;
391	}
392	}
393
394	// No loads found.
395	return false;
396	}
397
398	bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
399	MachineFunction &MF) {
400	LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
401	<< " **********\n");
402
403	// Only run if this pass is forced enabled or we detect the relevant function
404	// attribute requesting SLH.
405	if (!EnableSpeculativeLoadHardening &&
406	!MF.getFunction().hasFnAttribute(Kind: Attribute::SpeculativeLoadHardening))
407	return false;
408
409	Subtarget = &MF.getSubtarget<X86Subtarget>();
410	MRI = &MF.getRegInfo();
411	TII = Subtarget->getInstrInfo();
412	TRI = Subtarget->getRegisterInfo();
413
414	// FIXME: Support for 32-bit.
415	PS.emplace(args&: MF, args: &X86::GR64_NOSPRegClass);
416
417	if (MF.begin() == MF.end())
418	// Nothing to do for a degenerate empty function...
419	return false;
420
421	// We support an alternative hardening technique based on a debug flag.
422	if (HardenEdgesWithLFENCE) {
423	hardenEdgesWithLFENCE(MF);
424	return true;
425	}
426
427	// Create a dummy debug loc to use for all the generated code here.
428	DebugLoc Loc;
429
430	MachineBasicBlock &Entry = *MF.begin();
431	auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(I: Entry.begin());
432
433	// Do a quick scan to see if we have any checkable loads.
434	bool HasVulnerableLoad = hasVulnerableLoad(MF);
435
436	// See if we have any conditional branching blocks that we will need to trace
437	// predicate state through.
438	SmallVector<BlockCondInfo, `16`> Infos = collectBlockCondInfo(MF);
439
440	// If we have no interesting conditions or loads, nothing to do here.
441	if (!HasVulnerableLoad && Infos.empty())
442	return true;
443
444	// The poison value is required to be an all-ones value for many aspects of
445	// this mitigation.
446	const int PoisonVal = -`1`;
447	PS ->PoisonReg = MRI->createVirtualRegister(RegClass: PS ->RC);
448	BuildMI(BB&: Entry, I: EntryInsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::MOV64ri32), DestReg: PS ->PoisonReg)
449	.addImm(Val: PoisonVal);
450	++NumInstsInserted;
451
452	// If we have loads being hardened and we've asked for call and ret edges to
453	// get a full fence-based mitigation, inject that fence.
454	if (HasVulnerableLoad && FenceCallAndRet) {
455	// We need to insert an LFENCE at the start of the function to suspend any
456	// incoming misspeculation from the caller. This helps two-fold: the caller
457	// may not have been protected as this code has been, and this code gets to
458	// not take any specific action to protect across calls.
459	// FIXME: We could skip this for functions which unconditionally return
460	// a constant.
461	BuildMI(BB&: Entry, I: EntryInsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::LFENCE));
462	++NumInstsInserted;
463	++NumLFENCEsInserted;
464	}
465
466	// If we guarded the entry with an LFENCE and have no conditionals to protect
467	// in blocks, then we're done.
468	if (FenceCallAndRet && Infos.empty())
469	// We may have changed the function's code at this point to insert fences.
470	return true;
471
472	// For every basic block in the function which can b
473	if (HardenInterprocedurally && !FenceCallAndRet) {
474	// Set up the predicate state by extracting it from the incoming stack
475	// pointer so we pick up any misspeculation in our caller.
476	PS ->InitialReg = extractPredStateFromSP(MBB&: Entry, InsertPt: EntryInsertPt, Loc);
477	} else {
478	// Otherwise, just build the predicate state itself by zeroing a register
479	// as we don't need any initial state.
480	PS ->InitialReg = MRI->createVirtualRegister(RegClass: PS ->RC);
481	Register PredStateSubReg = MRI->createVirtualRegister(RegClass: &X86::GR32RegClass);
482	auto ZeroI = BuildMI(BB&: Entry, I: EntryInsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::MOV32r0),
483	DestReg: PredStateSubReg);
484	++NumInstsInserted;
485	MachineOperand *ZeroEFLAGSDefOp =
486	ZeroI ->findRegisterDefOperand(Reg: X86::EFLAGS, /TRI=/nullptr);
487	assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
488	"Must have an implicit def of EFLAGS!");
489	ZeroEFLAGSDefOp->setIsDead(true);
490	BuildMI(BB&: Entry, I: EntryInsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::SUBREG_TO_REG),
491	DestReg: PS ->InitialReg)
492	.addImm(Val: `0`)
493	.addReg(RegNo: PredStateSubReg)
494	.addImm(Val: X86::sub_32bit);
495	}
496
497	// We're going to need to trace predicate state throughout the function's
498	// CFG. Prepare for this by setting up our initial state of PHIs with unique
499	// predecessor entries and all the initial predicate state.
500	canonicalizePHIOperands(MF);
501
502	// Track the updated values in an SSA updater to rewrite into SSA form at the
503	// end.
504	PS ->SSA.Initialize(V: PS ->InitialReg);
505	PS ->SSA.AddAvailableValue(BB: &Entry, V: PS ->InitialReg);
506
507	// Trace through the CFG.
508	auto CMovs = tracePredStateThroughCFG(MF, Infos);
509
510	// We may also enter basic blocks in this function via exception handling
511	// control flow. Here, if we are hardening interprocedurally, we need to
512	// re-capture the predicate state from the throwing code. In the Itanium ABI,
513	// the throw will always look like a call to __cxa_throw and will have the
514	// predicate state in the stack pointer, so extract fresh predicate state from
515	// the stack pointer and make it available in SSA.
516	// FIXME: Handle non-itanium ABI EH models.
517	if (HardenInterprocedurally) {
518	for (MachineBasicBlock &MBB : MF) {
519	assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
520	assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
521	assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
522	if (!MBB.isEHPad())
523	continue;
524	PS ->SSA.AddAvailableValue(
525	BB: &MBB,
526	V: extractPredStateFromSP(MBB, InsertPt: MBB.SkipPHIsAndLabels(I: MBB.begin()), Loc));
527	}
528	}
529
530	if (HardenIndirectCallsAndJumps) {
531	// If we are going to harden calls and jumps we need to unfold their memory
532	// operands.
533	unfoldCallAndJumpLoads(MF);
534
535	// Then we trace predicate state through the indirect branches.
536	auto IndirectBrCMovs = tracePredStateThroughIndirectBranches(MF);
537	CMovs.append(in_start: IndirectBrCMovs.begin(), in_end: IndirectBrCMovs.end());
538	}
539
540	// Now that we have the predicate state available at the start of each block
541	// in the CFG, trace it through each block, hardening vulnerable instructions
542	// as we go.
543	tracePredStateThroughBlocksAndHarden(MF);
544
545	// Now rewrite all the uses of the pred state using the SSA updater to insert
546	// PHIs connecting the state between blocks along the CFG edges.
547	for (MachineInstr *CMovI : CMovs)
548	for (MachineOperand &Op : CMovI->operands()) {
549	if (!Op.isReg() \|\| Op.getReg() != PS ->InitialReg)
550	continue;
551
552	PS ->SSA.RewriteUse(U&: Op);
553	}
554
555	LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
556	dbgs() << "\n"; MF.verify(this));
557	return true;
558	}
559
560	/// Implements the naive hardening approach of putting an LFENCE after every
561	/// potentially mis-predicted control flow construct.
562	///
563	/// We include this as an alternative mostly for the purpose of comparison. The
564	/// performance impact of this is expected to be extremely severe and not
565	/// practical for any real-world users.
566	void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
567	MachineFunction &MF) {
568	// First, we scan the function looking for blocks that are reached along edges
569	// that we might want to harden.
570	SmallSetVector<MachineBasicBlock *, `8`> Blocks;
571	for (MachineBasicBlock &MBB : MF) {
572	// If there are no or only one successor, nothing to do here.
573	if (MBB.succ_size() <= `1`)
574	continue;
575
576	// Skip blocks unless their terminators start with a branch. Other
577	// terminators don't seem interesting for guarding against misspeculation.
578	auto TermIt = MBB.getFirstTerminator();
579	if (TermIt == MBB.end() \|\| !TermIt ->isBranch())
580	continue;
581
582	// Add all the non-EH-pad succossors to the blocks we want to harden. We
583	// skip EH pads because there isn't really a condition of interest on
584	// entering.
585	for (MachineBasicBlock *SuccMBB : MBB.successors())
586	if (!SuccMBB->isEHPad())
587	Blocks.insert(X: SuccMBB);
588	}
589
590	for (MachineBasicBlock *MBB : Blocks) {
591	auto InsertPt = MBB->SkipPHIsAndLabels(I: MBB->begin());
592	BuildMI(BB&: *MBB, I: InsertPt, MIMD: DebugLoc (), MCID: TII->get(Opcode: X86::LFENCE));
593	++NumInstsInserted;
594	++NumLFENCEsInserted;
595	}
596	}
597
598	SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, `16`>
599	X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
600	SmallVector<BlockCondInfo, `16`> Infos;
601
602	// Walk the function and build up a summary for each block's conditions that
603	// we need to trace through.
604	for (MachineBasicBlock &MBB : MF) {
605	// If there are no or only one successor, nothing to do here.
606	if (MBB.succ_size() <= `1`)
607	continue;
608
609	// We want to reliably handle any conditional branch terminators in the
610	// MBB, so we manually analyze the branch. We can handle all of the
611	// permutations here, including ones that analyze branch cannot.
612	//
613	// The approach is to walk backwards across the terminators, resetting at
614	// any unconditional non-indirect branch, and track all conditional edges
615	// to basic blocks as well as the fallthrough or unconditional successor
616	// edge. For each conditional edge, we track the target and the opposite
617	// condition code in order to inject a "no-op" cmov into that successor
618	// that will harden the predicate. For the fallthrough/unconditional
619	// edge, we inject a separate cmov for each conditional branch with
620	// matching condition codes. This effectively implements an "and" of the
621	// condition flags, even if there isn't a single condition flag that would
622	// directly implement that. We don't bother trying to optimize either of
623	// these cases because if such an optimization is possible, LLVM should
624	// have optimized the conditional branches* in that way already to reduce*
625	// instruction count. This late, we simply assume the minimal number of
626	// branch instructions is being emitted and use that to guide our cmov
627	// insertion.
628
629	BlockCondInfo Info = {.MBB: &MBB, .CondBrs: {}, .UncondBr: nullptr};
630
631	// Now walk backwards through the terminators and build up successors they
632	// reach and the conditions.
633	for (MachineInstr &MI : llvm::reverse(C&: MBB)) {
634	// Once we've handled all the terminators, we're done.
635	if (!MI.isTerminator())
636	break;
637
638	// If we see a non-branch terminator, we can't handle anything so bail.
639	if (!MI.isBranch()) {
640	Info.CondBrs.clear();
641	break;
642	}
643
644	// If we see an unconditional branch, reset our state, clear any
645	// fallthrough, and set this is the "else" successor.
646	if (MI.getOpcode() == X86::JMP_1) {
647	Info.CondBrs.clear();
648	Info.UncondBr = &MI;
649	continue;
650	}
651
652	// If we get an invalid condition, we have an indirect branch or some
653	// other unanalyzable "fallthrough" case. We model this as a nullptr for
654	// the destination so we can still guard any conditional successors.
655	// Consider code sequences like:
656	// ```
657	// jCC L1
658	// jmpq %rax*
659	// ```
660	// We still want to harden the edge to `L1`.
661	if (X86::getCondFromBranch(MI) == X86::COND_INVALID) {
662	Info.CondBrs.clear();
663	Info.UncondBr = &MI;
664	continue;
665	}
666
667	// We have a vanilla conditional branch, add it to our list.
668	Info.CondBrs.push_back(Elt: &MI);
669	}
670	if (Info.CondBrs.empty()) {
671	++NumBranchesUntraced;
672	LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
673	MBB.dump());
674	continue;
675	}
676
677	Infos.push_back(Elt: Info);
678	}
679
680	return Infos;
681	}
682
683	/// Trace the predicate state through the CFG, instrumenting each conditional
684	/// branch such that misspeculation through an edge will poison the predicate
685	/// state.
686	///
687	/// Returns the list of inserted CMov instructions so that they can have their
688	/// uses of the predicate state rewritten into proper SSA form once it is
689	/// complete.
690	SmallVector<MachineInstr *, `16`>
691	X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
692	MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
693	// Collect the inserted cmov instructions so we can rewrite their uses of the
694	// predicate state into SSA form.
695	SmallVector<MachineInstr *, `16`> CMovs;
696
697	// Now walk all of the basic blocks looking for ones that end in conditional
698	// jumps where we need to update this register along each edge.
699	for (const BlockCondInfo &Info : Infos) {
700	MachineBasicBlock &MBB = *Info.MBB;
701	const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
702	MachineInstr *UncondBr = Info.UncondBr;
703
704	LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
705	<< "\n");
706	++NumCondBranchesTraced;
707
708	// Compute the non-conditional successor as either the target of any
709	// unconditional branch or the layout successor.
710	MachineBasicBlock *UncondSucc =
711	UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
712	? UncondBr->getOperand(i: `0`).getMBB()
713	: nullptr)
714	: &*std::next(x: MachineFunction::iterator (&MBB));
715
716	// Count how many edges there are to any given successor.
717	SmallDenseMap<MachineBasicBlock , int*> SuccCounts;
718	if (UncondSucc)
719	++SuccCounts [UncondSucc];
720	for (auto *CondBr : CondBrs)
721	++SuccCounts [CondBr->getOperand(i: `0`).getMBB()];
722
723	// A lambda to insert cmov instructions into a block checking all of the
724	// condition codes in a sequence.
725	auto BuildCheckingBlockForSuccAndConds =
726	[&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
727	MachineInstr Br, MachineInstr &UncondBr,
728	ArrayRef<X86::CondCode> Conds) {
729	// First, we split the edge to insert the checking block into a safe
730	// location.
731	auto &CheckingMBB =
732	(SuccCount == `1` && Succ.pred_size() == `1`)
733	? Succ
734	: splitEdge(MBB, Succ, SuccCount, Br, UncondBr, TII: *TII);
735
736	bool LiveEFLAGS = Succ.isLiveIn(Reg: X86::EFLAGS);
737	if (!LiveEFLAGS)
738	CheckingMBB.addLiveIn(PhysReg: X86::EFLAGS);
739
740	// Now insert the cmovs to implement the checks.
741	auto InsertPt = CheckingMBB.begin();
742	assert((InsertPt == CheckingMBB.end() \|\| !InsertPt->isPHI()) &&
743	"Should never have a PHI in the initial checking block as it "
744	"always has a single predecessor!");
745
746	// We will wire each cmov to each other, but need to start with the
747	// incoming pred state.
748	unsigned CurStateReg = PS ->InitialReg;
749
750	for (X86::CondCode Cond : Conds) {
751	int PredStateSizeInBytes = TRI->getRegSizeInBits(RC: *PS ->RC) / `8`;
752	auto CMovOp = X86::getCMovOpcode(RegBytes: PredStateSizeInBytes);
753
754	Register UpdatedStateReg = MRI->createVirtualRegister(RegClass: PS ->RC);
755	// Note that we intentionally use an empty debug location so that
756	// this picks up the preceding location.
757	auto CMovI = BuildMI(BB&: CheckingMBB, I: InsertPt, MIMD: DebugLoc (),
758	MCID: TII->get(Opcode: CMovOp), DestReg: UpdatedStateReg)
759	.addReg(RegNo: CurStateReg)
760	.addReg(RegNo: PS ->PoisonReg)
761	.addImm(Val: Cond);
762	// If this is the last cmov and the EFLAGS weren't originally
763	// live-in, mark them as killed.
764	if (!LiveEFLAGS && Cond == Conds.back())
765	CMovI ->findRegisterUseOperand(Reg: X86::EFLAGS, /TRI=/nullptr)
766	->setIsKill(true);
767
768	++NumInstsInserted;
769	LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump();
770	dbgs() << "\n");
771
772	// The first one of the cmovs will be using the top level
773	// `PredStateReg` and need to get rewritten into SSA form.
774	if (CurStateReg == PS ->InitialReg)
775	CMovs.push_back(Elt: &*CMovI);
776
777	// The next cmov should start from this one's def.
778	CurStateReg = UpdatedStateReg;
779	}
780
781	// And put the last one into the available values for SSA form of our
782	// predicate state.
783	PS ->SSA.AddAvailableValue(BB: &CheckingMBB, V: CurStateReg);
784	};
785
786	std::vector<X86::CondCode> UncondCodeSeq;
787	for (auto *CondBr : CondBrs) {
788	MachineBasicBlock &Succ = *CondBr->getOperand(i: `0`).getMBB();
789	int &SuccCount = SuccCounts [&Succ];
790
791	X86::CondCode Cond = X86::getCondFromBranch(MI: *CondBr);
792	X86::CondCode InvCond = X86::GetOppositeBranchCondition(CC: Cond);
793	UncondCodeSeq.push_back(x: Cond);
794
795	BuildCheckingBlockForSuccAndConds (MBB, Succ, SuccCount, CondBr, UncondBr,
796	{InvCond});
797
798	// Decrement the successor count now that we've split one of the edges.
799	// We need to keep the count of edges to the successor accurate in order
800	// to know above when to replace the successor in the CFG vs. just
801	// adding the new successor.
802	--SuccCount;
803	}
804
805	// Since we may have split edges and changed the number of successors,
806	// normalize the probabilities. This avoids doing it each time we split an
807	// edge.
808	MBB.normalizeSuccProbs();
809
810	// Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
811	// need to intersect the other condition codes. We can do this by just
812	// doing a cmov for each one.
813	if (!UncondSucc)
814	// If we have no fallthrough to protect (perhaps it is an indirect jump?)
815	// just skip this and continue.
816	continue;
817
818	assert(SuccCounts[UncondSucc] == `1` &&
819	"We should never have more than one edge to the unconditional "
820	"successor at this point because every other edge must have been "
821	"split above!");
822
823	// Sort and unique the codes to minimize them.
824	llvm::sort(C&: UncondCodeSeq);
825	UncondCodeSeq.erase(first: llvm::unique(R&: UncondCodeSeq), last: UncondCodeSeq.end());
826
827	// Build a checking version of the successor.
828	BuildCheckingBlockForSuccAndConds (MBB, UncondSucc, /SuccCount/* `1`,
829	UncondBr, UncondBr, UncondCodeSeq);
830	}
831
832	return CMovs;
833	}
834
835	/// Compute the register class for the unfolded load.
836	///
837	/// FIXME: This should probably live in X86InstrInfo, potentially by adding
838	/// a way to unfold into a newly created vreg rather than requiring a register
839	/// input.
840	static const TargetRegisterClass *
841	getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII,
842	unsigned Opcode) {
843	unsigned Index;
844	unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
845	Opc: Opcode, /UnfoldLoad/ true, /UnfoldStore/ false, LoadRegIndex: &Index);
846	const MCInstrDesc &MCID = TII.get(Opcode: UnfoldedOpc);
847	return TII.getRegClass(MCID, OpNum: Index, TRI: &TII.getRegisterInfo(), MF);
848	}
849
850	void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
851	MachineFunction &MF) {
852	for (MachineBasicBlock &MBB : MF)
853	// We use make_early_inc_range here so we can remove instructions if needed
854	// without disturbing the iteration.
855	for (MachineInstr &MI : llvm::make_early_inc_range(Range: MBB.instrs())) {
856	// Must either be a call or a branch.
857	if (!MI.isCall() && !MI.isBranch())
858	continue;
859	// We only care about loading variants of these instructions.
860	if (!MI.mayLoad())
861	continue;
862
863	switch (MI.getOpcode()) {
864	default: {
865	LLVM_DEBUG(
866	dbgs() << "ERROR: Found an unexpected loading branch or call "
867	"instruction:\n";
868	MI.dump(); dbgs() << "\n");
869	report_fatal_error(reason: "Unexpected loading branch or call!");
870	}
871
872	case X86::FARCALL16m:
873	case X86::FARCALL32m:
874	case X86::FARCALL64m:
875	case X86::FARJMP16m:
876	case X86::FARJMP32m:
877	case X86::FARJMP64m:
878	// We cannot mitigate far jumps or calls, but we also don't expect them
879	// to be vulnerable to Spectre v1.2 style attacks.
880	continue;
881
882	case X86::CALL16m:
883	case X86::CALL16m_NT:
884	case X86::CALL32m:
885	case X86::CALL32m_NT:
886	case X86::CALL64m:
887	case X86::CALL64m_NT:
888	case X86::JMP16m:
889	case X86::JMP16m_NT:
890	case X86::JMP32m:
891	case X86::JMP32m_NT:
892	case X86::JMP64m:
893	case X86::JMP64m_NT:
894	case X86::TAILJMPm64:
895	case X86::TAILJMPm64_REX:
896	case X86::TAILJMPm:
897	case X86::TCRETURNmi64:
898	case X86::TCRETURNmi: {
899	// Use the generic unfold logic now that we know we're dealing with
900	// expected instructions.
901	// FIXME: We don't have test coverage for all of these!
902	auto UnfoldedRC = getRegClassForUnfoldedLoad(MF, TII: TII, Opcode: MI.getOpcode());
903	if (!UnfoldedRC) {
904	LLVM_DEBUG(dbgs()
905	<< "ERROR: Unable to unfold load from instruction:\n";
906	MI.dump(); dbgs() << "\n");
907	report_fatal_error(reason: "Unable to unfold load!");
908	}
909	Register Reg = MRI->createVirtualRegister(RegClass: UnfoldedRC);
910	SmallVector<MachineInstr *, `2`> NewMIs;
911	// If we were able to compute an unfolded reg class, any failure here
912	// is just a programming error so just assert.
913	bool Unfolded =
914	TII->unfoldMemoryOperand(MF, MI, Reg, /UnfoldLoad/ true,
915	/UnfoldStore/ false, NewMIs);
916	(void)Unfolded;
917	assert(Unfolded &&
918	"Computed unfolded register class but failed to unfold");
919	// Now stitch the new instructions into place and erase the old one.
920	for (auto *NewMI : NewMIs)
921	MBB.insert(I: MI.getIterator(), M: NewMI);
922
923	// Update the call site info.
924	if (MI.isCandidateForCallSiteEntry())
925	MF.eraseCallSiteInfo(MI: &MI);
926
927	MI.eraseFromParent();
928	LLVM_DEBUG({
929	dbgs() << "Unfolded load successfully into:\n";
930	for (auto *NewMI : NewMIs) {
931	NewMI->dump();
932	dbgs() << "\n";
933	}
934	});
935	continue;
936	}
937	}
938	llvm_unreachable("Escaped switch with default!");
939	}
940	}
941
942	/// Trace the predicate state through indirect branches, instrumenting them to
943	/// poison the state if a target is reached that does not match the expected
944	/// target.
945	///
946	/// This is designed to mitigate Spectre variant 1 attacks where an indirect
947	/// branch is trained to predict a particular target and then mispredicts that
948	/// target in a way that can leak data. Despite using an indirect branch, this
949	/// is really a variant 1 style attack: it does not steer execution to an
950	/// arbitrary or attacker controlled address, and it does not require any
951	/// special code executing next to the victim. This attack can also be mitigated
952	/// through retpolines, but those require either replacing indirect branches
953	/// with conditional direct branches or lowering them through a device that
954	/// blocks speculation. This mitigation can replace these retpoline-style
955	/// mitigations for jump tables and other indirect branches within a function
956	/// when variant 2 isn't a risk while allowing limited speculation. Indirect
957	/// calls, however, cannot be mitigated through this technique without changing
958	/// the ABI in a fundamental way.
///
/// The function works in two sweeps over \p MF. The first sweep finds blocks
/// ending in a 64-bit register indirect jump and records the jump's target
/// register and the block's successors. The second sweep visits each recorded
/// successor, compares the incoming target address with the block's own
/// address, and inserts a cmov that selects the poison register on mismatch.
///
/// \returns the cmov instructions inserted into the indirect branch targets.
/// The vector is empty when no indirect branch with successors was found.
959	SmallVector<MachineInstr *, `16`>
960	X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
961	MachineFunction &MF) {
962	// We use the SSAUpdater to insert PHI nodes for the target addresses of
963	// indirect branches. We don't actually need the full power of the SSA updater
964	// in this particular case as we always have immediately available values, but
965	// this avoids us having to re-implement the PHI construction logic.
966	MachineSSAUpdater TargetAddrSSA(MF);
967	TargetAddrSSA.Initialize(V: MRI->createVirtualRegister(RegClass: &X86::GR64RegClass));
968
969	// Track which blocks were terminated with an indirect branch.
970	SmallPtrSet<MachineBasicBlock *, `4`> IndirectTerminatedMBBs;
971
972	// We need to know what blocks end up reached via indirect branches. We
973	// expect this to be a subset of those whose address is taken and so track it
974	// directly via the CFG.
975	SmallPtrSet<MachineBasicBlock *, `4`> IndirectTargetMBBs;
976
977	// Walk all the blocks which end in an indirect branch and make the
978	// target address available.
979	for (MachineBasicBlock &MBB : MF) {
980	// Find the last terminator.
// This walks backwards from the end of the block, skipping debug
// instructions; an empty (or debug-only) block is skipped entirely.
981	auto MII = MBB.instr_rbegin();
982	while (MII != MBB.instr_rend() && MII ->isDebugInstr())
983	++MII;
984	if (MII == MBB.instr_rend())
985	continue;
986	MachineInstr &TI = *MII;
987	if (!TI.isTerminator() \|\| !TI.isBranch())
988	// No terminator or non-branch terminator.
989	continue;
990
// Only the JMP64r case of the switch below assigns this; every other case
// either skips the block (`continue`) or aborts via report_fatal_error.
991	unsigned TargetReg;
992
993	switch (TI.getOpcode()) {
994	default:
995	// Direct branch or conditional branch (leading to fallthrough).
996	continue;
997
998	case X86::FARJMP16m:
999	case X86::FARJMP32m:
1000	case X86::FARJMP64m:
1001	// We cannot mitigate far jumps or calls, but we also don't expect them
1002	// to be vulnerable to Spectre v1.2 or v2 (self trained) style attacks.
1003	continue;
1004
1005	case X86::JMP16m:
1006	case X86::JMP16m_NT:
1007	case X86::JMP32m:
1008	case X86::JMP32m_NT:
1009	case X86::JMP64m:
1010	case X86::JMP64m_NT:
1011	// Mostly as documentation: memory-operand jumps are expected to have been
// unfolded before this point, as the error message below states.
1012	report_fatal_error(reason: "Memory operand jumps should have been unfolded!");
1013
1014	case X86::JMP16r:
1015	report_fatal_error(
1016	reason: "Support for 16-bit indirect branches is not implemented.");
1017	case X86::JMP32r:
1018	report_fatal_error(
1019	reason: "Support for 32-bit indirect branches is not implemented.");
1020
1021	case X86::JMP64r:
1022	TargetReg = TI.getOperand(i: `0`).getReg();
1023	}
1024
1025	// We have definitely found an indirect branch. Verify that there are no
1026	// preceding conditional branches as we don't yet support that.
// The predicate below flags any non-debug terminator other than `TI`.
1027	if (llvm::any_of(Range: MBB.terminators(), P: [&](MachineInstr &OtherTI) {
1028	return !OtherTI.isDebugInstr() && &OtherTI != &TI;
1029	})) {
1030	LLVM_DEBUG({
1031	dbgs() << "ERROR: Found other terminators in a block with an indirect "
1032	"branch! This is not yet supported! Terminator sequence:\n";
1033	for (MachineInstr &MI : MBB.terminators()) {
1034	MI.dump();
1035	dbgs() << `'\n'`;
1036	}
1037	});
1038	report_fatal_error(reason: "Unimplemented terminator sequence!");
1039	}
1040
1041	// Make the target register an available value for this block.
1042	TargetAddrSSA.AddAvailableValue(BB: &MBB, V: TargetReg);
1043	IndirectTerminatedMBBs.insert(Ptr: &MBB);
1044
1045	// Add all the successors to our target candidates.
1046	for (MachineBasicBlock *Succ : MBB.successors())
1047	IndirectTargetMBBs.insert(Ptr: Succ);
1048	}
1049
1050	// Keep track of the cmov instructions we insert so we can return them.
1051	SmallVector<MachineInstr *, `16`> CMovs;
1052
1053	// If we didn't find any indirect branches with targets, nothing to do here.
1054	if (IndirectTargetMBBs.empty())
1055	return CMovs;
1056
1057	// We found indirect branches and targets that need to be instrumented to
1058	// harden loads within them. Walk the blocks of the function (to get a stable
1059	// ordering) and instrument each target of an indirect branch.
1060	for (MachineBasicBlock &MBB : MF) {
1061	// Skip the blocks that aren't candidate targets.
1062	if (!IndirectTargetMBBs.count(Ptr: &MBB))
1063	continue;
1064
1065	// We don't expect EH pads to ever be reached via an indirect branch. If
1066	// this is desired for some reason, we could simply skip them here rather
1067	// than asserting.
1068	assert(!MBB.isEHPad() &&
1069	"Unexpected EH pad as target of an indirect branch!");
1070
1071	// We should never end up threading EFLAGS into a block to harden
1072	// conditional jumps as there would be an additional successor via the
1073	// indirect branch. As a consequence, all such edges would be split before
1074	// reaching here, and the inserted block will handle the EFLAGS-based
1075	// hardening.
1076	assert(!MBB.isLiveIn(X86::EFLAGS) &&
1077	"Cannot check within a block that already has live-in EFLAGS!");
1078
1079	// We can't handle having non-indirect edges into this block unless this is
1080	// the only successor and we can synthesize the necessary target address.
1081	for (MachineBasicBlock *Pred : MBB.predecessors()) {
1082	// If we've already handled this by extracting the target directly,
1083	// nothing to do.
1084	if (IndirectTerminatedMBBs.count(Ptr: Pred))
1085	continue;
1086
1087	// Otherwise, we have to be the only successor. We generally expect this
1088	// to be true as conditional branches should have had a critical edge
1089	// split already. We don't however need to worry about EH pad successors
1090	// as they'll happily ignore the target and their hardening strategy is
1091	// resilient to all ways in which they could be reached speculatively.
1092	if (!llvm::all_of(Range: Pred->successors(), P: [&](MachineBasicBlock *Succ) {
1093	return Succ->isEHPad() \|\| Succ == &MBB;
1094	})) {
1095	LLVM_DEBUG({
1096	dbgs() << "ERROR: Found conditional entry to target of indirect "
1097	"branch!\n";
1098	Pred->dump();
1099	MBB.dump();
1100	});
1101	report_fatal_error(reason: "Cannot harden a conditional entry to a target of "
1102	"an indirect branch!");
1103	}
1104
1105	// Now we need to compute the address of this block and install it as a
1106	// synthetic target in the predecessor. We do this at the bottom of the
1107	// predecessor.
// "Bottom" here means just before the predecessor's first terminator.
1108	auto InsertPt = Pred->getFirstTerminator();
1109	Register TargetReg = MRI->createVirtualRegister(RegClass: &X86::GR64RegClass);
1110	if (MF.getTarget().getCodeModel() == CodeModel::Small &&
1111	!Subtarget->isPositionIndependent()) {
1112	// Directly materialize it into an immediate.
1113	auto AddrI = BuildMI(BB&: *Pred, I: InsertPt, MIMD: DebugLoc (),
1114	MCID: TII->get(Opcode: X86::MOV64ri32), DestReg: TargetReg)
1115	.addMBB(MBB: &MBB);
1116	++NumInstsInserted;
1117	(void)AddrI;
1118	LLVM_DEBUG(dbgs() << " Inserting mov: "; AddrI->dump();
1119	dbgs() << "\n");
1120	} else {
// Otherwise form the block's address with a RIP-relative LEA.
1121	auto AddrI = BuildMI(BB&: *Pred, I: InsertPt, MIMD: DebugLoc (), MCID: TII->get(Opcode: X86::LEA64r),
1122	DestReg: TargetReg)
1123	.addReg(/Base/ RegNo: X86::RIP)
1124	.addImm(/Scale/ Val: `1`)
1125	.addReg(/Index/ RegNo: `0`)
1126	.addMBB(MBB: &MBB)
1127	.addReg(/Segment/ RegNo: `0`);
1128	++NumInstsInserted;
1129	(void)AddrI;
1130	LLVM_DEBUG(dbgs() << " Inserting lea: "; AddrI->dump();
1131	dbgs() << "\n");
1132	}
1133	// And make this available.
1134	TargetAddrSSA.AddAvailableValue(BB: Pred, V: TargetReg);
1135	}
1136
1137	// Materialize the needed SSA value of the target. Note that we need the
1138	// middle of the block as this block might at the bottom have an indirect
1139	// branch back to itself. We can do this here because at this point, every
1140	// predecessor of this block has an available value. This is basically just
1141	// automating the construction of a PHI node for this target.
1142	Register TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(BB: &MBB);
1143
1144	// Insert a comparison of the incoming target register with this block's
1145	// address. This also requires us to mark the block as having its address
1146	// taken explicitly.
1147	MBB.setMachineBlockAddressTaken();
1148	auto InsertPt = MBB.SkipPHIsLabelsAndDebug(I: MBB.begin());
1149	if (MF.getTarget().getCodeModel() == CodeModel::Small &&
1150	!Subtarget->isPositionIndependent()) {
1151	// Check directly against a relocated immediate when we can.
1152	auto CheckI = BuildMI(BB&: MBB, I: InsertPt, MIMD: DebugLoc (), MCID: TII->get(Opcode: X86::CMP64ri32))
1153	.addReg(RegNo: TargetReg, flags: RegState::Kill)
1154	.addMBB(MBB: &MBB);
1155	++NumInstsInserted;
1156	(void)CheckI;
1157	LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
1158	} else {
1159	// Otherwise compute the address into a register first.
1160	Register AddrReg = MRI->createVirtualRegister(RegClass: &X86::GR64RegClass);
1161	auto AddrI =
1162	BuildMI(BB&: MBB, I: InsertPt, MIMD: DebugLoc (), MCID: TII->get(Opcode: X86::LEA64r), DestReg: AddrReg)
1163	.addReg(/Base/ RegNo: X86::RIP)
1164	.addImm(/Scale/ Val: `1`)
1165	.addReg(/Index/ RegNo: `0`)
1166	.addMBB(MBB: &MBB)
1167	.addReg(/Segment/ RegNo: `0`);
1168	++NumInstsInserted;
1169	(void)AddrI;
1170	LLVM_DEBUG(dbgs() << " Inserting lea: "; AddrI->dump(); dbgs() << "\n");
1171	auto CheckI = BuildMI(BB&: MBB, I: InsertPt, MIMD: DebugLoc (), MCID: TII->get(Opcode: X86::CMP64rr))
1172	.addReg(RegNo: TargetReg, flags: RegState::Kill)
1173	.addReg(RegNo: AddrReg, flags: RegState::Kill);
1174	++NumInstsInserted;
1175	(void)CheckI;
1176	LLVM_DEBUG(dbgs() << " Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
1177	}
1178
1179	// Now cmov over the predicate if the comparison wasn't equal.
// The cmov reads the function's initial predicate state register and the
// poison register, and defines a fresh register for this block.
1180	int PredStateSizeInBytes = TRI->getRegSizeInBits(RC: *PS ->RC) / `8`;
1181	auto CMovOp = X86::getCMovOpcode(RegBytes: PredStateSizeInBytes);
1182	Register UpdatedStateReg = MRI->createVirtualRegister(RegClass: PS ->RC);
1183	auto CMovI =
1184	BuildMI(BB&: MBB, I: InsertPt, MIMD: DebugLoc (), MCID: TII->get(Opcode: CMovOp), DestReg: UpdatedStateReg)
1185	.addReg(RegNo: PS ->InitialReg)
1186	.addReg(RegNo: PS ->PoisonReg)
1187	.addImm(Val: X86::COND_NE);
// NOTE(review): the result of findRegisterUseOperand is dereferenced
// without a null check, so this relies on the cmov carrying an EFLAGS use
// operand.
1188	CMovI ->findRegisterUseOperand(Reg: X86::EFLAGS, /TRI=/nullptr)
1189	->setIsKill(true);
1190	++NumInstsInserted;
1191	LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
1192	CMovs.push_back(Elt: &*CMovI);
1193
1194	// And put the new value into the available values for SSA form of our
1195	// predicate state.
1196	PS ->SSA.AddAvailableValue(BB: &MBB, V: UpdatedStateReg);
1197	}
1198
1199	// Return all the newly inserted cmov instructions of the predicate state.
1200	return CMovs;
1201	}
1202
1203	// Returns true if the MI has EFLAGS as a register def operand and it's live,
1204	// otherwise it returns false
1205	static bool isEFLAGSDefLive(const MachineInstr &MI) {
1206	if (const MachineOperand *DefOp =
1207	MI.findRegisterDefOperand(Reg: X86::EFLAGS, /TRI=/nullptr)) {
1208	return !DefOp->isDead();
1209	}
1210	return false;
1211	}
1212
1213	static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
1214	const TargetRegisterInfo &TRI) {
1215	// Check if EFLAGS are alive by seeing if there is a def of them or they
1216	// live-in, and then seeing if that def is in turn used.
1217	for (MachineInstr &MI : llvm::reverse(C: llvm::make_range(x: MBB.begin(), y: I))) {
1218	if (MachineOperand *DefOp =
1219	MI.findRegisterDefOperand(Reg: X86::EFLAGS, /TRI=/nullptr)) {
1220	// If the def is dead, then EFLAGS is not live.
1221	if (DefOp->isDead())
1222	return false;
1223
1224	// Otherwise we've def'ed it, and it is live.
1225	return true;
1226	}
1227	// While at this instruction, also check if we use and kill EFLAGS
1228	// which means it isn't live.
1229	if (MI.killsRegister(Reg: X86::EFLAGS, TRI: &TRI))
1230	return false;
1231	}
1232
1233	// If we didn't find anything conclusive (neither definitely alive or
1234	// definitely dead) return whether it lives into the block.
1235	return MBB.isLiveIn(Reg: X86::EFLAGS);
1236	}
1237
1238	/// Trace the predicate state through each of the blocks in the function,
1239	/// hardening everything necessary along the way.
1240	///
1241	/// We call this routine once the initial predicate state has been established
1242	/// for each basic block in the function in the SSA updater. This routine traces
1243	/// it through the instructions within each basic block, and for non-returning
1244	/// blocks informs the SSA updater about the final state that lives out of the
1245	/// block. Along the way, it hardens any vulnerable instruction using the
1246	/// currently valid predicate state. We have to do these two things together
1247	/// because the SSA updater only works across blocks. Within a block, we track
1248	/// the current predicate state directly and update it as it changes.
1249	///
1250	/// This operates in two passes over each block. First, we analyze the loads in
1251	/// the block to determine which strategy will be used to harden them: hardening
1252	/// the address or hardening the loaded value when loaded into a register
1253	/// amenable to hardening. We have to process these first because the two
1254	/// strategies may interact -- later hardening may change what strategy we wish
1255	/// to use. We also will analyze data dependencies between loads and avoid
1256	/// hardening those loads that are data dependent on a load with a hardened
1257	/// address. We also skip hardening loads already behind an LFENCE as that is
1258	/// sufficient to harden them against misspeculation.
1259	///
1260	/// Second, we actively trace the predicate state through the block, applying
1261	/// the hardening steps we determined necessary in the first pass as we go.
1262	///
1263	/// These two passes are applied to each basic block. We operate one block at a
1264	/// time to simplify reasoning about reachability and sequencing.
1265	void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
1266	MachineFunction &MF) {
// All of the containers below are per-block scratch state: they are filled
// while processing one block and cleared at the bottom of the block loop.
//
// Loads selected in the first pass to have their loaded value hardened.
1267	SmallPtrSet<MachineInstr *, `16`> HardenPostLoad;
// Loads selected in the first pass to have their address hardened.
1268	SmallPtrSet<MachineInstr *, `16`> HardenLoadAddr;
1269
// Registers the first pass has already committed to hardening: the base and
// index registers of address-hardened loads and the defs of post-load
// hardened loads.
1270	SmallSet<unsigned, `16`> HardenedAddrRegs;
1271
// Handed to hardenLoadAddr and hardenIndirectCallOrJumpInstr in the second
// pass; post-load hardened registers are also recorded here, mapped to
// themselves.
1272	SmallDenseMap<unsigned, unsigned, `32`> AddrRegToHardenedReg;
1273
1274	// Track the set of load-dependent registers through the basic block. Because
1275	// the values of these registers have an existing data dependency on a loaded
1276	// value which we would have checked, we can omit any checks on them.
1277	SparseBitVector<> LoadDepRegs;
1278
1279	for (MachineBasicBlock &MBB : MF) {
1280	// The first pass over the block: collect all the loads which can have their
1281	// loaded value hardened and all the loads that instead need their address
1282	// hardened. During this walk we propagate load dependence for address
1283	// hardened loads and also look for LFENCE to stop hardening wherever
1284	// possible. When deciding whether or not to harden the loaded value or not,
1285	// we check to see if any registers used in the address will have been
1286	// hardened at this point and if so, harden any remaining address registers
1287	// as that often successfully re-uses hardened addresses and minimizes
1288	// instructions.
1289	//
1290	// FIXME: We should consider an aggressive mode where we continue to keep as
1291	// many loads value hardened even when some address register hardening would
1292	// be free (due to reuse).
1293	//
1294	// Note that we only need this pass if we are actually hardening loads.
1295	if (HardenLoads)
1296	for (MachineInstr &MI : MBB) {
1297	// We naively assume that all def'ed registers of an instruction have
1298	// a data dependency on all of their operands.
1299	// FIXME: Do a more careful analysis of x86 to build a conservative
1300	// model here.
1301	if (llvm::any_of(Range: MI.uses(), P: [&](MachineOperand &Op) {
1302	return Op.isReg() && LoadDepRegs.test(Idx: Op.getReg());
1303	}))
1304	for (MachineOperand &Def : MI.defs())
1305	if (Def.isReg())
1306	LoadDepRegs.set(Def.getReg());
1307
1308	// Both Intel and AMD are guiding that they will change the semantics of
1309	// LFENCE to be a speculation barrier, so if we see an LFENCE, there is
1310	// no more need to guard things in this block.
1311	if (MI.getOpcode() == X86::LFENCE)
1312	break;
1313
1314	// If this instruction cannot load, nothing to do.
1315	if (!MI.mayLoad())
1316	continue;
1317
1318	// Some instructions which "load" are trivially safe or unimportant.
1319	if (MI.getOpcode() == X86::MFENCE)
1320	continue;
1321
1322	// Extract the memory operand information about this instruction.
1323	const int MemRefBeginIdx = X86::getFirstAddrOperandIdx(MI);
1324	if (MemRefBeginIdx < `0`) {
1325	LLVM_DEBUG(dbgs()
1326	<< "WARNING: unable to harden loading instruction: ";
1327	MI.dump());
1328	continue;
1329	}
1330
1331	MachineOperand &BaseMO =
1332	MI.getOperand(i: MemRefBeginIdx + X86::AddrBaseReg);
1333	MachineOperand &IndexMO =
1334	MI.getOperand(i: MemRefBeginIdx + X86::AddrIndexReg);
1335
1336	// If we have at least one (non-frame-index, non-RIP) register operand,
1337	// and neither operand is load-dependent, we need to check the load.
// A value of zero in BaseReg / IndexReg below means "no register of
// interest in this position".
1338	unsigned BaseReg = `0`, IndexReg = `0`;
1339	if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
1340	BaseMO.getReg() != X86::NoRegister)
1341	BaseReg = BaseMO.getReg();
1342	if (IndexMO.getReg() != X86::NoRegister)
1343	IndexReg = IndexMO.getReg();
1344
1345	if (!BaseReg && !IndexReg)
1346	// No register operands!
1347	continue;
1348
1349	// If any register operand is dependent, this load is dependent and we
1350	// needn't check it.
1351	// FIXME: Is this true in the case where we are hardening loads after
1352	// they complete? Unclear, need to investigate.
1353	if ((BaseReg && LoadDepRegs.test(Idx: BaseReg)) \|\|
1354	(IndexReg && LoadDepRegs.test(Idx: IndexReg)))
1355	continue;
1356
1357	// If post-load hardening is enabled, this load is compatible with
1358	// post-load hardening, and we aren't already going to harden one of the
1359	// address registers, queue it up to be hardened post-load. Notably,
1360	// even once hardened this won't introduce a useful dependency that
1361	// could prune out subsequent loads.
1362	if (EnablePostLoadHardening && X86InstrInfo::isDataInvariantLoad(MI) &&
1363	!isEFLAGSDefLive(MI) && MI.getDesc().getNumDefs() == `1` &&
1364	MI.getOperand(i: `0`).isReg() &&
1365	canHardenRegister(Reg: MI.getOperand(i: `0`).getReg()) &&
1366	!HardenedAddrRegs.count(V: BaseReg) &&
1367	!HardenedAddrRegs.count(V: IndexReg)) {
1368	HardenPostLoad.insert(Ptr: &MI);
1369	HardenedAddrRegs.insert(V: MI.getOperand(i: `0`).getReg());
1370	continue;
1371	}
1372
1373	// Record this instruction for address hardening and record its register
1374	// operands as being address-hardened.
1375	HardenLoadAddr.insert(Ptr: &MI);
1376	if (BaseReg)
1377	HardenedAddrRegs.insert(V: BaseReg);
1378	if (IndexReg)
1379	HardenedAddrRegs.insert(V: IndexReg);
1380
// The defs of an address-hardened load become load-dependent, which lets
// later loads in this block that use them skip hardening.
1381	for (MachineOperand &Def : MI.defs())
1382	if (Def.isReg())
1383	LoadDepRegs.set(Def.getReg());
1384	}
1385
1386	// Now re-walk the instructions in the basic block, and apply whichever
1387	// hardening strategy we have elected. Note that we do this in a second
1388	// pass specifically so that we have the complete set of instructions for
1389	// which we will do post-load hardening and can defer it in certain
1390	// circumstances.
1391	for (MachineInstr &MI : MBB) {
1392	if (HardenLoads) {
1393	// We cannot both require hardening the def of a load and its address.
1394	assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
1395	"Requested to harden both the address and def of a load!");
1396
1397	// Check if this is a load whose address needs to be hardened.
1398	if (HardenLoadAddr.erase(Ptr: &MI)) {
1399	const int MemRefBeginIdx = X86::getFirstAddrOperandIdx(MI);
1400	assert(MemRefBeginIdx >= `0` && "Cannot have an invalid index here!");
1401
1402	MachineOperand &BaseMO =
1403	MI.getOperand(i: MemRefBeginIdx + X86::AddrBaseReg);
1404	MachineOperand &IndexMO =
1405	MI.getOperand(i: MemRefBeginIdx + X86::AddrIndexReg);
1406	hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
1407	continue;
1408	}
1409
1410	// Test if this instruction is one of our post load instructions (and
1411	// remove it from the set if so).
1412	if (HardenPostLoad.erase(Ptr: &MI)) {
1413	assert(!MI.isCall() && "Must not try to post-load harden a call!");
1414
1415	// If this is a data-invariant load and there is no EFLAGS
1416	// interference, we want to try and sink any hardening as far as
1417	// possible.
1418	if (X86InstrInfo::isDataInvariantLoad(MI) && !isEFLAGSDefLive(MI)) {
1419	// Sink the instruction we'll need to harden as far as we can down
1420	// the graph.
1421	MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenedInstrs&: HardenPostLoad);
1422
1423	// If we managed to sink this instruction, update everything so we
1424	// harden that instruction when we reach it in the instruction
1425	// sequence.
1426	if (SunkMI != &MI) {
1427	// If in sinking there was no instruction needing to be hardened,
1428	// we're done.
1429	if (!SunkMI)
1430	continue;
1431
1432	// Otherwise, add this to the set of defs we harden.
1433	HardenPostLoad.insert(Ptr: SunkMI);
1434	continue;
1435	}
1436	}
1437
1438	unsigned HardenedReg = hardenPostLoad(MI);
1439
1440	// Mark the resulting hardened register as such so we don't re-harden.
1441	AddrRegToHardenedReg [HardenedReg] = HardenedReg;
1442
1443	continue;
1444	}
1445
1446	// Check for an indirect call or branch that may need its input hardened
1447	// even if we couldn't find the specific load used, or were able to
1448	// avoid hardening it for some reason. Note that here we cannot break
1449	// out afterward as we may still need to handle any call aspect of this
1450	// instruction.
1451	if ((MI.isCall() \|\| MI.isBranch()) && HardenIndirectCallsAndJumps)
1452	hardenIndirectCallOrJumpInstr(MI, AddrRegToHardenedReg);
1453	}
1454
1455	// After we finish hardening loads we handle interprocedural hardening if
1456	// enabled and relevant for this instruction.
1457	if (!HardenInterprocedurally)
1458	continue;
1459	if (!MI.isCall() && !MI.isReturn())
1460	continue;
1461
1462	// If this is a direct return (IE, not a tail call) just directly harden
1463	// it.
1464	if (MI.isReturn() && !MI.isCall()) {
1465	hardenReturnInstr(MI);
1466	continue;
1467	}
1468
1469	// Otherwise we have a call. We need to handle transferring the predicate
1470	// state into a call and recovering it after the call returns (unless this
1471	// is a tail call).
1472	assert(MI.isCall() && "Should only reach here for calls!");
1473	tracePredStateThroughCall(MI);
1474	}
1475
// Reset the per-block scratch state before moving on to the next block.
1476	HardenPostLoad.clear();
1477	HardenLoadAddr.clear();
1478	HardenedAddrRegs.clear();
1479	AddrRegToHardenedReg.clear();
1480
1481	// Currently, we only track data-dependent loads within a basic block.
1482	// FIXME: We should see if this is necessary or if we could be more
1483	// aggressive here without opening up attack avenues.
1484	LoadDepRegs.clear();
1485	}
1486	}
1487
1488	/// Save EFLAGS into the returned GPR. This can in turn be restored with
1489	/// `restoreEFLAGS`.
1490	///
1491	/// Note that LLVM can only lower very simple patterns of saved and restored
1492	/// EFLAGS registers. The restore should always be within the same basic block
1493	/// as the save so that no PHI nodes are inserted.
1494	unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
1495	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1496	const DebugLoc &Loc) {
1497	// FIXME: Hard coding this to a 32-bit register class seems weird, but matches
1498	// what instruction selection does.
1499	Register Reg = MRI->createVirtualRegister(RegClass: &X86::GR32RegClass);
1500	// We directly copy the FLAGS register and rely on later lowering to clean
1501	// this up into the appropriate setCC instructions.
1502	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::COPY), DestReg: Reg).addReg(RegNo: X86::EFLAGS);
1503	++NumInstsInserted;
1504	return Reg;
1505	}
1506
1507	/// Restore EFLAGS from the provided GPR. This should be produced by
1508	/// `saveEFLAGS`.
1509	///
1510	/// This must be done within the same basic block as the save in order to
1511	/// reliably lower.
1512	void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
1513	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1514	const DebugLoc &Loc, Register Reg) {
1515	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::COPY), DestReg: X86::EFLAGS).addReg(RegNo: Reg);
1516	++NumInstsInserted;
1517	}
1518
1519	/// Takes the current predicate state (in a register) and merges it into the
1520	/// stack pointer. The state is essentially a single bit, but we merge this in
1521	/// a way that won't form non-canonical pointers and also will be preserved
1522	/// across normal stack adjustments.
1523	void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
1524	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1525	const DebugLoc &Loc, unsigned PredStateReg) {
1526	Register TmpReg = MRI->createVirtualRegister(RegClass: PS ->RC);
1527	// FIXME: This hard codes a shift distance based on the number of bits needed
1528	// to stay canonical on 64-bit. We should compute this somehow and support
1529	// 32-bit as part of that.
1530	auto ShiftI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::SHL64ri), DestReg: TmpReg)
1531	.addReg(RegNo: PredStateReg, flags: RegState::Kill)
1532	.addImm(Val: `47`);
1533	ShiftI ->addRegisterDead(Reg: X86::EFLAGS, RegInfo: TRI);
1534	++NumInstsInserted;
1535	auto OrI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::OR64rr), DestReg: X86::RSP)
1536	.addReg(RegNo: X86::RSP)
1537	.addReg(RegNo: TmpReg, flags: RegState::Kill);
1538	OrI ->addRegisterDead(Reg: X86::EFLAGS, RegInfo: TRI);
1539	++NumInstsInserted;
1540	}
1541
1542	/// Extracts the predicate state stored in the high bits of the stack pointer.
1543	unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
1544	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1545	const DebugLoc &Loc) {
1546	Register PredStateReg = MRI->createVirtualRegister(RegClass: PS ->RC);
1547	Register TmpReg = MRI->createVirtualRegister(RegClass: PS ->RC);
1548
1549	// We know that the stack pointer will have any preserved predicate state in
1550	// its high bit. We just want to smear this across the other bits. Turns out,
1551	// this is exactly what an arithmetic right shift does.
1552	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: TmpReg)
1553	.addReg(RegNo: X86::RSP);
1554	auto ShiftI =
1555	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::SAR64ri), DestReg: PredStateReg)
1556	.addReg(RegNo: TmpReg, flags: RegState::Kill)
1557	.addImm(Val: TRI->getRegSizeInBits(RC: *PS ->RC) - `1`);
1558	ShiftI ->addRegisterDead(Reg: X86::EFLAGS, RegInfo: TRI);
1559	++NumInstsInserted;
1560
1561	return PredStateReg;
1562	}
1563
1564	void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
1565	MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
1566	SmallDenseMap<unsigned, unsigned, `32`> &AddrRegToHardenedReg) {
1567	MachineBasicBlock &MBB = *MI.getParent();
1568	const DebugLoc &Loc = MI.getDebugLoc();
1569
1570	// Check if EFLAGS are alive by seeing if there is a def of them or they
1571	// live-in, and then seeing if that def is in turn used.
1572	bool EFLAGSLive = isEFLAGSLive(MBB, I: MI.getIterator(), TRI: *TRI);
1573
1574	SmallVector<MachineOperand *, `2`> HardenOpRegs;
1575
1576	if (BaseMO.isFI()) {
1577	// A frame index is never a dynamically controllable load, so only
1578	// harden it if we're covering fixed address loads as well.
1579	LLVM_DEBUG(
1580	dbgs() << " Skipping hardening base of explicit stack frame load: ";
1581	MI.dump(); dbgs() << "\n");
1582	} else if (BaseMO.getReg() == X86::RSP) {
1583	// Some idempotent atomic operations are lowered directly to a locked
1584	// OR with 0 to the top of stack(or slightly offset from top) which uses an
1585	// explicit RSP register as the base.
1586	assert(IndexMO.getReg() == X86::NoRegister &&
1587	"Explicit RSP access with dynamic index!");
1588	LLVM_DEBUG(
1589	dbgs() << " Cannot harden base of explicit RSP offset in a load!");
1590	} else if (BaseMO.getReg() == X86::RIP \|\|
1591	BaseMO.getReg() == X86::NoRegister) {
1592	// For both RIP-relative addressed loads or absolute loads, we cannot
1593	// meaningfully harden them because the address being loaded has no
1594	// dynamic component.
1595	//
1596	// FIXME: When using a segment base (like TLS does) we end up with the
1597	// dynamic address being the base plus -1 because we can't mutate the
1598	// segment register here. This allows the signed 32-bit offset to point at
1599	// valid segment-relative addresses and load them successfully.
1600	LLVM_DEBUG(
1601	dbgs() << " Cannot harden base of "
1602	<< (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
1603	<< " address in a load!");
1604	} else {
1605	assert(BaseMO.isReg() &&
1606	"Only allowed to have a frame index or register base.");
1607	HardenOpRegs.push_back(Elt: &BaseMO);
1608	}
1609
1610	if (IndexMO.getReg() != X86::NoRegister &&
1611	(HardenOpRegs.empty() \|\|
1612	HardenOpRegs.front()->getReg() != IndexMO.getReg()))
1613	HardenOpRegs.push_back(Elt: &IndexMO);
1614
1615	assert((HardenOpRegs.size() == `1` \|\| HardenOpRegs.size() == `2`) &&
1616	"Should have exactly one or two registers to harden!");
1617	assert((HardenOpRegs.size() == `1` \|\|
1618	HardenOpRegs[`0`]->getReg() != HardenOpRegs[`1`]->getReg()) &&
1619	"Should not have two of the same registers!");
1620
1621	// Remove any registers that have alreaded been checked.
1622	llvm::erase_if(C&: HardenOpRegs, P: [&](MachineOperand *Op) {
1623	// See if this operand's register has already been checked.
1624	auto It = AddrRegToHardenedReg.find(Val: Op->getReg());
1625	if (It == AddrRegToHardenedReg.end())
1626	// Not checked, so retain this one.
1627	return false;
1628
1629	// Otherwise, we can directly update this operand and remove it.
1630	Op->setReg(It ->second);
1631	return true;
1632	});
1633	// If there are none left, we're done.
1634	if (HardenOpRegs.empty())
1635	return;
1636
1637	// Compute the current predicate state.
1638	Register StateReg = PS ->SSA.GetValueAtEndOfBlock(BB: &MBB);
1639
1640	auto InsertPt = MI.getIterator();
1641
1642	// If EFLAGS are live and we don't have access to instructions that avoid
1643	// clobbering EFLAGS we need to save and restore them. This in turn makes
1644	// the EFLAGS no longer live.
1645	unsigned FlagsReg = `0`;
1646	if (EFLAGSLive && !Subtarget->hasBMI2()) {
1647	EFLAGSLive = false;
1648	FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1649	}
1650
1651	for (MachineOperand *Op : HardenOpRegs) {
1652	Register OpReg = Op->getReg();
1653	auto *OpRC = MRI->getRegClass(Reg: OpReg);
1654	Register TmpReg = MRI->createVirtualRegister(RegClass: OpRC);
1655
1656	// If this is a vector register, we'll need somewhat custom logic to handle
1657	// hardening it.
1658	if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(RC: &X86::VR128RegClass) \|\|
1659	OpRC->hasSuperClassEq(RC: &X86::VR256RegClass))) {
1660	assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
1661	bool Is128Bit = OpRC->hasSuperClassEq(RC: &X86::VR128RegClass);
1662
1663	// Move our state into a vector register.
1664	// FIXME: We could skip this at the cost of longer encodings with AVX-512
1665	// but that doesn't seem likely worth it.
1666	Register VStateReg = MRI->createVirtualRegister(RegClass: &X86::VR128RegClass);
1667	auto MovI =
1668	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::VMOV64toPQIrr), DestReg: VStateReg)
1669	.addReg(RegNo: StateReg);
1670	(void)MovI;
1671	++NumInstsInserted;
1672	LLVM_DEBUG(dbgs() << " Inserting mov: "; MovI->dump(); dbgs() << "\n");
1673
1674	// Broadcast it across the vector register.
1675	Register VBStateReg = MRI->createVirtualRegister(RegClass: OpRC);
1676	auto BroadcastI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc,
1677	MCID: TII->get(Opcode: Is128Bit ? X86::VPBROADCASTQrr
1678	: X86::VPBROADCASTQYrr),
1679	DestReg: VBStateReg)
1680	.addReg(RegNo: VStateReg);
1681	(void)BroadcastI;
1682	++NumInstsInserted;
1683	LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
1684	dbgs() << "\n");
1685
1686	// Merge our potential poison state into the value with a vector or.
1687	auto OrI =
1688	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc,
1689	MCID: TII->get(Opcode: Is128Bit ? X86::VPORrr : X86::VPORYrr), DestReg: TmpReg)
1690	.addReg(RegNo: VBStateReg)
1691	.addReg(RegNo: OpReg);
1692	(void)OrI;
1693	++NumInstsInserted;
1694	LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1695	} else if (OpRC->hasSuperClassEq(RC: &X86::VR128XRegClass) \|\|
1696	OpRC->hasSuperClassEq(RC: &X86::VR256XRegClass) \|\|
1697	OpRC->hasSuperClassEq(RC: &X86::VR512RegClass)) {
1698	assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
1699	bool Is128Bit = OpRC->hasSuperClassEq(RC: &X86::VR128XRegClass);
1700	bool Is256Bit = OpRC->hasSuperClassEq(RC: &X86::VR256XRegClass);
1701	if (Is128Bit \|\| Is256Bit)
1702	assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
1703
1704	// Broadcast our state into a vector register.
1705	Register VStateReg = MRI->createVirtualRegister(RegClass: OpRC);
1706	unsigned BroadcastOp = Is128Bit ? X86::VPBROADCASTQrZ128rr
1707	: Is256Bit ? X86::VPBROADCASTQrZ256rr
1708	: X86::VPBROADCASTQrZrr;
1709	auto BroadcastI =
1710	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: BroadcastOp), DestReg: VStateReg)
1711	.addReg(RegNo: StateReg);
1712	(void)BroadcastI;
1713	++NumInstsInserted;
1714	LLVM_DEBUG(dbgs() << " Inserting broadcast: "; BroadcastI->dump();
1715	dbgs() << "\n");
1716
1717	// Merge our potential poison state into the value with a vector or.
1718	unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
1719	: Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
1720	auto OrI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: OrOp), DestReg: TmpReg)
1721	.addReg(RegNo: VStateReg)
1722	.addReg(RegNo: OpReg);
1723	(void)OrI;
1724	++NumInstsInserted;
1725	LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1726	} else {
1727	// FIXME: Need to support GR32 here for 32-bit code.
1728	assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
1729	"Not a supported register class for address hardening!");
1730
1731	if (!EFLAGSLive) {
1732	// Merge our potential poison state into the value with an or.
1733	auto OrI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::OR64rr), DestReg: TmpReg)
1734	.addReg(RegNo: StateReg)
1735	.addReg(RegNo: OpReg);
1736	OrI ->addRegisterDead(Reg: X86::EFLAGS, RegInfo: TRI);
1737	++NumInstsInserted;
1738	LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1739	} else {
1740	// We need to avoid touching EFLAGS so shift out all but the least
1741	// significant bit using the instruction that doesn't update flags.
1742	auto ShiftI =
1743	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::SHRX64rr), DestReg: TmpReg)
1744	.addReg(RegNo: OpReg)
1745	.addReg(RegNo: StateReg);
1746	(void)ShiftI;
1747	++NumInstsInserted;
1748	LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump();
1749	dbgs() << "\n");
1750	}
1751	}
1752
1753	// Record this register as checked and update the operand.
1754	assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
1755	"Should not have checked this register yet!");
1756	AddrRegToHardenedReg [Op->getReg()] = TmpReg;
1757	Op->setReg(TmpReg);
1758	++NumAddrRegsHardened;
1759	}
1760
1761	// And restore the flags if needed.
1762	if (FlagsReg)
1763	restoreEFLAGS(MBB, InsertPt, Loc, Reg: FlagsReg);
1764	}
1765
1766	MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
1767	MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
1768	assert(X86InstrInfo::isDataInvariantLoad(InitialMI) &&
1769	"Cannot get here with a non-invariant load!");
1770	assert(!isEFLAGSDefLive(InitialMI) &&
1771	"Cannot get here with a data invariant load "
1772	"that interferes with EFLAGS!");
1773
1774	// See if we can sink hardening the loaded value.
1775	auto SinkCheckToSingleUse =
1776	[&](MachineInstr &MI) -> std::optional<MachineInstr *> {
1777	Register DefReg = MI.getOperand(i: `0`).getReg();
1778
1779	// We need to find a single use which we can sink the check. We can
1780	// primarily do this because many uses may already end up checked on their
1781	// own.
1782	MachineInstr SingleUseMI = nullptr*;
1783	for (MachineInstr &UseMI : MRI->use_instructions(Reg: DefReg)) {
1784	// If we're already going to harden this use, it is data invariant, it
1785	// does not interfere with EFLAGS, and within our block.
1786	if (HardenedInstrs.count(Ptr: &UseMI)) {
1787	if (!X86InstrInfo::isDataInvariantLoad(MI&: UseMI) \|\| isEFLAGSDefLive(MI: UseMI)) {
1788	// If we've already decided to harden a non-load, we must have sunk
1789	// some other post-load hardened instruction to it and it must itself
1790	// be data-invariant.
1791	assert(X86InstrInfo::isDataInvariant(UseMI) &&
1792	"Data variant instruction being hardened!");
1793	continue;
1794	}
1795
1796	// Otherwise, this is a load and the load component can't be data
1797	// invariant so check how this register is being used.
1798	const int MemRefBeginIdx = X86::getFirstAddrOperandIdx(MI: UseMI);
1799	assert(MemRefBeginIdx >= `0` &&
1800	"Should always have mem references here!");
1801
1802	MachineOperand &BaseMO =
1803	UseMI.getOperand(i: MemRefBeginIdx + X86::AddrBaseReg);
1804	MachineOperand &IndexMO =
1805	UseMI.getOperand(i: MemRefBeginIdx + X86::AddrIndexReg);
1806	if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) \|\|
1807	(IndexMO.isReg() && IndexMO.getReg() == DefReg))
1808	// The load uses the register as part of its address making it not
1809	// invariant.
1810	return {};
1811
1812	continue;
1813	}
1814
1815	if (SingleUseMI)
1816	// We already have a single use, this would make two. Bail.
1817	return {};
1818
1819	// If this single use isn't data invariant, isn't in this block, or has
1820	// interfering EFLAGS, we can't sink the hardening to it.
1821	if (!X86InstrInfo::isDataInvariant(MI&: UseMI) \|\| UseMI.getParent() != MI.getParent() \|\|
1822	isEFLAGSDefLive(MI: UseMI))
1823	return {};
1824
1825	// If this instruction defines multiple registers bail as we won't harden
1826	// all of them.
1827	if (UseMI.getDesc().getNumDefs() > `1`)
1828	return {};
1829
1830	// If this register isn't a virtual register we can't walk uses of sanely,
1831	// just bail. Also check that its register class is one of the ones we
1832	// can harden.
1833	Register UseDefReg = UseMI.getOperand(i: `0`).getReg();
1834	if (!canHardenRegister(Reg: UseDefReg))
1835	return {};
1836
1837	SingleUseMI = &UseMI;
1838	}
1839
1840	// If SingleUseMI is still null, there is no use that needs its own
1841	// checking. Otherwise, it is the single use that needs checking.
1842	return {SingleUseMI};
1843	};
1844
1845	MachineInstr *MI = &InitialMI;
1846	while (std::optional<MachineInstr > SingleUse = SinkCheckToSingleUse (MI)) {
1847	// Update which MI we're checking now.
1848	MI = *SingleUse;
1849	if (!MI)
1850	break;
1851	}
1852
1853	return MI;
1854	}
1855
1856	bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) {
1857	// We only support hardening virtual registers.
1858	if (!Reg.isVirtual())
1859	return false;
1860
1861	auto *RC = MRI->getRegClass(Reg);
1862	int RegBytes = TRI->getRegSizeInBits(RC: *RC) / `8`;
1863	if (RegBytes > `8`)
1864	// We don't support post-load hardening of vectors.
1865	return false;
1866
1867	unsigned RegIdx = Log2_32(Value: RegBytes);
1868	assert(RegIdx < `4` && "Unsupported register size");
1869
1870	// If this register class is explicitly constrained to a class that doesn't
1871	// require REX prefix, we may not be able to satisfy that constraint when
1872	// emitting the hardening instructions, so bail out here.
1873	// FIXME: This seems like a pretty lame hack. The way this comes up is when we
1874	// end up both with a NOREX and REX-only register as operands to the hardening
1875	// instructions. It would be better to fix that code to handle this situation
1876	// rather than hack around it in this way.
1877	const TargetRegisterClass *NOREXRegClasses[] = {
1878	&X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
1879	&X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
1880	if (RC == NOREXRegClasses[RegIdx])
1881	return false;
1882
1883	const TargetRegisterClass *GPRRegClasses[] = {
1884	&X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
1885	&X86::GR64RegClass};
1886	return RC->hasSuperClassEq(RC: GPRRegClasses[RegIdx]);
1887	}
1888
1889	/// Harden a value in a register.
1890	///
1891	/// This is the low-level logic to fully harden a value sitting in a register
1892	/// against leaking during speculative execution.
1893	///
1894	/// Unlike hardening an address that is used by a load, this routine is required
1895	/// to hide all* incoming bits in the register.*
1896	///
1897	/// `Reg` must be a virtual register. Currently, it is required to be a GPR no
1898	/// larger than the predicate state register. FIXME: We should support vector
1899	/// registers here by broadcasting the predicate state.
1900	///
1901	/// The new, hardened virtual register is returned. It will have the same
1902	/// register class as `Reg`.
1903	unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
1904	Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1905	const DebugLoc &Loc) {
1906	assert(canHardenRegister(Reg) && "Cannot harden this register!");
1907
1908	auto *RC = MRI->getRegClass(Reg);
1909	int Bytes = TRI->getRegSizeInBits(RC: *RC) / `8`;
1910	Register StateReg = PS ->SSA.GetValueAtEndOfBlock(BB: &MBB);
1911	assert((Bytes == `1` \|\| Bytes == `2` \|\| Bytes == `4` \|\| Bytes == `8`) &&
1912	"Unknown register size");
1913
1914	// FIXME: Need to teach this about 32-bit mode.
1915	if (Bytes != `8`) {
1916	unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
1917	unsigned SubRegImm = SubRegImms[Log2_32(Value: Bytes)];
1918	Register NarrowStateReg = MRI->createVirtualRegister(RegClass: RC);
1919	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NarrowStateReg)
1920	.addReg(RegNo: StateReg, flags: `0`, SubReg: SubRegImm);
1921	StateReg = NarrowStateReg;
1922	}
1923
1924	unsigned FlagsReg = `0`;
1925	if (isEFLAGSLive(MBB, I: InsertPt, TRI: *TRI))
1926	FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
1927
1928	Register NewReg = MRI->createVirtualRegister(RegClass: RC);
1929	unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
1930	unsigned OrOpCode = OrOpCodes[Log2_32(Value: Bytes)];
1931	auto OrI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: OrOpCode), DestReg: NewReg)
1932	.addReg(RegNo: StateReg)
1933	.addReg(RegNo: Reg);
1934	OrI ->addRegisterDead(Reg: X86::EFLAGS, RegInfo: TRI);
1935	++NumInstsInserted;
1936	LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n");
1937
1938	if (FlagsReg)
1939	restoreEFLAGS(MBB, InsertPt, Loc, Reg: FlagsReg);
1940
1941	return NewReg;
1942	}
1943
1944	/// Harden a load by hardening the loaded value in the defined register.
1945	///
1946	/// We can harden a non-leaking load into a register without touching the
1947	/// address by just hiding all of the loaded bits during misspeculation. We use
1948	/// an `or` instruction to do this because we set up our poison value as all
1949	/// ones. And the goal is just for the loaded bits to not be exposed to
1950	/// execution and coercing them to one is sufficient.
1951	///
1952	/// Returns the newly hardened register.
1953	unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
1954	MachineBasicBlock &MBB = *MI.getParent();
1955	const DebugLoc &Loc = MI.getDebugLoc();
1956
1957	auto &DefOp = MI.getOperand(i: `0`);
1958	Register OldDefReg = DefOp.getReg();
1959	auto *DefRC = MRI->getRegClass(Reg: OldDefReg);
1960
1961	// Because we want to completely replace the uses of this def'ed value with
1962	// the hardened value, create a dedicated new register that will only be used
1963	// to communicate the unhardened value to the hardening.
1964	Register UnhardenedReg = MRI->createVirtualRegister(RegClass: DefRC);
1965	DefOp.setReg(UnhardenedReg);
1966
1967	// Now harden this register's value, getting a hardened reg that is safe to
1968	// use. Note that we insert the instructions to compute this after* the*
1969	// defining instruction, not before it.
1970	unsigned HardenedReg = hardenValueInRegister(
1971	Reg: UnhardenedReg, MBB, InsertPt: std::next(x: MI.getIterator()), Loc);
1972
1973	// Finally, replace the old register (which now only has the uses of the
1974	// original def) with the hardened register.
1975	MRI->replaceRegWith(/FromReg/ OldDefReg, /ToReg/ HardenedReg);
1976
1977	++NumPostLoadRegsHardened;
1978	return HardenedReg;
1979	}
1980
1981	/// Harden a return instruction.
1982	///
1983	/// Returns implicitly perform a load which we need to harden. Without hardening
1984	/// this load, an attacker my speculatively write over the return address to
1985	/// steer speculation of the return to an attacker controlled address. This is
1986	/// called Spectre v1.1 or Bounds Check Bypass Store (BCBS) and is described in
1987	/// this paper:
1988	/// https://people.csail.mit.edu/vlk/spectre11.pdf
1989	///
1990	/// We can harden this by introducing an LFENCE that will delay any load of the
1991	/// return address until prior instructions have retired (and thus are not being
1992	/// speculated), or we can harden the address used by the implicit load: the
1993	/// stack pointer.
1994	///
1995	/// If we are not using an LFENCE, hardening the stack pointer has an additional
1996	/// benefit: it allows us to pass the predicate state accumulated in this
1997	/// function back to the caller. In the absence of a BCBS attack on the return,
1998	/// the caller will typically be resumed and speculatively executed due to the
1999	/// Return Stack Buffer (RSB) prediction which is very accurate and has a high
2000	/// priority. It is possible that some code from the caller will be executed
2001	/// speculatively even during a BCBS-attacked return until the steering takes
2002	/// effect. Whenever this happens, the caller can recover the (poisoned)
2003	/// predicate state from the stack pointer and continue to harden loads.
2004	void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
2005	MachineBasicBlock &MBB = *MI.getParent();
2006	const DebugLoc &Loc = MI.getDebugLoc();
2007	auto InsertPt = MI.getIterator();
2008
2009	if (FenceCallAndRet)
2010	// No need to fence here as we'll fence at the return site itself. That
2011	// handles more cases than we can handle here.
2012	return;
2013
2014	// Take our predicate state, shift it to the high 17 bits (so that we keep
2015	// pointers canonical) and merge it into RSP. This will allow the caller to
2016	// extract it when we return (speculatively).
2017	mergePredStateIntoSP(MBB, InsertPt, Loc, PredStateReg: PS ->SSA.GetValueAtEndOfBlock(BB: &MBB));
2018	}
2019
2020	/// Trace the predicate state through a call.
2021	///
2022	/// There are several layers of this needed to handle the full complexity of
2023	/// calls.
2024	///
2025	/// First, we need to send the predicate state into the called function. We do
2026	/// this by merging it into the high bits of the stack pointer.
2027	///
2028	/// For tail calls, this is all we need to do.
2029	///
2030	/// For calls where we might return and resume the control flow, we need to
2031	/// extract the predicate state from the high bits of the stack pointer after
2032	/// control returns from the called function.
2033	///
2034	/// We also need to verify that we intended to return to this location in the
2035	/// code. An attacker might arrange for the processor to mispredict the return
2036	/// to this valid but incorrect return address in the program rather than the
2037	/// correct one. See the paper on this attack, called "ret2spec" by the
2038	/// researchers, here:
2039	/// https://christian-rossow.de/publications/ret2spec-ccs2018.pdf
2040	///
2041	/// The way we verify that we returned to the correct location is by preserving
2042	/// the expected return address across the call. One technique involves taking
2043	/// advantage of the red-zone to load the return address from `8(%rsp)` where it
2044	/// was left by the RET instruction when it popped `%rsp`. Alternatively, we can
2045	/// directly save the address into a register that will be preserved across the
2046	/// call. We compare this intended return address against the address
2047	/// immediately following the call (the observed return address). If these
2048	/// mismatch, we have detected misspeculation and can poison our predicate
2049	/// state.
2050	void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
2051	MachineInstr &MI) {
2052	MachineBasicBlock &MBB = *MI.getParent();
2053	MachineFunction &MF = *MBB.getParent();
2054	auto InsertPt = MI.getIterator();
2055	const DebugLoc &Loc = MI.getDebugLoc();
2056
2057	if (FenceCallAndRet) {
2058	if (MI.isReturn())
2059	// Tail call, we don't return to this function.
2060	// FIXME: We should also handle noreturn calls.
2061	return;
2062
2063	// We don't need to fence before the call because the function should fence
2064	// in its entry. However, we do need to fence after the call returns.
2065	// Fencing before the return doesn't correctly handle cases where the return
2066	// itself is mispredicted.
2067	BuildMI(BB&: MBB, I: std::next(x: InsertPt), MIMD: Loc, MCID: TII->get(Opcode: X86::LFENCE));
2068	++NumInstsInserted;
2069	++NumLFENCEsInserted;
2070	return;
2071	}
2072
2073	// First, we transfer the predicate state into the called function by merging
2074	// it into the stack pointer. This will kill the current def of the state.
2075	Register StateReg = PS ->SSA.GetValueAtEndOfBlock(BB: &MBB);
2076	mergePredStateIntoSP(MBB, InsertPt, Loc, PredStateReg: StateReg);
2077
2078	// If this call is also a return, it is a tail call and we don't need anything
2079	// else to handle it so just return. Also, if there are no further
2080	// instructions and no successors, this call does not return so we can also
2081	// bail.
2082	if (MI.isReturn() \|\| (std::next(x: InsertPt) == MBB.end() && MBB.succ_empty()))
2083	return;
2084
2085	// Create a symbol to track the return address and attach it to the call
2086	// machine instruction. We will lower extra symbols attached to call
2087	// instructions as label immediately following the call.
2088	MCSymbol *RetSymbol =
2089	MF.getContext().createTempSymbol(Name: "slh_ret_addr",
2090	/AlwaysAddSuffix/ true);
2091	MI.setPostInstrSymbol(MF, Symbol: RetSymbol);
2092
2093	const TargetRegisterClass *AddrRC = &X86::GR64RegClass;
2094	unsigned ExpectedRetAddrReg = `0`;
2095
2096	// If we have no red zones or if the function returns twice (possibly without
2097	// using the `ret` instruction) like setjmp, we need to save the expected
2098	// return address prior to the call.
2099	if (!Subtarget->getFrameLowering()->has128ByteRedZone(MF) \|\|
2100	MF.exposesReturnsTwice()) {
2101	// If we don't have red zones, we need to compute the expected return
2102	// address prior to the call and store it in a register that lives across
2103	// the call.
2104	//
2105	// In some ways, this is doubly satisfying as a mitigation because it will
2106	// also successfully detect stack smashing bugs in some cases (typically,
2107	// when a callee-saved register is used and the callee doesn't push it onto
2108	// the stack). But that isn't our primary goal, so we only use it as
2109	// a fallback.
2110	//
2111	// FIXME: It isn't clear that this is reliable in the face of
2112	// rematerialization in the register allocator. We somehow need to force
2113	// that to not occur for this particular instruction, and instead to spill
2114	// or otherwise preserve the value computed prior* to the call.*
2115	//
2116	// FIXME: It is even less clear why MachineCSE can't just fold this when we
2117	// end up having to use identical instructions both before and after the
2118	// call to feed the comparison.
2119	ExpectedRetAddrReg = MRI->createVirtualRegister(RegClass: AddrRC);
2120	if (MF.getTarget().getCodeModel() == CodeModel::Small &&
2121	!Subtarget->isPositionIndependent()) {
2122	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::MOV64ri32), DestReg: ExpectedRetAddrReg)
2123	.addSym(Sym: RetSymbol);
2124	} else {
2125	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::LEA64r), DestReg: ExpectedRetAddrReg)
2126	.addReg(/Base/ RegNo: X86::RIP)
2127	.addImm(/Scale/ Val: `1`)
2128	.addReg(/Index/ RegNo: `0`)
2129	.addSym(Sym: RetSymbol)
2130	.addReg(/Segment/ RegNo: `0`);
2131	}
2132	}
2133
2134	// Step past the call to handle when it returns.
2135	++InsertPt;
2136
2137	// If we didn't pre-compute the expected return address into a register, then
2138	// red zones are enabled and the return address is still available on the
2139	// stack immediately after the call. As the very first instruction, we load it
2140	// into a register.
2141	if (!ExpectedRetAddrReg) {
2142	ExpectedRetAddrReg = MRI->createVirtualRegister(RegClass: AddrRC);
2143	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::MOV64rm), DestReg: ExpectedRetAddrReg)
2144	.addReg(/Base/ RegNo: X86::RSP)
2145	.addImm(/Scale/ Val: `1`)
2146	.addReg(/Index/ RegNo: `0`)
2147	.addImm(/Displacement/ Val: -`8`) // The stack pointer has been popped, so
2148	// the return address is 8-bytes past it.
2149	.addReg(/Segment/ RegNo: `0`);
2150	}
2151
2152	// Now we extract the callee's predicate state from the stack pointer.
2153	unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
2154
2155	// Test the expected return address against our actual address. If we can
2156	// form this basic block's address as an immediate, this is easy. Otherwise
2157	// we compute it.
2158	if (MF.getTarget().getCodeModel() == CodeModel::Small &&
2159	!Subtarget->isPositionIndependent()) {
2160	// FIXME: Could we fold this with the load? It would require careful EFLAGS
2161	// management.
2162	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::CMP64ri32))
2163	.addReg(RegNo: ExpectedRetAddrReg, flags: RegState::Kill)
2164	.addSym(Sym: RetSymbol);
2165	} else {
2166	Register ActualRetAddrReg = MRI->createVirtualRegister(RegClass: AddrRC);
2167	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::LEA64r), DestReg: ActualRetAddrReg)
2168	.addReg(/Base/ RegNo: X86::RIP)
2169	.addImm(/Scale/ Val: `1`)
2170	.addReg(/Index/ RegNo: `0`)
2171	.addSym(Sym: RetSymbol)
2172	.addReg(/Segment/ RegNo: `0`);
2173	BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: X86::CMP64rr))
2174	.addReg(RegNo: ExpectedRetAddrReg, flags: RegState::Kill)
2175	.addReg(RegNo: ActualRetAddrReg, flags: RegState::Kill);
2176	}
2177
2178	// Now conditionally update the predicate state we just extracted if we ended
2179	// up at a different return address than expected.
2180	int PredStateSizeInBytes = TRI->getRegSizeInBits(RC: *PS ->RC) / `8`;
2181	auto CMovOp = X86::getCMovOpcode(RegBytes: PredStateSizeInBytes);
2182
2183	Register UpdatedStateReg = MRI->createVirtualRegister(RegClass: PS ->RC);
2184	auto CMovI = BuildMI(BB&: MBB, I: InsertPt, MIMD: Loc, MCID: TII->get(Opcode: CMovOp), DestReg: UpdatedStateReg)
2185	.addReg(RegNo: NewStateReg, flags: RegState::Kill)
2186	.addReg(RegNo: PS ->PoisonReg)
2187	.addImm(Val: X86::COND_NE);
2188	CMovI ->findRegisterUseOperand(Reg: X86::EFLAGS, /TRI=/nullptr)->setIsKill(true);
2189	++NumInstsInserted;
2190	LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
2191
2192	PS ->SSA.AddAvailableValue(BB: &MBB, V: UpdatedStateReg);
2193	}
2194
2195	/// An attacker may speculatively store over a value that is then speculatively
2196	/// loaded and used as the target of an indirect call or jump instruction. This
2197	/// is called Spectre v1.2 or Bounds Check Bypass Store (BCBS) and is described
2198	/// in this paper:
2199	/// https://people.csail.mit.edu/vlk/spectre11.pdf
2200	///
2201	/// When this happens, the speculative execution of the call or jump will end up
2202	/// being steered to this attacker controlled address. While most such loads
2203	/// will be adequately hardened already, we want to ensure that they are
2204	/// definitively treated as needing post-load hardening. While address hardening
2205	/// is sufficient to prevent secret data from leaking to the attacker, it may
2206	/// not be sufficient to prevent an attacker from steering speculative
2207	/// execution. We forcibly unfolded all relevant loads above and so will always
2208	/// have an opportunity to post-load harden here, we just need to scan for cases
2209	/// not already flagged and add them.
2210	void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
2211	MachineInstr &MI,
2212	SmallDenseMap<unsigned, unsigned, `32`> &AddrRegToHardenedReg) {
2213	switch (MI.getOpcode()) {
2214	case X86::FARCALL16m:
2215	case X86::FARCALL32m:
2216	case X86::FARCALL64m:
2217	case X86::FARJMP16m:
2218	case X86::FARJMP32m:
2219	case X86::FARJMP64m:
2220	// We don't need to harden either far calls or far jumps as they are
2221	// safe from Spectre.
2222	return;
2223
2224	default:
2225	break;
2226	}
2227
2228	// We should never see a loading instruction at this point, as those should
2229	// have been unfolded.
2230	assert(!MI.mayLoad() && "Found a lingering loading instruction!");
2231
2232	// If the first operand isn't a register, this is a branch or call
2233	// instruction with an immediate operand which doesn't need to be hardened.
2234	if (!MI.getOperand(i: `0`).isReg())
2235	return;
2236
2237	// For all of these, the target register is the first operand of the
2238	// instruction.
2239	auto &TargetOp = MI.getOperand(i: `0`);
2240	Register OldTargetReg = TargetOp.getReg();
2241
2242	// Try to lookup a hardened version of this register. We retain a reference
2243	// here as we want to update the map to track any newly computed hardened
2244	// register.
2245	unsigned &HardenedTargetReg = AddrRegToHardenedReg [OldTargetReg];
2246
2247	// If we don't have a hardened register yet, compute one. Otherwise, just use
2248	// the already hardened register.
2249	//
2250	// FIXME: It is a little suspect that we use partially hardened registers that
2251	// only feed addresses. The complexity of partial hardening with SHRX
2252	// continues to pile up. Should definitively measure its value and consider
2253	// eliminating it.
2254	if (!HardenedTargetReg)
2255	HardenedTargetReg = hardenValueInRegister(
2256	Reg: OldTargetReg, MBB&: *MI.getParent(), InsertPt: MI.getIterator(), Loc: MI.getDebugLoc());
2257
2258	// Set the target operand to the hardened register.
2259	TargetOp.setReg(HardenedTargetReg);
2260
2261	++NumCallsOrJumpsHardened;
2262	}
2263
2264	INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, PASS_KEY,
2265	"X86 speculative load hardener", false, false)
2266	INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, PASS_KEY,
2267	"X86 speculative load hardener", false, false)
2268
2269	FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
2270	return new X86SpeculativeLoadHardeningPass ();
2271	}
2272

// Browse the source code of llvm_projects/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp