MemorySanitizer.cpp source code [llvm_projects/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp]

1	//===- MemorySanitizer.cpp - detector of uninitialized reads --------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// This file is a part of MemorySanitizer, a detector of uninitialized
11	/// reads.
12	///
13	/// The algorithm of the tool is similar to Memcheck
14	/// (https://static.usenix.org/event/usenix05/tech/general/full_papers/seward/seward_html/usenix2005.html)
15	/// We associate a few shadow bits with every byte of the application memory,
16	/// poison the shadow of the malloc-ed or alloca-ed memory, load the shadow
17	/// bits on every memory read, propagate the shadow bits through some of the
18	/// arithmetic instructions (including MOV), store the shadow bits on every
19	/// memory write, report a bug on some other instructions (e.g. JMP) if the
20	/// associated shadow is poisoned.
21	///
22	/// But there are differences too. The first and the major one:
23	/// compiler instrumentation instead of binary instrumentation. This
24	/// gives us much better register allocation, possible compiler
25	/// optimizations and a fast start-up. But this brings the major issue
26	/// as well: msan needs to see all program events, including system
27	/// calls and reads/writes in system libraries, so we either need to
28	/// compile *everything* with msan or use a binary translation
29	/// component (e.g. DynamoRIO) to instrument pre-built libraries.
30	/// Another difference from Memcheck is that we use 8 shadow bits per
31	/// byte of application memory and use a direct shadow mapping. This
32	/// greatly simplifies the instrumentation code and avoids races on
33	/// shadow updates (Memcheck is single-threaded so races are not a
34	/// concern there. Memcheck uses 2 shadow bits per byte with a slow
35	/// path storage that uses 8 bits per byte).
36	///
37	/// The default value of shadow is 0, which means "clean" (not poisoned).
38	///
39	/// Every module initializer should call __msan_init to ensure that the
40	/// shadow memory is ready. On error, __msan_warning is called. Since
41	/// parameters and return values may be passed via registers, we have a
42	/// specialized thread-local shadow for return values
43	/// (__msan_retval_tls) and parameters (__msan_param_tls).
44	///
45	/// Origin tracking.
46	///
47	/// MemorySanitizer can track origins (allocation points) of all uninitialized
48	/// values. This behavior is controlled with a flag (msan-track-origins) and is
49	/// disabled by default.
50	///
51	/// Origins are 4-byte values created and interpreted by the runtime library.
52	/// They are stored in a second shadow mapping, one 4-byte value for 4 bytes
53	/// of application memory. Propagation of origins is basically a bunch of
54	/// "select" instructions that pick the origin of a dirty argument, if an
55	/// instruction has one.
56	///
57	/// Every 4 aligned, consecutive bytes of application memory have one origin
58	/// value associated with them. If these bytes contain uninitialized data
59	/// coming from 2 different allocations, the last store wins. Because of this,
60	/// MemorySanitizer reports can show unrelated origins, but this is unlikely in
61	/// practice.
62	///
63	/// Origins are meaningless for fully initialized values, so MemorySanitizer
64	/// avoids storing origin to memory when a fully initialized value is stored.
65	/// This way it avoids needless overwriting origin of the 4-byte region on
66	/// a short (i.e. 1 byte) clean store, and it is also good for performance.
67	///
68	/// Atomic handling.
69	///
70	/// Ideally, every atomic store of application value should update the
71	/// corresponding shadow location in an atomic way. Unfortunately, atomic store
72	/// of two disjoint locations can not be done without severe slowdown.
73	///
74	/// Therefore, we implement an approximation that may err on the safe side.
75	/// In this implementation, every atomically accessed location in the program
76	/// may only change from (partially) uninitialized to fully initialized, but
77	/// not the other way around. We load the shadow _after_ the application load,
78	/// and we store the shadow _before_ the app store. Also, we always store clean
79	/// shadow (if the application store is atomic). This way, if the store-load
80	/// pair constitutes a happens-before arc, shadow store and load are correctly
81	/// ordered such that the load will get either the value that was stored, or
82	/// some later value (which is always clean).
83	///
84	/// This does not work very well with Compare-And-Swap (CAS) and
85	/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW
86	/// must store the new shadow before the app operation, and load the shadow
87	/// after the app operation. Computers don't work this way. Current
88	/// implementation ignores the load aspect of CAS/RMW, always returning a clean
89	/// value. It implements the store part as a simple atomic store by storing a
90	/// clean shadow.
91	///
92	/// Instrumenting inline assembly.
93	///
94	/// For inline assembly code LLVM has little idea about which memory locations
95	/// become initialized depending on the arguments. It can be possible to figure
96	/// out which arguments are meant to point to inputs and outputs, but the
97	/// actual semantics can be only visible at runtime. In the Linux kernel it's
98	/// also possible that the arguments only indicate the offset for a base taken
99	/// from a segment register, so it's dangerous to treat any asm() arguments as
100	/// pointers. We take a conservative approach generating calls to
101	/// __msan_instrument_asm_store(ptr, size)
102	/// , which defer the memory unpoisoning to the runtime library.
103	/// The latter can perform more complex address checks to figure out whether
104	/// it's safe to touch the shadow memory.
105	/// Like with atomic operations, we call __msan_instrument_asm_store() before
106	/// the assembly call, so that changes to the shadow memory will be seen by
107	/// other threads together with main memory initialization.
108	///
109	/// KernelMemorySanitizer (KMSAN) implementation.
110	///
111	/// The major differences between KMSAN and MSan instrumentation are:
112	/// - KMSAN always tracks the origins and implies msan-keep-going=true;
113	/// - KMSAN allocates shadow and origin memory for each page separately, so
114	/// there are no explicit accesses to shadow and origin in the
115	/// instrumentation.
116	/// Shadow and origin values for a particular X-byte memory location
117	/// (X=1,2,4,8) are accessed through pointers obtained via the
118	/// __msan_metadata_ptr_for_load_X(ptr)
119	/// __msan_metadata_ptr_for_store_X(ptr)
120	/// functions. The corresponding functions check that the X-byte accesses
121	/// are possible and return the pointers to shadow and origin memory.
122	/// Arbitrary sized accesses are handled with:
123	/// __msan_metadata_ptr_for_load_n(ptr, size)
124	/// __msan_metadata_ptr_for_store_n(ptr, size);
125	/// Note that the sanitizer code has to deal with how shadow/origin pairs
126	/// returned by these functions are represented in different ABIs. In
127	/// the X86_64 ABI they are returned in RDX:RAX, in PowerPC64 they are
128	/// returned in r3 and r4, and in the SystemZ ABI they are written to memory
129	/// pointed to by a hidden parameter.
130	/// - TLS variables are stored in a single per-task struct. A call to a
131	/// function __msan_get_context_state() returning a pointer to that struct
132	/// is inserted into every instrumented function before the entry block;
133	/// - __msan_warning() takes a 32-bit origin parameter;
134	/// - local variables are poisoned with __msan_poison_alloca() upon function
135	/// entry and unpoisoned with __msan_unpoison_alloca() before leaving the
136	/// function;
137	/// - the pass doesn't declare any global variables or add global constructors
138	/// to the translation unit.
139	///
140	/// Also, KMSAN currently ignores uninitialized memory passed into inline asm
141	/// calls, making sure we're on the safe side wrt. possible false positives.
142	///
143	/// KernelMemorySanitizer only supports X86_64, SystemZ and PowerPC64 at the
144	/// moment.
145	///
146	//
147	// FIXME: This sanitizer does not yet handle scalable vectors
148	//
149	//===----------------------------------------------------------------------===//
150
151	#include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
152	#include "llvm/ADT/APInt.h"
153	#include "llvm/ADT/ArrayRef.h"
154	#include "llvm/ADT/DenseMap.h"
155	#include "llvm/ADT/DepthFirstIterator.h"
156	#include "llvm/ADT/SetVector.h"
157	#include "llvm/ADT/SmallPtrSet.h"
158	#include "llvm/ADT/SmallVector.h"
159	#include "llvm/ADT/StringExtras.h"
160	#include "llvm/ADT/StringRef.h"
161	#include "llvm/Analysis/GlobalsModRef.h"
162	#include "llvm/Analysis/TargetLibraryInfo.h"
163	#include "llvm/Analysis/ValueTracking.h"
164	#include "llvm/IR/Argument.h"
165	#include "llvm/IR/AttributeMask.h"
166	#include "llvm/IR/Attributes.h"
167	#include "llvm/IR/BasicBlock.h"
168	#include "llvm/IR/CallingConv.h"
169	#include "llvm/IR/Constant.h"
170	#include "llvm/IR/Constants.h"
171	#include "llvm/IR/DataLayout.h"
172	#include "llvm/IR/DerivedTypes.h"
173	#include "llvm/IR/Function.h"
174	#include "llvm/IR/GlobalValue.h"
175	#include "llvm/IR/GlobalVariable.h"
176	#include "llvm/IR/IRBuilder.h"
177	#include "llvm/IR/InlineAsm.h"
178	#include "llvm/IR/InstVisitor.h"
179	#include "llvm/IR/InstrTypes.h"
180	#include "llvm/IR/Instruction.h"
181	#include "llvm/IR/Instructions.h"
182	#include "llvm/IR/IntrinsicInst.h"
183	#include "llvm/IR/Intrinsics.h"
184	#include "llvm/IR/IntrinsicsAArch64.h"
185	#include "llvm/IR/IntrinsicsX86.h"
186	#include "llvm/IR/MDBuilder.h"
187	#include "llvm/IR/Module.h"
188	#include "llvm/IR/Type.h"
189	#include "llvm/IR/Value.h"
190	#include "llvm/IR/ValueMap.h"
191	#include "llvm/Support/Alignment.h"
192	#include "llvm/Support/AtomicOrdering.h"
193	#include "llvm/Support/Casting.h"
194	#include "llvm/Support/CommandLine.h"
195	#include "llvm/Support/Debug.h"
196	#include "llvm/Support/DebugCounter.h"
197	#include "llvm/Support/ErrorHandling.h"
198	#include "llvm/Support/MathExtras.h"
199	#include "llvm/Support/raw_ostream.h"
200	#include "llvm/TargetParser/Triple.h"
201	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
202	#include "llvm/Transforms/Utils/Instrumentation.h"
203	#include "llvm/Transforms/Utils/Local.h"
204	#include "llvm/Transforms/Utils/ModuleUtils.h"
205	#include <algorithm>
206	#include <cassert>
207	#include <cstddef>
208	#include <cstdint>
209	#include <memory>
210	#include <numeric>
211	#include <string>
212	#include <tuple>
213
214	using namespace llvm;
215
216	#define DEBUG_TYPE "msan"
217
218	DEBUG_COUNTER(DebugInsertCheck, "msan-insert-check",
219	"Controls which checks to insert");
220
221	DEBUG_COUNTER(DebugInstrumentInstruction, "msan-instrument-instruction",
222	"Controls which instruction to instrument");
223
224	static const unsigned kOriginSize = `4`;
225	static const Align kMinOriginAlignment = Align (`4`);
226	static const Align kShadowTLSAlignment = Align (`8`);
227
228	// These constants must be kept in sync with the ones in msan.h.
229	// TODO: increase size to match SVE/SVE2/SME/SME2 limits
230	static const unsigned kParamTLSSize = `800`;
231	static const unsigned kRetvalTLSSize = `800`;
232
233	// Accesses sizes are powers of two: 1, 2, 4, 8.
234	static const size_t kNumberOfAccessSizes = `4`;
235
236	/// Track origins of uninitialized values.
237	///
238	/// Adds a section to MemorySanitizer report that points to the allocation
239	/// (stack or heap) the uninitialized bits came from originally.
240	static cl::opt<int> ClTrackOrigins(
241	"msan-track-origins",
242	cl::desc ("Track origins (allocation sites) of poisoned memory"), cl::Hidden,
243	cl::init(Val: `0`));
244
245	static cl::opt<bool> ClKeepGoing("msan-keep-going",
246	cl::desc ("keep going after reporting a UMR"),
247	cl::Hidden, cl::init(Val: false));
248
249	static cl::opt<bool>
250	ClPoisonStack("msan-poison-stack",
251	cl::desc ("poison uninitialized stack variables"), cl::Hidden,
252	cl::init(Val: true));
253
254	static cl::opt<bool> ClPoisonStackWithCall(
255	"msan-poison-stack-with-call",
256	cl::desc ("poison uninitialized stack variables with a call"), cl::Hidden,
257	cl::init(Val: false));
258
259	static cl::opt<int> ClPoisonStackPattern(
260	"msan-poison-stack-pattern",
261	cl::desc ("poison uninitialized stack variables with the given pattern"),
262	cl::Hidden, cl::init(Val: `0xff`));
263
264	static cl::opt<bool>
265	ClPrintStackNames("msan-print-stack-names",
266	cl::desc ("Print name of local stack variable"),
267	cl::Hidden, cl::init(Val: true));
268
269	static cl::opt<bool>
270	ClPoisonUndef("msan-poison-undef",
271	cl::desc ("Poison fully undef temporary values. "
272	"Partially undefined constant vectors "
273	"are unaffected by this flag (see "
274	"-msan-poison-undef-vectors)."),
275	cl::Hidden, cl::init(Val: true));
276
277	static cl::opt<bool> ClPoisonUndefVectors(
278	"msan-poison-undef-vectors",
279	cl::desc ("Precisely poison partially undefined constant vectors. "
280	"If false (legacy behavior), the entire vector is "
281	"considered fully initialized, which may lead to false "
282	"negatives. Fully undefined constant vectors are "
283	"unaffected by this flag (see -msan-poison-undef)."),
284	cl::Hidden, cl::init(Val: false));
285
286	static cl::opt<bool> ClPreciseDisjointOr(
287	"msan-precise-disjoint-or",
288	cl::desc ("Precisely poison disjoint OR. If false (legacy behavior), "
289	"disjointedness is ignored (i.e., 1\|1 is initialized)."),
290	cl::Hidden, cl::init(Val: false));
291
292	static cl::opt<bool>
293	ClHandleICmp("msan-handle-icmp",
294	cl::desc ("propagate shadow through ICmpEQ and ICmpNE"),
295	cl::Hidden, cl::init(Val: true));
296
297	static cl::opt<bool>
298	ClHandleICmpExact("msan-handle-icmp-exact",
299	cl::desc ("exact handling of relational integer ICmp"),
300	cl::Hidden, cl::init(Val: true));
301
302	static cl::opt<int> ClSwitchPrecision(
303	"msan-switch-precision",
304	cl::desc ("Controls the number of cases considered by MSan for LLVM switch "
305	"instructions. 0 means no UUMs detected. Higher values lead to "
306	"fewer false negatives but may impact compiler and/or "
307	"application performance. N.B. LLVM switch instructions do not "
308	"correspond exactly to C++ switch statements."),
309	cl::Hidden, cl::init(Val: `99`));
310
311	static cl::opt<bool> ClHandleLifetimeIntrinsics(
312	"msan-handle-lifetime-intrinsics",
313	cl::desc (
314	"when possible, poison scoped variables at the beginning of the scope "
315	"(slower, but more precise)"),
316	cl::Hidden, cl::init(Val: true));
317
318	// When compiling the Linux kernel, we sometimes see false positives related to
319	// MSan being unable to understand that inline assembly calls may initialize
320	// local variables.
321	// This flag makes the compiler conservatively unpoison every memory location
322	// passed into an assembly call. Note that this may cause false positives.
323	// Because it's impossible to figure out the array sizes, we can only unpoison
324	// the first sizeof(type) bytes for each type pointer.*
325	static cl::opt<bool> ClHandleAsmConservative(
326	"msan-handle-asm-conservative",
327	cl::desc ("conservative handling of inline assembly"), cl::Hidden,
328	cl::init(Val: true));
329
330	// This flag controls whether we check the shadow of the address
331	// operand of load or store. Such bugs are very rare, since load from
332	// a garbage address typically results in SEGV, but still happen
333	// (e.g. only lower bits of address are garbage, or the access happens
334	// early at program startup where malloc-ed memory is more likely to
335	// be zeroed. As of 2012-08-28 this flag adds 20% slowdown.
336	static cl::opt<bool> ClCheckAccessAddress(
337	"msan-check-access-address",
338	cl::desc ("report accesses through a pointer which has poisoned shadow"),
339	cl::Hidden, cl::init(Val: true));
340
341	static cl::opt<bool> ClEagerChecks(
342	"msan-eager-checks",
343	cl::desc ("check arguments and return values at function call boundaries"),
344	cl::Hidden, cl::init(Val: false));
345
346	static cl::opt<bool> ClDumpStrictInstructions(
347	"msan-dump-strict-instructions",
348	cl::desc ("print out instructions with default strict semantics i.e.,"
349	"check that all the inputs are fully initialized, and mark "
350	"the output as fully initialized. These semantics are applied "
351	"to instructions that could not be handled explicitly nor "
352	"heuristically."),
353	cl::Hidden, cl::init(Val: false));
354
355	// Currently, all the heuristically handled instructions are specifically
356	// IntrinsicInst. However, we use the broader "HeuristicInstructions" name
357	// to parallel 'msan-dump-strict-instructions', and to keep the door open to
358	// handling non-intrinsic instructions heuristically.
359	static cl::opt<bool> ClDumpHeuristicInstructions(
360	"msan-dump-heuristic-instructions",
361	cl::desc ("Prints 'unknown' instructions that were handled heuristically. "
362	"Use -msan-dump-strict-instructions to print instructions that "
363	"could not be handled explicitly nor heuristically."),
364	cl::Hidden, cl::init(Val: false));
365
366	static cl::opt<int> ClInstrumentationWithCallThreshold(
367	"msan-instrumentation-with-call-threshold",
368	cl::desc (
369	"If the function being instrumented requires more than "
370	"this number of checks and origin stores, use callbacks instead of "
371	"inline checks (-1 means never use callbacks)."),
372	cl::Hidden, cl::init(Val: `3500`));
373
374	static cl::opt<bool>
375	ClEnableKmsan("msan-kernel",
376	cl::desc ("Enable KernelMemorySanitizer instrumentation"),
377	cl::Hidden, cl::init(Val: false));
378
379	static cl::opt<bool>
380	ClDisableChecks("msan-disable-checks",
381	cl::desc ("Apply no_sanitize to the whole file"), cl::Hidden,
382	cl::init(Val: false));
383
384	static cl::opt<bool>
385	ClCheckConstantShadow("msan-check-constant-shadow",
386	cl::desc ("Insert checks for constant shadow values"),
387	cl::Hidden, cl::init(Val: true));
388
389	// This is off by default because of a bug in gold:
390	// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
391	static cl::opt<bool>
392	ClWithComdat("msan-with-comdat",
393	cl::desc ("Place MSan constructors in comdat sections"),
394	cl::Hidden, cl::init(Val: false));
395
396	// These options allow to specify custom memory map parameters
397	// See MemoryMapParams for details.
398	static cl::opt<uint64_t> ClAndMask("msan-and-mask",
399	cl::desc ("Define custom MSan AndMask"),
400	cl::Hidden, cl::init(Val: `0`));
401
402	static cl::opt<uint64_t> ClXorMask("msan-xor-mask",
403	cl::desc ("Define custom MSan XorMask"),
404	cl::Hidden, cl::init(Val: `0`));
405
406	static cl::opt<uint64_t> ClShadowBase("msan-shadow-base",
407	cl::desc ("Define custom MSan ShadowBase"),
408	cl::Hidden, cl::init(Val: `0`));
409
410	static cl::opt<uint64_t> ClOriginBase("msan-origin-base",
411	cl::desc ("Define custom MSan OriginBase"),
412	cl::Hidden, cl::init(Val: `0`));
413
414	static cl::opt<int>
415	ClDisambiguateWarning("msan-disambiguate-warning-threshold",
416	cl::desc ("Define threshold for number of checks per "
417	"debug location to force origin update."),
418	cl::Hidden, cl::init(Val: `3`));
419
// Names of the module constructor created by this pass and of the runtime
// initialization function it is paired with (see insertModuleCtor).
const char kMsanModuleCtorName[] = "msan.module_ctor";
const char kMsanInitName[] = "__msan_init";
422
namespace {

// Memory map parameters used in application-to-shadow address calculation.
// Offset = (Addr & ~AndMask) ^ XorMask
// Shadow = ShadowBase + Offset
// Origin = OriginBase + Offset
struct MemoryMapParams {
  uint64_t AndMask;
  uint64_t XorMask;
  uint64_t ShadowBase;
  uint64_t OriginBase;
};

// Pair of memory map parameter sets for one platform, one per pointer width.
// A null entry is used in this file for widths without a parameter set.
struct PlatformMemoryMapParams {
  const MemoryMapParams *bits32;
  const MemoryMapParams *bits64;
};

} // end anonymous namespace
442
443	// i386 Linux
444	static const MemoryMapParams Linux_I386_MemoryMapParams = {
445	.AndMask: `0x000080000000`, // AndMask
446	.XorMask: `0`, // XorMask (not used)
447	.ShadowBase: `0`, // ShadowBase (not used)
448	.OriginBase: `0x000040000000`, // OriginBase
449	};
450
451	// x86_64 Linux
452	static const MemoryMapParams Linux_X86_64_MemoryMapParams = {
453	.AndMask: `0`, // AndMask (not used)
454	.XorMask: `0x500000000000`, // XorMask
455	.ShadowBase: `0`, // ShadowBase (not used)
456	.OriginBase: `0x100000000000`, // OriginBase
457	};
458
459	// mips32 Linux
460	// FIXME: Remove -msan-origin-base -msan-and-mask added by PR #109284 to tests
461	// after picking good constants
462
463	// mips64 Linux
464	static const MemoryMapParams Linux_MIPS64_MemoryMapParams = {
465	.AndMask: `0`, // AndMask (not used)
466	.XorMask: `0x008000000000`, // XorMask
467	.ShadowBase: `0`, // ShadowBase (not used)
468	.OriginBase: `0x002000000000`, // OriginBase
469	};
470
471	// ppc32 Linux
472	// FIXME: Remove -msan-origin-base -msan-and-mask added by PR #109284 to tests
473	// after picking good constants
474
475	// ppc64 Linux
476	static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = {
477	.AndMask: `0xE00000000000`, // AndMask
478	.XorMask: `0x100000000000`, // XorMask
479	.ShadowBase: `0x080000000000`, // ShadowBase
480	.OriginBase: `0x1C0000000000`, // OriginBase
481	};
482
483	// s390x Linux
484	static const MemoryMapParams Linux_S390X_MemoryMapParams = {
485	.AndMask: `0xC00000000000`, // AndMask
486	.XorMask: `0`, // XorMask (not used)
487	.ShadowBase: `0x080000000000`, // ShadowBase
488	.OriginBase: `0x1C0000000000`, // OriginBase
489	};
490
491	// arm32 Linux
492	// FIXME: Remove -msan-origin-base -msan-and-mask added by PR #109284 to tests
493	// after picking good constants
494
495	// aarch64 Linux
496	static const MemoryMapParams Linux_AArch64_MemoryMapParams = {
497	.AndMask: `0`, // AndMask (not used)
498	.XorMask: `0x0B00000000000`, // XorMask
499	.ShadowBase: `0`, // ShadowBase (not used)
500	.OriginBase: `0x0200000000000`, // OriginBase
501	};
502
503	// loongarch64 Linux
504	static const MemoryMapParams Linux_LoongArch64_MemoryMapParams = {
505	.AndMask: `0`, // AndMask (not used)
506	.XorMask: `0x500000000000`, // XorMask
507	.ShadowBase: `0`, // ShadowBase (not used)
508	.OriginBase: `0x100000000000`, // OriginBase
509	};
510
511	// riscv32 Linux
512	// FIXME: Remove -msan-origin-base -msan-and-mask added by PR #109284 to tests
513	// after picking good constants
514
515	// aarch64 FreeBSD
516	static const MemoryMapParams FreeBSD_AArch64_MemoryMapParams = {
517	.AndMask: `0x1800000000000`, // AndMask
518	.XorMask: `0x0400000000000`, // XorMask
519	.ShadowBase: `0x0200000000000`, // ShadowBase
520	.OriginBase: `0x0700000000000`, // OriginBase
521	};
522
523	// i386 FreeBSD
524	static const MemoryMapParams FreeBSD_I386_MemoryMapParams = {
525	.AndMask: `0x000180000000`, // AndMask
526	.XorMask: `0x000040000000`, // XorMask
527	.ShadowBase: `0x000020000000`, // ShadowBase
528	.OriginBase: `0x000700000000`, // OriginBase
529	};
530
531	// x86_64 FreeBSD
532	static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = {
533	.AndMask: `0xc00000000000`, // AndMask
534	.XorMask: `0x200000000000`, // XorMask
535	.ShadowBase: `0x100000000000`, // ShadowBase
536	.OriginBase: `0x380000000000`, // OriginBase
537	};
538
539	// x86_64 NetBSD
540	static const MemoryMapParams NetBSD_X86_64_MemoryMapParams = {
541	.AndMask: `0`, // AndMask
542	.XorMask: `0x500000000000`, // XorMask
543	.ShadowBase: `0`, // ShadowBase
544	.OriginBase: `0x100000000000`, // OriginBase
545	};
546
547	static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = {
548	.bits32: &Linux_I386_MemoryMapParams,
549	.bits64: &Linux_X86_64_MemoryMapParams,
550	};
551
552	static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = {
553	.bits32: nullptr,
554	.bits64: &Linux_MIPS64_MemoryMapParams,
555	};
556
557	static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = {
558	.bits32: nullptr,
559	.bits64: &Linux_PowerPC64_MemoryMapParams,
560	};
561
562	static const PlatformMemoryMapParams Linux_S390_MemoryMapParams = {
563	.bits32: nullptr,
564	.bits64: &Linux_S390X_MemoryMapParams,
565	};
566
567	static const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = {
568	.bits32: nullptr,
569	.bits64: &Linux_AArch64_MemoryMapParams,
570	};
571
572	static const PlatformMemoryMapParams Linux_LoongArch_MemoryMapParams = {
573	.bits32: nullptr,
574	.bits64: &Linux_LoongArch64_MemoryMapParams,
575	};
576
577	static const PlatformMemoryMapParams FreeBSD_ARM_MemoryMapParams = {
578	.bits32: nullptr,
579	.bits64: &FreeBSD_AArch64_MemoryMapParams,
580	};
581
582	static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = {
583	.bits32: &FreeBSD_I386_MemoryMapParams,
584	.bits64: &FreeBSD_X86_64_MemoryMapParams,
585	};
586
587	static const PlatformMemoryMapParams NetBSD_X86_MemoryMapParams = {
588	.bits32: nullptr,
589	.bits64: &NetBSD_X86_64_MemoryMapParams,
590	};
591
// Selects which vector lanes an operation applies to: all, even, or odd.
// NOTE(review): the users of this enum are outside this view — confirm.
enum OddOrEvenLanes { kBothLanes, kEvenLanes, kOddLanes };
593
594	namespace {
595
596	/// Instrument functions of a module to detect uninitialized reads.
597	///
598	/// Instantiating MemorySanitizer inserts the msan runtime library API function
599	/// declarations into the module if they don't exist already. Instantiating
600	/// ensures the __msan_init function is in the list of global constructors for
601	/// the module.
602	class MemorySanitizer {
603	public:
604	MemorySanitizer(Module &M, MemorySanitizerOptions Options)
605	: CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins),
606	Recover(Options.Recover), EagerChecks(Options.EagerChecks) {
607	initializeModule(M);
608	}
609
610	// MSan cannot be moved or copied because of MapParams.
611	MemorySanitizer(MemorySanitizer &&) = delete;
612	MemorySanitizer &operator=(MemorySanitizer &&) = delete;
613	MemorySanitizer(const MemorySanitizer &) = delete;
614	MemorySanitizer &operator=(const MemorySanitizer &) = delete;
615
616	bool sanitizeFunction(Function &F, TargetLibraryInfo &TLI);
617
618	private:
619	friend struct MemorySanitizerVisitor;
620	friend struct VarArgHelperBase;
621	friend struct VarArgAMD64Helper;
622	friend struct VarArgAArch64Helper;
623	friend struct VarArgPowerPC64Helper;
624	friend struct VarArgPowerPC32Helper;
625	friend struct VarArgSystemZHelper;
626	friend struct VarArgI386Helper;
627	friend struct VarArgGenericHelper;
628
629	void initializeModule(Module &M);
630	void initializeCallbacks(Module &M, const TargetLibraryInfo &TLI);
631	void createKernelApi(Module &M, const TargetLibraryInfo &TLI);
632	void createUserspaceApi(Module &M, const TargetLibraryInfo &TLI);
633
634	template <typename... ArgsTy>
635	FunctionCallee getOrInsertMsanMetadataFunction(Module &M, StringRef Name,
636	ArgsTy... Args);
637
638	/// True if we're compiling the Linux kernel.
639	bool CompileKernel;
640	/// Track origins (allocation points) of uninitialized values.
641	int TrackOrigins;
642	bool Recover;
643	bool EagerChecks;
644
645	Triple TargetTriple;
646	LLVMContext *C;
647	Type IntptrTy; ///< Integer type with the size of a ptr in default AS.*
648	Type *OriginTy;
649	PointerType PtrTy; ///< Integer type with the size of a ptr in default AS.*
650
651	// XxxTLS variables represent the per-thread state in MSan and per-task state
652	// in KMSAN.
653	// For the userspace these point to thread-local globals. In the kernel land
654	// they point to the members of a per-task struct obtained via a call to
655	// __msan_get_context_state().
656
657	/// Thread-local shadow storage for function parameters.
658	Value *ParamTLS;
659
660	/// Thread-local origin storage for function parameters.
661	Value *ParamOriginTLS;
662
663	/// Thread-local shadow storage for function return value.
664	Value *RetvalTLS;
665
666	/// Thread-local origin storage for function return value.
667	Value *RetvalOriginTLS;
668
669	/// Thread-local shadow storage for in-register va_arg function.
670	Value *VAArgTLS;
671
672	/// Thread-local shadow storage for in-register va_arg function.
673	Value *VAArgOriginTLS;
674
675	/// Thread-local shadow storage for va_arg overflow area.
676	Value *VAArgOverflowSizeTLS;
677
678	/// Are the instrumentation callbacks set up?
679	bool CallbacksInitialized = false;
680
681	/// The run-time callback to print a warning.
682	FunctionCallee WarningFn;
683
684	// These arrays are indexed by log2(AccessSize).
685	FunctionCallee MaybeWarningFn[kNumberOfAccessSizes];
686	FunctionCallee MaybeWarningVarSizeFn;
687	FunctionCallee MaybeStoreOriginFn[kNumberOfAccessSizes];
688
689	/// Run-time helper that generates a new origin value for a stack
690	/// allocation.
691	FunctionCallee MsanSetAllocaOriginWithDescriptionFn;
692	// No description version
693	FunctionCallee MsanSetAllocaOriginNoDescriptionFn;
694
695	/// Run-time helper that poisons stack on function entry.
696	FunctionCallee MsanPoisonStackFn;
697
698	/// Run-time helper that records a store (or any event) of an
699	/// uninitialized value and returns an updated origin id encoding this info.
700	FunctionCallee MsanChainOriginFn;
701
702	/// Run-time helper that paints an origin over a region.
703	FunctionCallee MsanSetOriginFn;
704
705	/// MSan runtime replacements for memmove, memcpy and memset.
706	FunctionCallee MemmoveFn, MemcpyFn, MemsetFn;
707
708	/// KMSAN callback for task-local function argument shadow.
709	StructType *MsanContextStateTy;
710	FunctionCallee MsanGetContextStateFn;
711
712	/// Functions for poisoning/unpoisoning local variables
713	FunctionCallee MsanPoisonAllocaFn, MsanUnpoisonAllocaFn;
714
715	/// Pair of shadow/origin pointers.
716	Type *MsanMetadata;
717
718	/// Each of the MsanMetadataPtrXxx functions returns a MsanMetadata.
719	FunctionCallee MsanMetadataPtrForLoadN, MsanMetadataPtrForStoreN;
720	FunctionCallee MsanMetadataPtrForLoad_1_8[`4`];
721	FunctionCallee MsanMetadataPtrForStore_1_8[`4`];
722	FunctionCallee MsanInstrumentAsmStoreFn;
723
724	/// Storage for return values of the MsanMetadataPtrXxx functions.
725	Value *MsanMetadataAlloca;
726
727	/// Helper to choose between different MsanMetadataPtrXxx().
728	FunctionCallee getKmsanShadowOriginAccessFn(bool isStore, int size);
729
730	/// Memory map parameters used in application-to-shadow calculation.
731	const MemoryMapParams *MapParams;
732
733	/// Custom memory map parameters used when -msan-shadow-base or
734	// -msan-origin-base is provided.
735	MemoryMapParams CustomMapParams;
736
737	MDNode *ColdCallWeights;
738
739	/// Branch weights for origin store.
740	MDNode *OriginStoreWeights;
741	};
742
743	void insertModuleCtor(Module &M) {
744	getOrCreateSanitizerCtorAndInitFunctions(
745	M, CtorName: kMsanModuleCtorName, InitName: kMsanInitName,
746	/InitArgTypes=/{},
747	/InitArgs=/{},
748	// This callback is invoked when the functions are created the first
749	// time. Hook them into the global ctors list in that case:
750	FunctionsCreatedCallback: [&](Function *Ctor, FunctionCallee) {
751	if (!ClWithComdat) {
752	appendToGlobalCtors(M, F: Ctor, Priority: `0`);
753	return;
754	}
755	Comdat *MsanCtorComdat = M.getOrInsertComdat(Name: kMsanModuleCtorName);
756	Ctor->setComdat(MsanCtorComdat);
757	appendToGlobalCtors(M, F: Ctor, Priority: `0`, Data: Ctor);
758	});
759	}
760
761	template <class T> T getOptOrDefault(const cl::opt<T> &Opt, T Default) {
762	return (Opt.getNumOccurrences() > `0`) ? Opt : Default;
763	}
764
765	} // end anonymous namespace
766
767	MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K,
768	bool EagerChecks)
769	: Kernel(getOptOrDefault(Opt: ClEnableKmsan, Default: K)),
770	TrackOrigins(getOptOrDefault(Opt: ClTrackOrigins, Default: Kernel ? `2` : TO)),
771	Recover(getOptOrDefault(Opt: ClKeepGoing, Default: Kernel \|\| R)),
772	EagerChecks(getOptOrDefault(Opt: ClEagerChecks, Default: EagerChecks)) {}
773
774	PreservedAnalyses MemorySanitizerPass::run(Module &M,
775	ModuleAnalysisManager &AM) {
776	// Return early if nosanitize_memory module flag is present for the module.
777	if (checkIfAlreadyInstrumented(M, Flag: "nosanitize_memory"))
778	return PreservedAnalyses::all();
779	bool Modified = false;
780	if (!Options.Kernel) {
781	insertModuleCtor(M);
782	Modified = true;
783	}
784
785	auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
786	for (Function &F : M) {
787	if (F.empty())
788	continue;
789	MemorySanitizer Msan(*F.getParent(), Options);
790	Modified \|=
791	Msan.sanitizeFunction(F, TLI&: FAM.getResult<TargetLibraryAnalysis>(IR&: F));
792	}
793
794	if (!Modified)
795	return PreservedAnalyses::all();
796
797	PreservedAnalyses PA = PreservedAnalyses::none();
798	// GlobalsAA is considered stateless and does not get invalidated unless
799	// explicitly invalidated; PreservedAnalyses::none() is not enough. Sanitizers
800	// make changes that require GlobalsAA to be invalidated.
801	PA.abandon<GlobalsAA>();
802	return PA;
803	}
804
805	void MemorySanitizerPass::printPipeline(
806	raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
807	static_cast<PassInfoMixin<MemorySanitizerPass> >(this*)->printPipeline(
808	OS, MapClassName2PassName);
809	OS << `'<'`;
810	if (Options.Recover)
811	OS << "recover;";
812	if (Options.Kernel)
813	OS << "kernel;";
814	if (Options.EagerChecks)
815	OS << "eager-checks;";
816	OS << "track-origins=" << Options.TrackOrigins;
817	OS << `'>'`;
818	}
819
820	/// Create a non-const global initialized with the given string.
821	///
822	/// Creates a writable global for Str so that we can pass it to the
823	/// run-time lib. Runtime uses first 4 bytes of the string to store the
824	/// frame ID, so the string needs to be mutable.
825	static GlobalVariable *createPrivateConstGlobalForString(Module &M,
826	StringRef Str) {
827	Constant *StrConst = ConstantDataArray::getString(Context&: M.getContext(), Initializer: Str);
828	return new GlobalVariable (M, StrConst->getType(), /isConstant=/true,
829	GlobalValue::PrivateLinkage, StrConst, "");
830	}
831
832	template <typename... ArgsTy>
833	FunctionCallee
834	MemorySanitizer::getOrInsertMsanMetadataFunction(Module &M, StringRef Name,
835	ArgsTy... Args) {
836	if (TargetTriple.getArch() == Triple::systemz) {
837	// SystemZ ABI: shadow/origin pair is returned via a hidden parameter.
838	return M.getOrInsertFunction(Name, Type::getVoidTy(C&: *C), PtrTy,
839	std::forward<ArgsTy>(Args)...);
840	}
841
842	return M.getOrInsertFunction(Name, MsanMetadata,
843	std::forward<ArgsTy>(Args)...);
844	}
845
846	/// Create KMSAN API callbacks.
847	void MemorySanitizer::createKernelApi(Module &M, const TargetLibraryInfo &TLI) {
848	IRBuilder<> IRB(*C);
849
850	// These will be initialized in insertKmsanPrologue().
851	RetvalTLS = nullptr;
852	RetvalOriginTLS = nullptr;
853	ParamTLS = nullptr;
854	ParamOriginTLS = nullptr;
855	VAArgTLS = nullptr;
856	VAArgOriginTLS = nullptr;
857	VAArgOverflowSizeTLS = nullptr;
858
859	WarningFn = M.getOrInsertFunction(Name: "__msan_warning",
860	AttributeList: TLI.getAttrList(C, ArgNos: {`0`}, /Signed=/false),
861	RetTy: IRB.getVoidTy(), Args: IRB.getInt32Ty());
862
863	// Requests the per-task context state (kmsan_context_state) from the*
864	// runtime library.
865	MsanContextStateTy = StructType::get(
866	elt1: ArrayType::get(ElementType: IRB.getInt64Ty(), NumElements: kParamTLSSize / `8`),
867	elts: ArrayType::get(ElementType: IRB.getInt64Ty(), NumElements: kRetvalTLSSize / `8`),
868	elts: ArrayType::get(ElementType: IRB.getInt64Ty(), NumElements: kParamTLSSize / `8`),
869	elts: ArrayType::get(ElementType: IRB.getInt64Ty(), NumElements: kParamTLSSize / `8`), / va_arg_origin /
870	elts: IRB.getInt64Ty(), elts: ArrayType::get(ElementType: OriginTy, NumElements: kParamTLSSize / `4`), elts: OriginTy,
871	elts: OriginTy);
872	MsanGetContextStateFn =
873	M.getOrInsertFunction(Name: "__msan_get_context_state", RetTy: PtrTy);
874
875	MsanMetadata = StructType::get(elt1: PtrTy, elts: PtrTy);
876
877	for (int ind = `0`, size = `1`; ind < `4`; ind++, size <<= `1`) {
878	std::string name_load =
879	"__msan_metadata_ptr_for_load_" + std::to_string(val: size);
880	std::string name_store =
881	"__msan_metadata_ptr_for_store_" + std::to_string(val: size);
882	MsanMetadataPtrForLoad_1_8[ind] =
883	getOrInsertMsanMetadataFunction(M, Name: name_load, Args: PtrTy);
884	MsanMetadataPtrForStore_1_8[ind] =
885	getOrInsertMsanMetadataFunction(M, Name: name_store, Args: PtrTy);
886	}
887
888	MsanMetadataPtrForLoadN = getOrInsertMsanMetadataFunction(
889	M, Name: "__msan_metadata_ptr_for_load_n", Args: PtrTy, Args: IntptrTy);
890	MsanMetadataPtrForStoreN = getOrInsertMsanMetadataFunction(
891	M, Name: "__msan_metadata_ptr_for_store_n", Args: PtrTy, Args: IntptrTy);
892
893	// Functions for poisoning and unpoisoning memory.
894	MsanPoisonAllocaFn = M.getOrInsertFunction(
895	Name: "__msan_poison_alloca", RetTy: IRB.getVoidTy(), Args: PtrTy, Args: IntptrTy, Args: PtrTy);
896	MsanUnpoisonAllocaFn = M.getOrInsertFunction(
897	Name: "__msan_unpoison_alloca", RetTy: IRB.getVoidTy(), Args: PtrTy, Args: IntptrTy);
898	}
899
900	static Constant getOrInsertGlobal(Module &M, StringRef Name, Type Ty) {
901	return M.getOrInsertGlobal(Name, Ty, CreateGlobalCallback: [&] {
902	return new GlobalVariable (M, Ty, false, GlobalVariable::ExternalLinkage,
903	nullptr, Name, nullptr,
904	GlobalVariable::InitialExecTLSModel);
905	});
906	}
907
908	/// Insert declarations for userspace-specific functions and globals.
909	void MemorySanitizer::createUserspaceApi(Module &M,
910	const TargetLibraryInfo &TLI) {
911	IRBuilder<> IRB(*C);
912
913	// Create the callback.
914	// FIXME: this function should have "Cold" calling conv,
915	// which is not yet implemented.
916	if (TrackOrigins) {
917	StringRef WarningFnName = Recover ? "__msan_warning_with_origin"
918	: "__msan_warning_with_origin_noreturn";
919	WarningFn = M.getOrInsertFunction(Name: WarningFnName,
920	AttributeList: TLI.getAttrList(C, ArgNos: {`0`}, /Signed=/false),
921	RetTy: IRB.getVoidTy(), Args: IRB.getInt32Ty());
922	} else {
923	StringRef WarningFnName =
924	Recover ? "__msan_warning" : "__msan_warning_noreturn";
925	WarningFn = M.getOrInsertFunction(Name: WarningFnName, RetTy: IRB.getVoidTy());
926	}
927
928	// Create the global TLS variables.
929	RetvalTLS =
930	getOrInsertGlobal(M, Name: "__msan_retval_tls",
931	Ty: ArrayType::get(ElementType: IRB.getInt64Ty(), NumElements: kRetvalTLSSize / `8`));
932
933	RetvalOriginTLS = getOrInsertGlobal(M, Name: "__msan_retval_origin_tls", Ty: OriginTy);
934
935	ParamTLS =
936	getOrInsertGlobal(M, Name: "__msan_param_tls",
937	Ty: ArrayType::get(ElementType: IRB.getInt64Ty(), NumElements: kParamTLSSize / `8`));
938
939	ParamOriginTLS =
940	getOrInsertGlobal(M, Name: "__msan_param_origin_tls",
941	Ty: ArrayType::get(ElementType: OriginTy, NumElements: kParamTLSSize / `4`));
942
943	VAArgTLS =
944	getOrInsertGlobal(M, Name: "__msan_va_arg_tls",
945	Ty: ArrayType::get(ElementType: IRB.getInt64Ty(), NumElements: kParamTLSSize / `8`));
946
947	VAArgOriginTLS =
948	getOrInsertGlobal(M, Name: "__msan_va_arg_origin_tls",
949	Ty: ArrayType::get(ElementType: OriginTy, NumElements: kParamTLSSize / `4`));
950
951	VAArgOverflowSizeTLS = getOrInsertGlobal(M, Name: "__msan_va_arg_overflow_size_tls",
952	Ty: IRB.getIntPtrTy(DL: M.getDataLayout()));
953
954	for (size_t AccessSizeIndex = `0`; AccessSizeIndex < kNumberOfAccessSizes;
955	AccessSizeIndex++) {
956	unsigned AccessSize = `1` << AccessSizeIndex;
957	std::string FunctionName = "__msan_maybe_warning_" + itostr(X: AccessSize);
958	MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction(
959	Name: FunctionName, AttributeList: TLI.getAttrList(C, ArgNos: {`0`, `1`}, /Signed=/false),
960	RetTy: IRB.getVoidTy(), Args: IRB.getIntNTy(N: AccessSize * `8`), Args: IRB.getInt32Ty());
961	MaybeWarningVarSizeFn = M.getOrInsertFunction(
962	Name: "__msan_maybe_warning_N", AttributeList: TLI.getAttrList(C, ArgNos: {}, /Signed=/false),
963	RetTy: IRB.getVoidTy(), Args: PtrTy, Args: IRB.getInt64Ty(), Args: IRB.getInt32Ty());
964	FunctionName = "__msan_maybe_store_origin_" + itostr(X: AccessSize);
965	MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction(
966	Name: FunctionName, AttributeList: TLI.getAttrList(C, ArgNos: {`0`, `2`}, /Signed=/false),
967	RetTy: IRB.getVoidTy(), Args: IRB.getIntNTy(N: AccessSize * `8`), Args: PtrTy,
968	Args: IRB.getInt32Ty());
969	}
970
971	MsanSetAllocaOriginWithDescriptionFn =
972	M.getOrInsertFunction(Name: "__msan_set_alloca_origin_with_descr",
973	RetTy: IRB.getVoidTy(), Args: PtrTy, Args: IntptrTy, Args: PtrTy, Args: PtrTy);
974	MsanSetAllocaOriginNoDescriptionFn =
975	M.getOrInsertFunction(Name: "__msan_set_alloca_origin_no_descr",
976	RetTy: IRB.getVoidTy(), Args: PtrTy, Args: IntptrTy, Args: PtrTy);
977	MsanPoisonStackFn = M.getOrInsertFunction(Name: "__msan_poison_stack",
978	RetTy: IRB.getVoidTy(), Args: PtrTy, Args: IntptrTy);
979	}
980
981	/// Insert extern declaration of runtime-provided functions and globals.
982	void MemorySanitizer::initializeCallbacks(Module &M,
983	const TargetLibraryInfo &TLI) {
984	// Only do this once.
985	if (CallbacksInitialized)
986	return;
987
988	IRBuilder<> IRB(*C);
989	// Initialize callbacks that are common for kernel and userspace
990	// instrumentation.
991	MsanChainOriginFn = M.getOrInsertFunction(
992	Name: "__msan_chain_origin",
993	AttributeList: TLI.getAttrList(C, ArgNos: {`0`}, /Signed=/false, /Ret=/true), RetTy: IRB.getInt32Ty(),
994	Args: IRB.getInt32Ty());
995	MsanSetOriginFn = M.getOrInsertFunction(
996	Name: "__msan_set_origin", AttributeList: TLI.getAttrList(C, ArgNos: {`2`}, /Signed=/false),
997	RetTy: IRB.getVoidTy(), Args: PtrTy, Args: IntptrTy, Args: IRB.getInt32Ty());
998	MemmoveFn =
999	M.getOrInsertFunction(Name: "__msan_memmove", RetTy: PtrTy, Args: PtrTy, Args: PtrTy, Args: IntptrTy);
1000	MemcpyFn =
1001	M.getOrInsertFunction(Name: "__msan_memcpy", RetTy: PtrTy, Args: PtrTy, Args: PtrTy, Args: IntptrTy);
1002	MemsetFn = M.getOrInsertFunction(Name: "__msan_memset",
1003	AttributeList: TLI.getAttrList(C, ArgNos: {`1`}, /Signed=/true),
1004	RetTy: PtrTy, Args: PtrTy, Args: IRB.getInt32Ty(), Args: IntptrTy);
1005
1006	MsanInstrumentAsmStoreFn = M.getOrInsertFunction(
1007	Name: "__msan_instrument_asm_store", RetTy: IRB.getVoidTy(), Args: PtrTy, Args: IntptrTy);
1008
1009	if (CompileKernel) {
1010	createKernelApi(M, TLI);
1011	} else {
1012	createUserspaceApi(M, TLI);
1013	}
1014	CallbacksInitialized = true;
1015	}
1016
1017	FunctionCallee MemorySanitizer::getKmsanShadowOriginAccessFn(bool isStore,
1018	int size) {
1019	FunctionCallee *Fns =
1020	isStore ? MsanMetadataPtrForStore_1_8 : MsanMetadataPtrForLoad_1_8;
1021	switch (size) {
1022	case `1`:
1023	return Fns[`0`];
1024	case `2`:
1025	return Fns[`1`];
1026	case `4`:
1027	return Fns[`2`];
1028	case `8`:
1029	return Fns[`3`];
1030	default:
1031	return nullptr;
1032	}
1033	}
1034
1035	/// Module-level initialization.
1036	///
1037	/// inserts a call to __msan_init to the module's constructor list.
1038	void MemorySanitizer::initializeModule(Module &M) {
1039	auto &DL = M.getDataLayout();
1040
1041	TargetTriple = M.getTargetTriple();
1042
1043	bool ShadowPassed = ClShadowBase.getNumOccurrences() > `0`;
1044	bool OriginPassed = ClOriginBase.getNumOccurrences() > `0`;
1045	// Check the overrides first
1046	if (ShadowPassed \|\| OriginPassed) {
1047	CustomMapParams.AndMask = ClAndMask;
1048	CustomMapParams.XorMask = ClXorMask;
1049	CustomMapParams.ShadowBase = ClShadowBase;
1050	CustomMapParams.OriginBase = ClOriginBase;
1051	MapParams = &CustomMapParams;
1052	} else {
1053	switch (TargetTriple.getOS()) {
1054	case Triple::FreeBSD:
1055	switch (TargetTriple.getArch()) {
1056	case Triple::aarch64:
1057	MapParams = FreeBSD_ARM_MemoryMapParams.bits64;
1058	break;
1059	case Triple::x86_64:
1060	MapParams = FreeBSD_X86_MemoryMapParams.bits64;
1061	break;
1062	case Triple::x86:
1063	MapParams = FreeBSD_X86_MemoryMapParams.bits32;
1064	break;
1065	default:
1066	report_fatal_error(reason: "unsupported architecture");
1067	}
1068	break;
1069	case Triple::NetBSD:
1070	switch (TargetTriple.getArch()) {
1071	case Triple::x86_64:
1072	MapParams = NetBSD_X86_MemoryMapParams.bits64;
1073	break;
1074	default:
1075	report_fatal_error(reason: "unsupported architecture");
1076	}
1077	break;
1078	case Triple::Linux:
1079	switch (TargetTriple.getArch()) {
1080	case Triple::x86_64:
1081	MapParams = Linux_X86_MemoryMapParams.bits64;
1082	break;
1083	case Triple::x86:
1084	MapParams = Linux_X86_MemoryMapParams.bits32;
1085	break;
1086	case Triple::mips64:
1087	case Triple::mips64el:
1088	MapParams = Linux_MIPS_MemoryMapParams.bits64;
1089	break;
1090	case Triple::ppc64:
1091	case Triple::ppc64le:
1092	MapParams = Linux_PowerPC_MemoryMapParams.bits64;
1093	break;
1094	case Triple::systemz:
1095	MapParams = Linux_S390_MemoryMapParams.bits64;
1096	break;
1097	case Triple::aarch64:
1098	case Triple::aarch64_be:
1099	MapParams = Linux_ARM_MemoryMapParams.bits64;
1100	break;
1101	case Triple::loongarch64:
1102	MapParams = Linux_LoongArch_MemoryMapParams.bits64;
1103	break;
1104	default:
1105	report_fatal_error(reason: "unsupported architecture");
1106	}
1107	break;
1108	default:
1109	report_fatal_error(reason: "unsupported operating system");
1110	}
1111	}
1112
1113	C = &(M.getContext());
1114	IRBuilder<> IRB(*C);
1115	IntptrTy = IRB.getIntPtrTy(DL);
1116	OriginTy = IRB.getInt32Ty();
1117	PtrTy = IRB.getPtrTy();
1118
1119	ColdCallWeights = MDBuilder (*C).createUnlikelyBranchWeights();
1120	OriginStoreWeights = MDBuilder (*C).createUnlikelyBranchWeights();
1121
1122	if (!CompileKernel) {
1123	if (TrackOrigins)
1124	M.getOrInsertGlobal(Name: "__msan_track_origins", Ty: IRB.getInt32Ty(), CreateGlobalCallback: [&] {
1125	return new GlobalVariable (
1126	M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
1127	IRB.getInt32(C: TrackOrigins), "__msan_track_origins");
1128	});
1129
1130	if (Recover)
1131	M.getOrInsertGlobal(Name: "__msan_keep_going", Ty: IRB.getInt32Ty(), CreateGlobalCallback: [&] {
1132	return new GlobalVariable (M, IRB.getInt32Ty(), true,
1133	GlobalValue::WeakODRLinkage,
1134	IRB.getInt32(C: Recover), "__msan_keep_going");
1135	});
1136	}
1137	}
1138
1139	namespace {
1140
1141	/// A helper class that handles instrumentation of VarArg
1142	/// functions on a particular platform.
1143	///
1144	/// Implementations are expected to insert the instrumentation
1145	/// necessary to propagate argument shadow through VarArg function
1146	/// calls. Visit methods are called during an InstVisitor pass over*
1147	/// the function, and should avoid creating new basic blocks. A new
1148	/// instance of this class is created for each instrumented function.
1149	struct VarArgHelper {
1150	virtual ~VarArgHelper() = default;
1151
1152	/// Visit a CallBase.
1153	virtual void visitCallBase(CallBase &CB, IRBuilder<> &IRB) = `0`;
1154
1155	/// Visit a va_start call.
1156	virtual void visitVAStartInst(VAStartInst &I) = `0`;
1157
1158	/// Visit a va_copy call.
1159	virtual void visitVACopyInst(VACopyInst &I) = `0`;
1160
1161	/// Finalize function instrumentation.
1162	///
1163	/// This method is called after visiting all interesting (see above)
1164	/// instructions in a function.
1165	virtual void finalizeInstrumentation() = `0`;
1166	};
1167
1168	struct MemorySanitizerVisitor;
1169
1170	} // end anonymous namespace
1171
1172	static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
1173	MemorySanitizerVisitor &Visitor);
1174
1175	static unsigned TypeSizeToSizeIndex(TypeSize TS) {
1176	if (TS.isScalable())
1177	// Scalable types unconditionally take slowpaths.
1178	return kNumberOfAccessSizes;
1179	unsigned TypeSizeFixed = TS.getFixedValue();
1180	if (TypeSizeFixed <= `8`)
1181	return `0`;
1182	return Log2_32_Ceil(Value: (TypeSizeFixed + `7`) / `8`);
1183	}
1184
1185	namespace {
1186
1187	/// Helper class to attach debug information of the given instruction onto new
1188	/// instructions inserted after.
1189	class NextNodeIRBuilder : public IRBuilder<> {
1190	public:
1191	explicit NextNodeIRBuilder(Instruction *IP) : IRBuilder<>(IP->getNextNode()) {
1192	SetCurrentDebugLocation(IP->getDebugLoc());
1193	}
1194	};
1195
1196	/// This class does all the work for a given function. Store and Load
1197	/// instructions store and load corresponding shadow and origin
1198	/// values. Most instructions propagate shadow from arguments to their
1199	/// return values. Certain instructions (most importantly, BranchInst)
1200	/// test their argument shadow and print reports (with a runtime call) if it's
1201	/// non-zero.
1202	struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
1203	Function &F;
1204	MemorySanitizer &MS;
1205	SmallVector<PHINode *, `16`> ShadowPHINodes, OriginPHINodes;
1206	ValueMap<Value , Value > ShadowMap, OriginMap;
1207	std::unique_ptr<VarArgHelper> VAHelper;
1208	const TargetLibraryInfo *TLI;
1209	Instruction *FnPrologueEnd;
1210	SmallVector<Instruction *, `16`> Instructions;
1211
1212	// The following flags disable parts of MSan instrumentation based on
1213	// exclusion list contents and command-line options.
1214	bool InsertChecks;
1215	bool PropagateShadow;
1216	bool PoisonStack;
1217	bool PoisonUndef;
1218	bool PoisonUndefVectors;
1219
1220	struct ShadowOriginAndInsertPoint {
1221	Value *Shadow;
1222	Value *Origin;
1223	Instruction *OrigIns;
1224
1225	ShadowOriginAndInsertPoint(Value S, Value O, Instruction *I)
1226	: Shadow(S), Origin(O), OrigIns(I) {}
1227	};
1228	SmallVector<ShadowOriginAndInsertPoint, `16`> InstrumentationList;
1229	DenseMap<const DILocation , int*> LazyWarningDebugLocationCount;
1230	SmallSetVector<AllocaInst *, `16`> AllocaSet;
1231	SmallVector<std::pair<IntrinsicInst , AllocaInst >, `16`> LifetimeStartList;
1232	SmallVector<StoreInst *, `16`> StoreList;
1233	int64_t SplittableBlocksCount = `0`;
1234
1235	MemorySanitizerVisitor(Function &F, MemorySanitizer &MS,
1236	const TargetLibraryInfo &TLI)
1237	: F(F), MS(MS), VAHelper (CreateVarArgHelper(Func&: F, Msan&: MS, Visitor&: *this)), TLI(&TLI) {
1238	bool SanitizeFunction =
1239	F.hasFnAttribute(Kind: Attribute::SanitizeMemory) && !ClDisableChecks;
1240	InsertChecks = SanitizeFunction;
1241	PropagateShadow = SanitizeFunction;
1242	PoisonStack = SanitizeFunction && ClPoisonStack;
1243	PoisonUndef = SanitizeFunction && ClPoisonUndef;
1244	PoisonUndefVectors = SanitizeFunction && ClPoisonUndefVectors;
1245
1246	// In the presence of unreachable blocks, we may see Phi nodes with
1247	// incoming nodes from such blocks. Since InstVisitor skips unreachable
1248	// blocks, such nodes will not have any shadow value associated with them.
1249	// It's easier to remove unreachable blocks than deal with missing shadow.
1250	removeUnreachableBlocks(F);
1251
1252	MS.initializeCallbacks(M&: *F.getParent(), TLI);
1253	FnPrologueEnd =
1254	IRBuilder<>(&F.getEntryBlock(), F.getEntryBlock().getFirstNonPHIIt())
1255	.CreateIntrinsic(ID: Intrinsic::donothing, Args: {});
1256
1257	if (MS.CompileKernel) {
1258	IRBuilder<> IRB(FnPrologueEnd);
1259	insertKmsanPrologue(IRB);
1260	}
1261
1262	LLVM_DEBUG(if (!InsertChecks) dbgs()
1263	<< "MemorySanitizer is not inserting checks into '"
1264	<< F.getName() << "'\n");
1265	}
1266
1267	bool instrumentWithCalls(Value *V) {
1268	// Constants likely will be eliminated by follow-up passes.
1269	if (isa<Constant>(Val: V))
1270	return false;
1271	++SplittableBlocksCount;
1272	return ClInstrumentationWithCallThreshold >= `0` &&
1273	SplittableBlocksCount > ClInstrumentationWithCallThreshold;
1274	}
1275
1276	bool isInPrologue(Instruction &I) {
1277	return I.getParent() == FnPrologueEnd->getParent() &&
1278	(&I == FnPrologueEnd \|\| I.comesBefore(Other: FnPrologueEnd));
1279	}
1280
1281	// Creates a new origin and records the stack trace. In general we can call
1282	// this function for any origin manipulation we like. However it will cost
1283	// runtime resources. So use this wisely only if it can provide additional
1284	// information helpful to a user.
1285	Value updateOrigin(Value V, IRBuilder<> &IRB) {
1286	if (MS.TrackOrigins <= `1`)
1287	return V;
1288	return IRB.CreateCall(Callee: MS.MsanChainOriginFn, Args: V);
1289	}
1290
1291	Value originToIntptr(IRBuilder<> &IRB, Value Origin) {
1292	const DataLayout &DL = F.getDataLayout();
1293	unsigned IntptrSize = DL.getTypeStoreSize(Ty: MS.IntptrTy);
1294	if (IntptrSize == kOriginSize)
1295	return Origin;
1296	assert(IntptrSize == kOriginSize * `2`);
1297	Origin = IRB.CreateIntCast(V: Origin, DestTy: MS.IntptrTy, / isSigned / false);
1298	return IRB.CreateOr(LHS: Origin, RHS: IRB.CreateShl(LHS: Origin, RHS: kOriginSize * `8`));
1299	}
1300
1301	/// Fill memory range with the given origin value.
1302	void paintOrigin(IRBuilder<> &IRB, Value Origin, Value OriginPtr,
1303	TypeSize TS, Align Alignment) {
1304	const DataLayout &DL = F.getDataLayout();
1305	const Align IntptrAlignment = DL.getABITypeAlign(Ty: MS.IntptrTy);
1306	unsigned IntptrSize = DL.getTypeStoreSize(Ty: MS.IntptrTy);
1307	assert(IntptrAlignment >= kMinOriginAlignment);
1308	assert(IntptrSize >= kOriginSize);
1309
1310	// Note: The loop based formation works for fixed length vectors too,
1311	// however we prefer to unroll and specialize alignment below.
1312	if (TS.isScalable()) {
1313	Value *Size = IRB.CreateTypeSize(Ty: MS.IntptrTy, Size: TS);
1314	Value *RoundUp =
1315	IRB.CreateAdd(LHS: Size, RHS: ConstantInt::get(Ty: MS.IntptrTy, V: kOriginSize - `1`));
1316	Value *End =
1317	IRB.CreateUDiv(LHS: RoundUp, RHS: ConstantInt::get(Ty: MS.IntptrTy, V: kOriginSize));
1318	auto [InsertPt, Index] =
1319	SplitBlockAndInsertSimpleForLoop(End, SplitBefore: IRB.GetInsertPoint());
1320	IRB.SetInsertPoint(InsertPt);
1321
1322	Value *GEP = IRB.CreateGEP(Ty: MS.OriginTy, Ptr: OriginPtr, IdxList: Index);
1323	IRB.CreateAlignedStore(Val: Origin, Ptr: GEP, Align: kMinOriginAlignment);
1324	return;
1325	}
1326
1327	unsigned Size = TS.getFixedValue();
1328
1329	unsigned Ofs = `0`;
1330	Align CurrentAlignment = Alignment;
1331	if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) {
1332	Value *IntptrOrigin = originToIntptr(IRB, Origin);
1333	Value *IntptrOriginPtr = IRB.CreatePointerCast(V: OriginPtr, DestTy: MS.PtrTy);
1334	for (unsigned i = `0`; i < Size / IntptrSize; ++i) {
1335	Value *Ptr = i ? IRB.CreateConstGEP1_32(Ty: MS.IntptrTy, Ptr: IntptrOriginPtr, Idx0: i)
1336	: IntptrOriginPtr;
1337	IRB.CreateAlignedStore(Val: IntptrOrigin, Ptr, Align: CurrentAlignment);
1338	Ofs += IntptrSize / kOriginSize;
1339	CurrentAlignment = IntptrAlignment;
1340	}
1341	}
1342
1343	for (unsigned i = Ofs; i < (Size + kOriginSize - `1`) / kOriginSize; ++i) {
1344	Value *GEP =
1345	i ? IRB.CreateConstGEP1_32(Ty: MS.OriginTy, Ptr: OriginPtr, Idx0: i) : OriginPtr;
1346	IRB.CreateAlignedStore(Val: Origin, Ptr: GEP, Align: CurrentAlignment);
1347	CurrentAlignment = kMinOriginAlignment;
1348	}
1349	}
1350
1351	void storeOrigin(IRBuilder<> &IRB, Value Addr, Value Shadow, Value *Origin,
1352	Value *OriginPtr, Align Alignment) {
1353	const DataLayout &DL = F.getDataLayout();
1354	const Align OriginAlignment = std::max(a: kMinOriginAlignment, b: Alignment);
1355	TypeSize StoreSize = DL.getTypeStoreSize(Ty: Shadow->getType());
1356	// ZExt cannot convert between vector and scalar
1357	Value *ConvertedShadow = convertShadowToScalar(V: Shadow, IRB);
1358	if (auto *ConstantShadow = dyn_cast<Constant>(Val: ConvertedShadow)) {
1359	if (!ClCheckConstantShadow \|\| ConstantShadow->isNullValue()) {
1360	// Origin is not needed: value is initialized or const shadow is
1361	// ignored.
1362	return;
1363	}
1364	if (llvm::isKnownNonZero(V: ConvertedShadow, Q: DL)) {
1365	// Copy origin as the value is definitely uninitialized.
1366	paintOrigin(IRB, Origin: updateOrigin(V: Origin, IRB), OriginPtr, TS: StoreSize,
1367	Alignment: OriginAlignment);
1368	return;
1369	}
1370	// Fallback to runtime check, which still can be optimized out later.
1371	}
1372
1373	TypeSize TypeSizeInBits = DL.getTypeSizeInBits(Ty: ConvertedShadow->getType());
1374	unsigned SizeIndex = TypeSizeToSizeIndex(TS: TypeSizeInBits);
1375	if (instrumentWithCalls(V: ConvertedShadow) &&
1376	SizeIndex < kNumberOfAccessSizes && !MS.CompileKernel) {
1377	FunctionCallee Fn = MS.MaybeStoreOriginFn[SizeIndex];
1378	Value *ConvertedShadow2 =
1379	IRB.CreateZExt(V: ConvertedShadow, DestTy: IRB.getIntNTy(N: `8` * (`1` << SizeIndex)));
1380	CallBase *CB = IRB.CreateCall(Callee: Fn, Args: {ConvertedShadow2, Addr, Origin});
1381	CB->addParamAttr(ArgNo: `0`, Kind: Attribute::ZExt);
1382	CB->addParamAttr(ArgNo: `2`, Kind: Attribute::ZExt);
1383	} else {
1384	Value *Cmp = convertToBool(V: ConvertedShadow, IRB, name: "_mscmp");
1385	Instruction *CheckTerm = SplitBlockAndInsertIfThen(
1386	Cond: Cmp, SplitBefore: &IRB.GetInsertPoint(), Unreachable: false*, BranchWeights: MS.OriginStoreWeights);
1387	IRBuilder<> IRBNew(CheckTerm);
1388	paintOrigin(IRB&: IRBNew, Origin: updateOrigin(V: Origin, IRB&: IRBNew), OriginPtr, TS: StoreSize,
1389	Alignment: OriginAlignment);
1390	}
1391	}
1392
1393	void materializeStores() {
1394	for (StoreInst *SI : StoreList) {
1395	IRBuilder<> IRB(SI);
1396	Value *Val = SI->getValueOperand();
1397	Value *Addr = SI->getPointerOperand();
1398	Value *Shadow = SI->isAtomic() ? getCleanShadow(V: Val) : getShadow(V: Val);
1399	Value ShadowPtr, OriginPtr;
1400	Type *ShadowTy = Shadow->getType();
1401	const Align Alignment = SI->getAlign();
1402	const Align OriginAlignment = std::max(a: kMinOriginAlignment, b: Alignment);
1403	std::tie(args&: ShadowPtr, args&: OriginPtr) =
1404	getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /isStore/ true);
1405
1406	[[maybe_unused]] StoreInst *NewSI =
1407	IRB.CreateAlignedStore(Val: Shadow, Ptr: ShadowPtr, Align: Alignment);
1408	LLVM_DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
1409
1410	if (SI->isAtomic())
1411	SI->setOrdering(addReleaseOrdering(a: SI->getOrdering()));
1412
1413	if (MS.TrackOrigins && !SI->isAtomic())
1414	storeOrigin(IRB, Addr, Shadow, Origin: getOrigin(V: Val), OriginPtr,
1415	Alignment: OriginAlignment);
1416	}
1417	}
1418
1419	// Returns true if Debug Location corresponds to multiple warnings.
1420	bool shouldDisambiguateWarningLocation(const DebugLoc &DebugLoc) {
1421	if (MS.TrackOrigins < `2`)
1422	return false;
1423
1424	if (LazyWarningDebugLocationCount.empty())
1425	for (const auto &I : InstrumentationList)
1426	++LazyWarningDebugLocationCount [I.OrigIns->getDebugLoc()];
1427
1428	return LazyWarningDebugLocationCount [DebugLoc] >= ClDisambiguateWarning;
1429	}
1430
1431	/// Helper function to insert a warning at IRB's current insert point.
1432	void insertWarningFn(IRBuilder<> &IRB, Value *Origin) {
1433	if (!Origin)
1434	Origin = (Value *)IRB.getInt32(C: `0`);
1435	assert(Origin->getType()->isIntegerTy());
1436
1437	if (shouldDisambiguateWarningLocation(DebugLoc: IRB.getCurrentDebugLocation())) {
1438	// Try to create additional origin with debug info of the last origin
1439	// instruction. It may provide additional information to the user.
1440	if (Instruction *OI = dyn_cast_or_null<Instruction>(Val: Origin)) {
1441	assert(MS.TrackOrigins);
1442	auto NewDebugLoc = OI->getDebugLoc();
1443	// Origin update with missing or the same debug location provides no
1444	// additional value.
1445	if (NewDebugLoc && NewDebugLoc != IRB.getCurrentDebugLocation()) {
1446	// Insert update just before the check, so we call runtime only just
1447	// before the report.
1448	IRBuilder<> IRBOrigin(&*IRB.GetInsertPoint());
1449	IRBOrigin.SetCurrentDebugLocation(NewDebugLoc);
1450	Origin = updateOrigin(V: Origin, IRB&: IRBOrigin);
1451	}
1452	}
1453	}
1454
1455	if (MS.CompileKernel \|\| MS.TrackOrigins)
1456	IRB.CreateCall(Callee: MS.WarningFn, Args: Origin)->setCannotMerge();
1457	else
1458	IRB.CreateCall(Callee: MS.WarningFn)->setCannotMerge();
1459	// FIXME: Insert UnreachableInst if !MS.Recover?
1460	// This may invalidate some of the following checks and needs to be done
1461	// at the very end.
1462	}
1463
1464	void materializeOneCheck(IRBuilder<> &IRB, Value *ConvertedShadow,
1465	Value *Origin) {
1466	const DataLayout &DL = F.getDataLayout();
1467	TypeSize TypeSizeInBits = DL.getTypeSizeInBits(Ty: ConvertedShadow->getType());
1468	unsigned SizeIndex = TypeSizeToSizeIndex(TS: TypeSizeInBits);
1469	if (instrumentWithCalls(V: ConvertedShadow) && !MS.CompileKernel) {
1470	// ZExt cannot convert between vector and scalar
1471	ConvertedShadow = convertShadowToScalar(V: ConvertedShadow, IRB);
1472	Value *ConvertedShadow2 =
1473	IRB.CreateZExt(V: ConvertedShadow, DestTy: IRB.getIntNTy(N: `8` * (`1` << SizeIndex)));
1474
1475	if (SizeIndex < kNumberOfAccessSizes) {
1476	FunctionCallee Fn = MS.MaybeWarningFn[SizeIndex];
1477	CallBase *CB = IRB.CreateCall(
1478	Callee: Fn,
1479	Args: {ConvertedShadow2,
1480	MS.TrackOrigins && Origin ? Origin : (Value *)IRB.getInt32(C: `0`)});
1481	CB->addParamAttr(ArgNo: `0`, Kind: Attribute::ZExt);
1482	CB->addParamAttr(ArgNo: `1`, Kind: Attribute::ZExt);
1483	} else {
1484	FunctionCallee Fn = MS.MaybeWarningVarSizeFn;
1485	Value *ShadowAlloca = IRB.CreateAlloca(Ty: ConvertedShadow2->getType(), AddrSpace: `0u`);
1486	IRB.CreateStore(Val: ConvertedShadow2, Ptr: ShadowAlloca);
1487	unsigned ShadowSize = DL.getTypeAllocSize(Ty: ConvertedShadow2->getType());
1488	CallBase *CB = IRB.CreateCall(
1489	Callee: Fn,
1490	Args: {ShadowAlloca, ConstantInt::get(Ty: IRB.getInt64Ty(), V: ShadowSize),
1491	MS.TrackOrigins && Origin ? Origin : (Value *)IRB.getInt32(C: `0`)});
1492	CB->addParamAttr(ArgNo: `1`, Kind: Attribute::ZExt);
1493	CB->addParamAttr(ArgNo: `2`, Kind: Attribute::ZExt);
1494	}
1495	} else {
1496	Value *Cmp = convertToBool(V: ConvertedShadow, IRB, name: "_mscmp");
1497	Instruction *CheckTerm = SplitBlockAndInsertIfThen(
1498	Cond: Cmp, SplitBefore: &*IRB.GetInsertPoint(),
1499	/ Unreachable / !MS.Recover, BranchWeights: MS.ColdCallWeights);
1500
1501	IRB.SetInsertPoint(CheckTerm);
1502	insertWarningFn(IRB, Origin);
1503	LLVM_DEBUG(dbgs() << " CHECK: " << *Cmp << "\n");
1504	}
1505	}
1506
1507	void materializeInstructionChecks(
1508	ArrayRef<ShadowOriginAndInsertPoint> InstructionChecks) {
1509	const DataLayout &DL = F.getDataLayout();
1510	// Disable combining in some cases. TrackOrigins checks each shadow to pick
1511	// correct origin.
1512	bool Combine = !MS.TrackOrigins;
1513	Instruction *Instruction = InstructionChecks.front().OrigIns;
1514	Value Shadow = nullptr*;
1515	for (const auto &ShadowData : InstructionChecks) {
1516	assert(ShadowData.OrigIns == Instruction);
1517	IRBuilder<> IRB(Instruction);
1518
1519	Value *ConvertedShadow = ShadowData.Shadow;
1520
1521	if (auto *ConstantShadow = dyn_cast<Constant>(Val: ConvertedShadow)) {
1522	if (!ClCheckConstantShadow \|\| ConstantShadow->isNullValue()) {
1523	// Skip, value is initialized or const shadow is ignored.
1524	continue;
1525	}
1526	if (llvm::isKnownNonZero(V: ConvertedShadow, Q: DL)) {
1527	// Report as the value is definitely uninitialized.
1528	insertWarningFn(IRB, Origin: ShadowData.Origin);
1529	if (!MS.Recover)
1530	return; // Always fail and stop here, not need to check the rest.
1531	// Skip entire instruction,
1532	continue;
1533	}
1534	// Fallback to runtime check, which still can be optimized out later.
1535	}
1536
1537	if (!Combine) {
1538	materializeOneCheck(IRB, ConvertedShadow, Origin: ShadowData.Origin);
1539	continue;
1540	}
1541
1542	if (!Shadow) {
1543	Shadow = ConvertedShadow;
1544	continue;
1545	}
1546
1547	Shadow = convertToBool(V: Shadow, IRB, name: "_mscmp");
1548	ConvertedShadow = convertToBool(V: ConvertedShadow, IRB, name: "_mscmp");
1549	Shadow = IRB.CreateOr(LHS: Shadow, RHS: ConvertedShadow, Name: "_msor");
1550	}
1551
1552	if (Shadow) {
1553	assert(Combine);
1554	IRBuilder<> IRB(Instruction);
1555	materializeOneCheck(IRB, ConvertedShadow: Shadow, Origin: nullptr);
1556	}
1557	}
1558
1559	static bool isAArch64SVCount(Type *Ty) {
1560	if (TargetExtType *TTy = dyn_cast<TargetExtType>(Val: Ty))
1561	return TTy->getName() == "aarch64.svcount";
1562	return false;
1563	}
1564
1565	// This is intended to match the "AArch64 Predicate-as-Counter Type" (aka
1566	// 'target("aarch64.svcount")', but not e.g., <vscale x 4 x i32>.
1567	static bool isScalableNonVectorType(Type *Ty) {
1568	if (!isAArch64SVCount(Ty))
1569	LLVM_DEBUG(dbgs() << "isScalableNonVectorType: Unexpected type " << *Ty
1570	<< "\n");
1571
1572	return Ty->isScalableTy() && !isa<VectorType>(Val: Ty);
1573	}
1574
1575	void materializeChecks() {
1576	#ifndef NDEBUG
1577	// For assert below.
1578	SmallPtrSet<Instruction *, `16`> Done;
1579	#endif
1580
1581	for (auto I = InstrumentationList.begin();
1582	I != InstrumentationList.end();) {
1583	auto OrigIns = I->OrigIns;
1584	// Checks are grouped by the original instruction. We call all
1585	// `insertShadowCheck` for an instruction at once.
1586	assert(Done.insert(OrigIns).second);
1587	auto J = std::find_if(first: I + `1`, last: InstrumentationList.end(),
1588	pred: [OrigIns](const ShadowOriginAndInsertPoint &R) {
1589	return OrigIns != R.OrigIns;
1590	});
1591	// Process all checks of instruction at once.
1592	materializeInstructionChecks(InstructionChecks: ArrayRef<ShadowOriginAndInsertPoint>(I, J));
1593	I = J;
1594	}
1595
1596	LLVM_DEBUG(dbgs() << "DONE:\n" << F);
1597	}
1598
1599	// Returns the last instruction in the new prologue
1600	void insertKmsanPrologue(IRBuilder<> &IRB) {
1601	Value *ContextState = IRB.CreateCall(Callee: MS.MsanGetContextStateFn, Args: {});
1602	Constant *Zero = IRB.getInt32(C: `0`);
1603	MS.ParamTLS = IRB.CreateGEP(Ty: MS.MsanContextStateTy, Ptr: ContextState,
1604	IdxList: {Zero, IRB.getInt32(C: `0`)}, Name: "param_shadow");
1605	MS.RetvalTLS = IRB.CreateGEP(Ty: MS.MsanContextStateTy, Ptr: ContextState,
1606	IdxList: {Zero, IRB.getInt32(C: `1`)}, Name: "retval_shadow");
1607	MS.VAArgTLS = IRB.CreateGEP(Ty: MS.MsanContextStateTy, Ptr: ContextState,
1608	IdxList: {Zero, IRB.getInt32(C: `2`)}, Name: "va_arg_shadow");
1609	MS.VAArgOriginTLS = IRB.CreateGEP(Ty: MS.MsanContextStateTy, Ptr: ContextState,
1610	IdxList: {Zero, IRB.getInt32(C: `3`)}, Name: "va_arg_origin");
1611	MS.VAArgOverflowSizeTLS =
1612	IRB.CreateGEP(Ty: MS.MsanContextStateTy, Ptr: ContextState,
1613	IdxList: {Zero, IRB.getInt32(C: `4`)}, Name: "va_arg_overflow_size");
1614	MS.ParamOriginTLS = IRB.CreateGEP(Ty: MS.MsanContextStateTy, Ptr: ContextState,
1615	IdxList: {Zero, IRB.getInt32(C: `5`)}, Name: "param_origin");
1616	MS.RetvalOriginTLS =
1617	IRB.CreateGEP(Ty: MS.MsanContextStateTy, Ptr: ContextState,
1618	IdxList: {Zero, IRB.getInt32(C: `6`)}, Name: "retval_origin");
1619	if (MS.TargetTriple.getArch() == Triple::systemz)
1620	MS.MsanMetadataAlloca = IRB.CreateAlloca(Ty: MS.MsanMetadata, AddrSpace: `0u`);
1621	}
1622
1623	/// Add MemorySanitizer instrumentation to a function.
1624	bool runOnFunction() {
1625	// Iterate all BBs in depth-first order and create shadow instructions
1626	// for all instructions (where applicable).
1627	// For PHI nodes we create dummy shadow PHIs which will be finalized later.
1628	for (BasicBlock *BB : depth_first(G: FnPrologueEnd->getParent()))
1629	visit(BB&: *BB);
1630
1631	// `visit` above only collects instructions. Process them after iterating
1632	// CFG to avoid requirement on CFG transformations.
1633	for (Instruction *I : Instructions)
1634	InstVisitor<MemorySanitizerVisitor>::visit(I&: *I);
1635
1636	// Finalize PHI nodes.
1637	for (PHINode *PN : ShadowPHINodes) {
1638	PHINode *PNS = cast<PHINode>(Val: getShadow(V: PN));
1639	PHINode PNO = MS.TrackOrigins ? cast<PHINode>(Val: getOrigin(V: PN)) : nullptr*;
1640	size_t NumValues = PN->getNumIncomingValues();
1641	for (size_t v = `0`; v < NumValues; v++) {
1642	PNS->addIncoming(V: getShadow(I: PN, i: v), BB: PN->getIncomingBlock(i: v));
1643	if (PNO)
1644	PNO->addIncoming(V: getOrigin(I: PN, i: v), BB: PN->getIncomingBlock(i: v));
1645	}
1646	}
1647
1648	VAHelper ->finalizeInstrumentation();
1649
1650	// Poison llvm.lifetime.start intrinsics, if we haven't fallen back to
1651	// instrumenting only allocas.
1652	if (ClHandleLifetimeIntrinsics) {
1653	for (auto Item : LifetimeStartList) {
1654	instrumentAlloca(I&: *Item.second, InsPoint: Item.first);
1655	AllocaSet.remove(X: Item.second);
1656	}
1657	}
1658	// Poison the allocas for which we didn't instrument the corresponding
1659	// lifetime intrinsics.
1660	for (AllocaInst *AI : AllocaSet)
1661	instrumentAlloca(I&: *AI);
1662
1663	// Insert shadow value checks.
1664	materializeChecks();
1665
1666	// Delayed instrumentation of StoreInst.
1667	// This may not add new address checks.
1668	materializeStores();
1669
1670	return true;
1671	}
1672
1673	/// Compute the shadow type that corresponds to a given Value.
1674	Type getShadowTy(Value V) { return getShadowTy(OrigTy: V->getType()); }
1675
1676	/// Compute the shadow type that corresponds to a given Type.
1677	Type getShadowTy(Type OrigTy) {
1678	if (!OrigTy->isSized()) {
1679	return nullptr;
1680	}
1681	// For integer type, shadow is the same as the original type.
1682	// This may return weird-sized types like i1.
1683	if (IntegerType *IT = dyn_cast<IntegerType>(Val: OrigTy))
1684	return IT;
1685	const DataLayout &DL = F.getDataLayout();
1686	if (VectorType *VT = dyn_cast<VectorType>(Val: OrigTy)) {
1687	uint32_t EltSize = DL.getTypeSizeInBits(Ty: VT->getElementType());
1688	return VectorType::get(ElementType: IntegerType::get(C&: *MS.C, NumBits: EltSize),
1689	EC: VT->getElementCount());
1690	}
1691	if (ArrayType *AT = dyn_cast<ArrayType>(Val: OrigTy)) {
1692	return ArrayType::get(ElementType: getShadowTy(OrigTy: AT->getElementType()),
1693	NumElements: AT->getNumElements());
1694	}
1695	if (StructType *ST = dyn_cast<StructType>(Val: OrigTy)) {
1696	SmallVector<Type *, `4`> Elements;
1697	for (unsigned i = `0`, n = ST->getNumElements(); i < n; i++)
1698	Elements.push_back(Elt: getShadowTy(OrigTy: ST->getElementType(N: i)));
1699	StructType Res = StructType::get(Context&: MS.C, Elements, isPacked: ST->isPacked());
1700	LLVM_DEBUG(dbgs() << "getShadowTy: " << ST << " ===> " << Res << "\n");
1701	return Res;
1702	}
1703	if (isScalableNonVectorType(Ty: OrigTy)) {
1704	LLVM_DEBUG(dbgs() << "getShadowTy: Scalable non-vector type: " << *OrigTy
1705	<< "\n");
1706	return OrigTy;
1707	}
1708
1709	uint32_t TypeSize = DL.getTypeSizeInBits(Ty: OrigTy);
1710	return IntegerType::get(C&: *MS.C, NumBits: TypeSize);
1711	}
1712
1713	/// Extract combined shadow of struct elements as a bool
1714	Value collapseStructShadow(StructType Struct, Value *Shadow,
1715	IRBuilder<> &IRB) {
1716	Value FalseVal = IRB.getIntN(/* width / N: `1`, / value / C: `0`);
1717	Value *Aggregator = FalseVal;
1718
1719	for (unsigned Idx = `0`; Idx < Struct->getNumElements(); Idx++) {
1720	// Combine by ORing together each element's bool shadow
1721	Value *ShadowItem = IRB.CreateExtractValue(Agg: Shadow, Idxs: Idx);
1722	Value *ShadowBool = convertToBool(V: ShadowItem, IRB);
1723
1724	if (Aggregator != FalseVal)
1725	Aggregator = IRB.CreateOr(LHS: Aggregator, RHS: ShadowBool);
1726	else
1727	Aggregator = ShadowBool;
1728	}
1729
1730	return Aggregator;
1731	}
1732
1733	// Extract combined shadow of array elements
1734	Value collapseArrayShadow(ArrayType Array, Value *Shadow,
1735	IRBuilder<> &IRB) {
1736	if (!Array->getNumElements())
1737	return IRB.getIntN(/ width / N: `1`, / value / C: `0`);
1738
1739	Value *FirstItem = IRB.CreateExtractValue(Agg: Shadow, Idxs: `0`);
1740	Value *Aggregator = convertShadowToScalar(V: FirstItem, IRB);
1741
1742	for (unsigned Idx = `1`; Idx < Array->getNumElements(); Idx++) {
1743	Value *ShadowItem = IRB.CreateExtractValue(Agg: Shadow, Idxs: Idx);
1744	Value *ShadowInner = convertShadowToScalar(V: ShadowItem, IRB);
1745	Aggregator = IRB.CreateOr(LHS: Aggregator, RHS: ShadowInner);
1746	}
1747	return Aggregator;
1748	}
1749
1750	/// Convert a shadow value to it's flattened variant. The resulting
1751	/// shadow may not necessarily have the same bit width as the input
1752	/// value, but it will always be comparable to zero.
1753	Value convertShadowToScalar(Value V, IRBuilder<> &IRB) {
1754	if (StructType *Struct = dyn_cast<StructType>(Val: V->getType()))
1755	return collapseStructShadow(Struct, Shadow: V, IRB);
1756	if (ArrayType *Array = dyn_cast<ArrayType>(Val: V->getType()))
1757	return collapseArrayShadow(Array, Shadow: V, IRB);
1758	if (isa<VectorType>(Val: V->getType())) {
1759	if (isa<ScalableVectorType>(Val: V->getType()))
1760	return convertShadowToScalar(V: IRB.CreateOrReduce(Src: V), IRB);
1761	unsigned BitWidth =
1762	V->getType()->getPrimitiveSizeInBits().getFixedValue();
1763	return IRB.CreateBitCast(V, DestTy: IntegerType::get(C&: *MS.C, NumBits: BitWidth));
1764	}
1765	return V;
1766	}
1767
1768	// Convert a scalar value to an i1 by comparing with 0
1769	Value convertToBool(Value V, IRBuilder<> &IRB, const Twine &name = "") {
1770	Type *VTy = V->getType();
1771	if (!VTy->isIntegerTy())
1772	return convertToBool(V: convertShadowToScalar(V, IRB), IRB, name);
1773	if (VTy->getIntegerBitWidth() == `1`)
1774	// Just converting a bool to a bool, so do nothing.
1775	return V;
1776	return IRB.CreateICmpNE(LHS: V, RHS: ConstantInt::get(Ty: VTy, V: `0`), Name: name);
1777	}
1778
1779	Type ptrToIntPtrType(Type PtrTy) const {
1780	if (VectorType *VectTy = dyn_cast<VectorType>(Val: PtrTy)) {
1781	return VectorType::get(ElementType: ptrToIntPtrType(PtrTy: VectTy->getElementType()),
1782	EC: VectTy->getElementCount());
1783	}
1784	assert(PtrTy->isIntOrPtrTy());
1785	return MS.IntptrTy;
1786	}
1787
1788	Type getPtrToShadowPtrType(Type IntPtrTy, Type ShadowTy) const* {
1789	if (VectorType *VectTy = dyn_cast<VectorType>(Val: IntPtrTy)) {
1790	return VectorType::get(
1791	ElementType: getPtrToShadowPtrType(IntPtrTy: VectTy->getElementType(), ShadowTy),
1792	EC: VectTy->getElementCount());
1793	}
1794	assert(IntPtrTy == MS.IntptrTy);
1795	return MS.PtrTy;
1796	}
1797
1798	Constant constToIntPtr(Type IntPtrTy, uint64_t C) const {
1799	if (VectorType *VectTy = dyn_cast<VectorType>(Val: IntPtrTy)) {
1800	return ConstantVector::getSplat(
1801	EC: VectTy->getElementCount(),
1802	Elt: constToIntPtr(IntPtrTy: VectTy->getElementType(), C));
1803	}
1804	assert(IntPtrTy == MS.IntptrTy);
1805	// TODO: Avoid implicit trunc?
1806	// See https://github.com/llvm/llvm-project/issues/112510.
1807	return ConstantInt::get(Ty: MS.IntptrTy, V: C, /IsSigned=/false,
1808	/ImplicitTrunc=/true);
1809	}
1810
1811	/// Returns the integer shadow offset that corresponds to a given
1812	/// application address, whereby:
1813	///
1814	/// Offset = (Addr & ~AndMask) ^ XorMask
1815	/// Shadow = ShadowBase + Offset
1816	/// Origin = (OriginBase + Offset) & ~Alignment
1817	///
1818	/// Note: for efficiency, many shadow mappings only require use the XorMask
1819	/// and OriginBase; the AndMask and ShadowBase are often zero.
1820	Value getShadowPtrOffset(Value Addr, IRBuilder<> &IRB) {
1821	Type *IntptrTy = ptrToIntPtrType(PtrTy: Addr->getType());
1822	Value *OffsetLong = IRB.CreatePointerCast(V: Addr, DestTy: IntptrTy);
1823
1824	if (uint64_t AndMask = MS.MapParams->AndMask)
1825	OffsetLong = IRB.CreateAnd(LHS: OffsetLong, RHS: constToIntPtr(IntPtrTy: IntptrTy, C: ~AndMask));
1826
1827	if (uint64_t XorMask = MS.MapParams->XorMask)
1828	OffsetLong = IRB.CreateXor(LHS: OffsetLong, RHS: constToIntPtr(IntPtrTy: IntptrTy, C: XorMask));
1829	return OffsetLong;
1830	}
1831
1832	/// Compute the shadow and origin addresses corresponding to a given
1833	/// application address.
1834	///
1835	/// Shadow = ShadowBase + Offset
1836	/// Origin = (OriginBase + Offset) & ~3ULL
1837	/// Addr can be a ptr or <N x ptr>. In both cases ShadowTy the shadow type of
1838	/// a single pointee.
1839	/// Returns <shadow_ptr, origin_ptr> or <<N x shadow_ptr>, <N x origin_ptr>>.
1840	std::pair<Value , Value >
1841	getShadowOriginPtrUserspace(Value Addr, IRBuilder<> &IRB, Type ShadowTy,
1842	MaybeAlign Alignment) {
1843	VectorType *VectTy = dyn_cast<VectorType>(Val: Addr->getType());
1844	if (!VectTy) {
1845	assert(Addr->getType()->isPointerTy());
1846	} else {
1847	assert(VectTy->getElementType()->isPointerTy());
1848	}
1849	Type *IntptrTy = ptrToIntPtrType(PtrTy: Addr->getType());
1850	Value *ShadowOffset = getShadowPtrOffset(Addr, IRB);
1851	Value *ShadowLong = ShadowOffset;
1852	if (uint64_t ShadowBase = MS.MapParams->ShadowBase) {
1853	ShadowLong =
1854	IRB.CreateAdd(LHS: ShadowLong, RHS: constToIntPtr(IntPtrTy: IntptrTy, C: ShadowBase));
1855	}
1856	Value *ShadowPtr = IRB.CreateIntToPtr(
1857	V: ShadowLong, DestTy: getPtrToShadowPtrType(IntPtrTy: IntptrTy, ShadowTy));
1858
1859	Value OriginPtr = nullptr*;
1860	if (MS.TrackOrigins) {
1861	Value *OriginLong = ShadowOffset;
1862	uint64_t OriginBase = MS.MapParams->OriginBase;
1863	if (OriginBase != `0`)
1864	OriginLong =
1865	IRB.CreateAdd(LHS: OriginLong, RHS: constToIntPtr(IntPtrTy: IntptrTy, C: OriginBase));
1866	if (!Alignment \|\| *Alignment < kMinOriginAlignment) {
1867	uint64_t Mask = kMinOriginAlignment.value() - `1`;
1868	OriginLong = IRB.CreateAnd(LHS: OriginLong, RHS: constToIntPtr(IntPtrTy: IntptrTy, C: ~Mask));
1869	}
1870	OriginPtr = IRB.CreateIntToPtr(
1871	V: OriginLong, DestTy: getPtrToShadowPtrType(IntPtrTy: IntptrTy, ShadowTy: MS.OriginTy));
1872	}
1873	return std::make_pair(x&: ShadowPtr, y&: OriginPtr);
1874	}
1875
1876	template <typename... ArgsTy>
1877	Value *createMetadataCall(IRBuilder<> &IRB, FunctionCallee Callee,
1878	ArgsTy... Args) {
1879	if (MS.TargetTriple.getArch() == Triple::systemz) {
1880	IRB.CreateCall(Callee,
1881	{MS.MsanMetadataAlloca, std::forward<ArgsTy>(Args)...});
1882	return IRB.CreateLoad(Ty: MS.MsanMetadata, Ptr: MS.MsanMetadataAlloca);
1883	}
1884
1885	return IRB.CreateCall(Callee, {std::forward<ArgsTy>(Args)...});
1886	}
1887
1888	std::pair<Value , Value > getShadowOriginPtrKernelNoVec(Value *Addr,
1889	IRBuilder<> &IRB,
1890	Type *ShadowTy,
1891	bool isStore) {
1892	Value *ShadowOriginPtrs;
1893	const DataLayout &DL = F.getDataLayout();
1894	TypeSize Size = DL.getTypeStoreSize(Ty: ShadowTy);
1895
1896	FunctionCallee Getter = MS.getKmsanShadowOriginAccessFn(isStore, size: Size);
1897	Value *AddrCast = IRB.CreatePointerCast(V: Addr, DestTy: MS.PtrTy);
1898	if (Getter) {
1899	ShadowOriginPtrs = createMetadataCall(IRB, Callee: Getter, Args: AddrCast);
1900	} else {
1901	Value *SizeVal = ConstantInt::get(Ty: MS.IntptrTy, V: Size);
1902	ShadowOriginPtrs = createMetadataCall(
1903	IRB,
1904	Callee: isStore ? MS.MsanMetadataPtrForStoreN : MS.MsanMetadataPtrForLoadN,
1905	Args: AddrCast, Args: SizeVal);
1906	}
1907	Value *ShadowPtr = IRB.CreateExtractValue(Agg: ShadowOriginPtrs, Idxs: `0`);
1908	ShadowPtr = IRB.CreatePointerCast(V: ShadowPtr, DestTy: MS.PtrTy);
1909	Value *OriginPtr = IRB.CreateExtractValue(Agg: ShadowOriginPtrs, Idxs: `1`);
1910
1911	return std::make_pair(x&: ShadowPtr, y&: OriginPtr);
1912	}
1913
1914	/// Addr can be a ptr or <N x ptr>. In both cases ShadowTy the shadow type of
1915	/// a single pointee.
1916	/// Returns <shadow_ptr, origin_ptr> or <<N x shadow_ptr>, <N x origin_ptr>>.
1917	std::pair<Value , Value > getShadowOriginPtrKernel(Value *Addr,
1918	IRBuilder<> &IRB,
1919	Type *ShadowTy,
1920	bool isStore) {
1921	VectorType *VectTy = dyn_cast<VectorType>(Val: Addr->getType());
1922	if (!VectTy) {
1923	assert(Addr->getType()->isPointerTy());
1924	return getShadowOriginPtrKernelNoVec(Addr, IRB, ShadowTy, isStore);
1925	}
1926
1927	// TODO: Support callbacs with vectors of addresses.
1928	unsigned NumElements = cast<FixedVectorType>(Val: VectTy)->getNumElements();
1929	Value *ShadowPtrs = ConstantInt::getNullValue(
1930	Ty: FixedVectorType::get(ElementType: IRB.getPtrTy(), NumElts: NumElements));
1931	Value OriginPtrs = nullptr*;
1932	if (MS.TrackOrigins)
1933	OriginPtrs = ConstantInt::getNullValue(
1934	Ty: FixedVectorType::get(ElementType: IRB.getPtrTy(), NumElts: NumElements));
1935	for (unsigned i = `0`; i < NumElements; ++i) {
1936	Value *OneAddr =
1937	IRB.CreateExtractElement(Vec: Addr, Idx: ConstantInt::get(Ty: IRB.getInt32Ty(), V: i));
1938	auto [ShadowPtr, OriginPtr] =
1939	getShadowOriginPtrKernelNoVec(Addr: OneAddr, IRB, ShadowTy, isStore);
1940
1941	ShadowPtrs = IRB.CreateInsertElement(
1942	Vec: ShadowPtrs, NewElt: ShadowPtr, Idx: ConstantInt::get(Ty: IRB.getInt32Ty(), V: i));
1943	if (MS.TrackOrigins)
1944	OriginPtrs = IRB.CreateInsertElement(
1945	Vec: OriginPtrs, NewElt: OriginPtr, Idx: ConstantInt::get(Ty: IRB.getInt32Ty(), V: i));
1946	}
1947	return {ShadowPtrs, OriginPtrs};
1948	}
1949
1950	std::pair<Value , Value > getShadowOriginPtr(Value *Addr, IRBuilder<> &IRB,
1951	Type *ShadowTy,
1952	MaybeAlign Alignment,
1953	bool isStore) {
1954	if (MS.CompileKernel)
1955	return getShadowOriginPtrKernel(Addr, IRB, ShadowTy, isStore);
1956	return getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment);
1957	}
1958
1959	/// Compute the shadow address for a given function argument.
1960	///
1961	/// Shadow = ParamTLS+ArgOffset.
1962	Value getShadowPtrForArgument(IRBuilder<> &IRB, int* ArgOffset) {
1963	return IRB.CreatePtrAdd(Ptr: MS.ParamTLS,
1964	Offset: ConstantInt::get(Ty: MS.IntptrTy, V: ArgOffset), Name: "_msarg");
1965	}
1966
1967	/// Compute the origin address for a given function argument.
1968	Value getOriginPtrForArgument(IRBuilder<> &IRB, int* ArgOffset) {
1969	if (!MS.TrackOrigins)
1970	return nullptr;
1971	return IRB.CreatePtrAdd(Ptr: MS.ParamOriginTLS,
1972	Offset: ConstantInt::get(Ty: MS.IntptrTy, V: ArgOffset),
1973	Name: "_msarg_o");
1974	}
1975
1976	/// Compute the shadow address for a retval.
1977	Value *getShadowPtrForRetval(IRBuilder<> &IRB) {
1978	return IRB.CreatePointerCast(V: MS.RetvalTLS, DestTy: IRB.getPtrTy(AddrSpace: `0`), Name: "_msret");
1979	}
1980
1981	/// Compute the origin address for a retval.
1982	Value *getOriginPtrForRetval() {
1983	// We keep a single origin for the entire retval. Might be too optimistic.
1984	return MS.RetvalOriginTLS;
1985	}
1986
1987	/// Set SV to be the shadow value for V.
1988	void setShadow(Value V, Value SV) {
1989	assert(!ShadowMap.count(V) && "Values may only have one shadow");
1990	ShadowMap [V] = PropagateShadow ? SV : getCleanShadow(V);
1991	}
1992
1993	/// Set Origin to be the origin value for V.
1994	void setOrigin(Value V, Value Origin) {
1995	if (!MS.TrackOrigins)
1996	return;
1997	assert(!OriginMap.count(V) && "Values may only have one origin");
1998	LLVM_DEBUG(dbgs() << "ORIGIN: " << V << " ==> " << Origin << "\n");
1999	OriginMap [V] = Origin;
2000	}
2001
2002	Constant getCleanShadow(Type OrigTy) {
2003	Type *ShadowTy = getShadowTy(OrigTy);
2004	if (!ShadowTy)
2005	return nullptr;
2006	return Constant::getNullValue(Ty: ShadowTy);
2007	}
2008
2009	/// Create a clean shadow value for a given value.
2010	///
2011	/// Clean shadow (all zeroes) means all bits of the value are defined
2012	/// (initialized).
2013	Constant getCleanShadow(Value V) { return getCleanShadow(OrigTy: V->getType()); }
2014
2015	/// Create a dirty shadow of a given shadow type.
2016	Constant getPoisonedShadow(Type ShadowTy) {
2017	assert(ShadowTy);
2018	if (isa<IntegerType>(Val: ShadowTy) \|\| isa<VectorType>(Val: ShadowTy))
2019	return Constant::getAllOnesValue(Ty: ShadowTy);
2020	if (ArrayType *AT = dyn_cast<ArrayType>(Val: ShadowTy)) {
2021	SmallVector<Constant *, `4`> Vals(AT->getNumElements(),
2022	getPoisonedShadow(ShadowTy: AT->getElementType()));
2023	return ConstantArray::get(T: AT, V: Vals);
2024	}
2025	if (StructType *ST = dyn_cast<StructType>(Val: ShadowTy)) {
2026	SmallVector<Constant *, `4`> Vals;
2027	for (unsigned i = `0`, n = ST->getNumElements(); i < n; i++)
2028	Vals.push_back(Elt: getPoisonedShadow(ShadowTy: ST->getElementType(N: i)));
2029	return ConstantStruct::get(T: ST, V: Vals);
2030	}
2031	llvm_unreachable("Unexpected shadow type");
2032	}
2033
2034	/// Create a dirty shadow for a given value.
2035	Constant getPoisonedShadow(Value V) {
2036	Type *ShadowTy = getShadowTy(V);
2037	if (!ShadowTy)
2038	return nullptr;
2039	return getPoisonedShadow(ShadowTy);
2040	}
2041
2042	/// Create a clean (zero) origin.
2043	Value getCleanOrigin() { return* Constant::getNullValue(Ty: MS.OriginTy); }
2044
2045	/// Get the shadow value for a given Value.
2046	///
2047	/// This function either returns the value set earlier with setShadow,
2048	/// or extracts if from ParamTLS (for function arguments).
2049	Value getShadow(Value V) {
2050	if (Instruction *I = dyn_cast<Instruction>(Val: V)) {
2051	if (!PropagateShadow \|\| I->getMetadata(KindID: LLVMContext::MD_nosanitize))
2052	return getCleanShadow(V);
2053	// For instructions the shadow is already stored in the map.
2054	Value *Shadow = ShadowMap [V];
2055	if (!Shadow) {
2056	LLVM_DEBUG(dbgs() << "No shadow: " << V << "\n" << (I->getParent()));
2057	assert(Shadow && "No shadow for a value");
2058	}
2059	return Shadow;
2060	}
2061	// Handle fully undefined values
2062	// (partially undefined constant vectors are handled later)
2063	if ([[maybe_unused]] UndefValue *U = dyn_cast<UndefValue>(Val: V)) {
2064	Value *AllOnes = (PropagateShadow && PoisonUndef) ? getPoisonedShadow(V)
2065	: getCleanShadow(V);
2066	LLVM_DEBUG(dbgs() << "Undef: " << U << " ==> " << AllOnes << "\n");
2067	return AllOnes;
2068	}
2069	if (Argument *A = dyn_cast<Argument>(Val: V)) {
2070	// For arguments we compute the shadow on demand and store it in the map.
2071	Value *&ShadowPtr = ShadowMap [V];
2072	if (ShadowPtr)
2073	return ShadowPtr;
2074	Function *F = A->getParent();
2075	IRBuilder<> EntryIRB(FnPrologueEnd);
2076	unsigned ArgOffset = `0`;
2077	const DataLayout &DL = F->getDataLayout();
2078	for (auto &FArg : F->args()) {
2079	if (!FArg.getType()->isSized() \|\| FArg.getType()->isScalableTy()) {
2080	LLVM_DEBUG(dbgs() << (FArg.getType()->isScalableTy()
2081	? "vscale not fully supported\n"
2082	: "Arg is not sized\n"));
2083	if (A == &FArg) {
2084	ShadowPtr = getCleanShadow(V);
2085	setOrigin(V: A, Origin: getCleanOrigin());
2086	break;
2087	}
2088	continue;
2089	}
2090
2091	unsigned Size = FArg.hasByValAttr()
2092	? DL.getTypeAllocSize(Ty: FArg.getParamByValType())
2093	: DL.getTypeAllocSize(Ty: FArg.getType());
2094
2095	if (A == &FArg) {
2096	bool Overflow = ArgOffset + Size > kParamTLSSize;
2097	if (FArg.hasByValAttr()) {
2098	// ByVal pointer itself has clean shadow. We copy the actual
2099	// argument shadow to the underlying memory.
2100	// Figure out maximal valid memcpy alignment.
2101	const Align ArgAlign = DL.getValueOrABITypeAlignment(
2102	Alignment: FArg.getParamAlign(), Ty: FArg.getParamByValType());
2103	Value CpShadowPtr, CpOriginPtr;
2104	std::tie(args&: CpShadowPtr, args&: CpOriginPtr) =
2105	getShadowOriginPtr(Addr: V, IRB&: EntryIRB, ShadowTy: EntryIRB.getInt8Ty(), Alignment: ArgAlign,
2106	/isStore/ true);
2107	if (!PropagateShadow \|\| Overflow) {
2108	// ParamTLS overflow.
2109	EntryIRB.CreateMemSet(
2110	Ptr: CpShadowPtr, Val: Constant::getNullValue(Ty: EntryIRB.getInt8Ty()),
2111	Size, Align: ArgAlign);
2112	} else {
2113	Value *Base = getShadowPtrForArgument(IRB&: EntryIRB, ArgOffset);
2114	const Align CopyAlign = std::min(a: ArgAlign, b: kShadowTLSAlignment);
2115	[[maybe_unused]] Value *Cpy = EntryIRB.CreateMemCpy(
2116	Dst: CpShadowPtr, DstAlign: CopyAlign, Src: Base, SrcAlign: CopyAlign, Size);
2117	LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
2118
2119	if (MS.TrackOrigins) {
2120	Value *OriginPtr = getOriginPtrForArgument(IRB&: EntryIRB, ArgOffset);
2121	// FIXME: OriginSize should be:
2122	// alignTo(V % kMinOriginAlignment + Size, kMinOriginAlignment)
2123	unsigned OriginSize = alignTo(Size, A: kMinOriginAlignment);
2124	EntryIRB.CreateMemCpy(
2125	Dst: CpOriginPtr,
2126	/ by getShadowOriginPtr / DstAlign: kMinOriginAlignment, Src: OriginPtr,
2127	/ by origin_tls[ArgOffset] / SrcAlign: kMinOriginAlignment,
2128	Size: OriginSize);
2129	}
2130	}
2131	}
2132
2133	if (!PropagateShadow \|\| Overflow \|\| FArg.hasByValAttr() \|\|
2134	(MS.EagerChecks && FArg.hasAttribute(Kind: Attribute::NoUndef))) {
2135	ShadowPtr = getCleanShadow(V);
2136	setOrigin(V: A, Origin: getCleanOrigin());
2137	} else {
2138	// Shadow over TLS
2139	Value *Base = getShadowPtrForArgument(IRB&: EntryIRB, ArgOffset);
2140	ShadowPtr = EntryIRB.CreateAlignedLoad(Ty: getShadowTy(V: &FArg), Ptr: Base,
2141	Align: kShadowTLSAlignment);
2142	if (MS.TrackOrigins) {
2143	Value *OriginPtr = getOriginPtrForArgument(IRB&: EntryIRB, ArgOffset);
2144	setOrigin(V: A, Origin: EntryIRB.CreateLoad(Ty: MS.OriginTy, Ptr: OriginPtr));
2145	}
2146	}
2147	LLVM_DEBUG(dbgs()
2148	<< " ARG: " << FArg << " ==> " << *ShadowPtr << "\n");
2149	break;
2150	}
2151
2152	ArgOffset += alignTo(Size, A: kShadowTLSAlignment);
2153	}
2154	assert(ShadowPtr && "Could not find shadow for an argument");
2155	return ShadowPtr;
2156	}
2157
2158	// Check for partially-undefined constant vectors
2159	// TODO: scalable vectors (this is hard because we do not have IRBuilder)
2160	if (isa<FixedVectorType>(Val: V->getType()) && isa<Constant>(Val: V) &&
2161	cast<Constant>(Val: V)->containsUndefOrPoisonElement() && PropagateShadow &&
2162	PoisonUndefVectors) {
2163	unsigned NumElems = cast<FixedVectorType>(Val: V->getType())->getNumElements();
2164	SmallVector<Constant *, `32`> ShadowVector(NumElems);
2165	for (unsigned i = `0`; i != NumElems; ++i) {
2166	Constant *Elem = cast<Constant>(Val: V)->getAggregateElement(Elt: i);
2167	ShadowVector [i] = isa<UndefValue>(Val: Elem) ? getPoisonedShadow(V: Elem)
2168	: getCleanShadow(V: Elem);
2169	}
2170
2171	Value *ShadowConstant = ConstantVector::get(V: ShadowVector);
2172	LLVM_DEBUG(dbgs() << "Partial undef constant vector: " << *V << " ==> "
2173	<< *ShadowConstant << "\n");
2174
2175	return ShadowConstant;
2176	}
2177
2178	// TODO: partially-undefined constant arrays, structures, and nested types
2179
2180	// For everything else the shadow is zero.
2181	return getCleanShadow(V);
2182	}
2183
2184	/// Get the shadow for i-th argument of the instruction I.
2185	Value getShadow(Instruction I, int i) {
2186	return getShadow(V: I->getOperand(i));
2187	}
2188
2189	/// Get the origin for a value.
2190	Value getOrigin(Value V) {
2191	if (!MS.TrackOrigins)
2192	return nullptr;
2193	if (!PropagateShadow \|\| isa<Constant>(Val: V) \|\| isa<InlineAsm>(Val: V))
2194	return getCleanOrigin();
2195	assert((isa<Instruction>(V) \|\| isa<Argument>(V)) &&
2196	"Unexpected value type in getOrigin()");
2197	if (Instruction *I = dyn_cast<Instruction>(Val: V)) {
2198	if (I->getMetadata(KindID: LLVMContext::MD_nosanitize))
2199	return getCleanOrigin();
2200	}
2201	Value *Origin = OriginMap [V];
2202	assert(Origin && "Missing origin");
2203	return Origin;
2204	}
2205
2206	/// Get the origin for i-th argument of the instruction I.
2207	Value getOrigin(Instruction I, int i) {
2208	return getOrigin(V: I->getOperand(i));
2209	}
2210
2211	/// Remember the place where a shadow check should be inserted.
2212	///
2213	/// This location will be later instrumented with a check that will print a
2214	/// UMR warning in runtime if the shadow value is not 0.
2215	void insertCheckShadow(Value Shadow, Value Origin, Instruction *OrigIns) {
2216	assert(Shadow);
2217	if (!InsertChecks)
2218	return;
2219
2220	if (!DebugCounter::shouldExecute(Counter&: DebugInsertCheck)) {
2221	LLVM_DEBUG(dbgs() << "Skipping check of " << *Shadow << " before "
2222	<< *OrigIns << "\n");
2223	return;
2224	}
2225
2226	Type *ShadowTy = Shadow->getType();
2227	if (isScalableNonVectorType(Ty: ShadowTy)) {
2228	LLVM_DEBUG(dbgs() << "Skipping check of scalable non-vector " << *Shadow
2229	<< " before " << *OrigIns << "\n");
2230	return;
2231	}
2232	#ifndef NDEBUG
2233	assert((isa<IntegerType>(ShadowTy) \|\| isa<VectorType>(ShadowTy) \|\|
2234	isa<StructType>(ShadowTy) \|\| isa<ArrayType>(ShadowTy)) &&
2235	"Can only insert checks for integer, vector, and aggregate shadow "
2236	"types");
2237	#endif
2238	InstrumentationList.push_back(
2239	Elt: ShadowOriginAndInsertPoint (Shadow, Origin, OrigIns));
2240	}
2241
2242	/// Get shadow for value, and remember the place where a shadow check should
2243	/// be inserted.
2244	///
2245	/// This location will be later instrumented with a check that will print a
2246	/// UMR warning in runtime if the value is not fully defined.
2247	void insertCheckShadowOf(Value Val, Instruction OrigIns) {
2248	assert(Val);
2249	Value Shadow, Origin;
2250	if (ClCheckConstantShadow) {
2251	Shadow = getShadow(V: Val);
2252	if (!Shadow)
2253	return;
2254	Origin = getOrigin(V: Val);
2255	} else {
2256	Shadow = dyn_cast_or_null<Instruction>(Val: getShadow(V: Val));
2257	if (!Shadow)
2258	return;
2259	Origin = dyn_cast_or_null<Instruction>(Val: getOrigin(V: Val));
2260	}
2261	insertCheckShadow(Shadow, Origin, OrigIns);
2262	}
2263
2264	AtomicOrdering addReleaseOrdering(AtomicOrdering a) {
2265	switch (a) {
2266	case AtomicOrdering::NotAtomic:
2267	return AtomicOrdering::NotAtomic;
2268	case AtomicOrdering::Unordered:
2269	case AtomicOrdering::Monotonic:
2270	case AtomicOrdering::Release:
2271	return AtomicOrdering::Release;
2272	case AtomicOrdering::Acquire:
2273	case AtomicOrdering::AcquireRelease:
2274	return AtomicOrdering::AcquireRelease;
2275	case AtomicOrdering::SequentiallyConsistent:
2276	return AtomicOrdering::SequentiallyConsistent;
2277	}
2278	llvm_unreachable("Unknown ordering");
2279	}
2280
2281	Value *makeAddReleaseOrderingTable(IRBuilder<> &IRB) {
2282	constexpr int NumOrderings = (int)AtomicOrderingCABI::seq_cst + `1`;
2283	uint32_t OrderingTable[NumOrderings] = {};
2284
2285	OrderingTable[(int)AtomicOrderingCABI::relaxed] =
2286	OrderingTable[(int)AtomicOrderingCABI::release] =
2287	(int)AtomicOrderingCABI::release;
2288	OrderingTable[(int)AtomicOrderingCABI::consume] =
2289	OrderingTable[(int)AtomicOrderingCABI::acquire] =
2290	OrderingTable[(int)AtomicOrderingCABI::acq_rel] =
2291	(int)AtomicOrderingCABI::acq_rel;
2292	OrderingTable[(int)AtomicOrderingCABI::seq_cst] =
2293	(int)AtomicOrderingCABI::seq_cst;
2294
2295	return ConstantDataVector::get(Context&: IRB.getContext(), Elts: OrderingTable);
2296	}
2297
2298	AtomicOrdering addAcquireOrdering(AtomicOrdering a) {
2299	switch (a) {
2300	case AtomicOrdering::NotAtomic:
2301	return AtomicOrdering::NotAtomic;
2302	case AtomicOrdering::Unordered:
2303	case AtomicOrdering::Monotonic:
2304	case AtomicOrdering::Acquire:
2305	return AtomicOrdering::Acquire;
2306	case AtomicOrdering::Release:
2307	case AtomicOrdering::AcquireRelease:
2308	return AtomicOrdering::AcquireRelease;
2309	case AtomicOrdering::SequentiallyConsistent:
2310	return AtomicOrdering::SequentiallyConsistent;
2311	}
2312	llvm_unreachable("Unknown ordering");
2313	}
2314
2315	Value *makeAddAcquireOrderingTable(IRBuilder<> &IRB) {
2316	constexpr int NumOrderings = (int)AtomicOrderingCABI::seq_cst + `1`;
2317	uint32_t OrderingTable[NumOrderings] = {};
2318
2319	OrderingTable[(int)AtomicOrderingCABI::relaxed] =
2320	OrderingTable[(int)AtomicOrderingCABI::acquire] =
2321	OrderingTable[(int)AtomicOrderingCABI::consume] =
2322	(int)AtomicOrderingCABI::acquire;
2323	OrderingTable[(int)AtomicOrderingCABI::release] =
2324	OrderingTable[(int)AtomicOrderingCABI::acq_rel] =
2325	(int)AtomicOrderingCABI::acq_rel;
2326	OrderingTable[(int)AtomicOrderingCABI::seq_cst] =
2327	(int)AtomicOrderingCABI::seq_cst;
2328
2329	return ConstantDataVector::get(Context&: IRB.getContext(), Elts: OrderingTable);
2330	}
2331
2332	// ------------------- Visitors.
2333	using InstVisitor<MemorySanitizerVisitor>::visit;
2334	void visit(Instruction &I) {
2335	if (I.getMetadata(KindID: LLVMContext::MD_nosanitize))
2336	return;
2337	// Don't want to visit if we're in the prologue
2338	if (isInPrologue(I))
2339	return;
2340	if (!DebugCounter::shouldExecute(Counter&: DebugInstrumentInstruction)) {
2341	LLVM_DEBUG(dbgs() << "Skipping instruction: " << I << "\n");
2342	// We still need to set the shadow and origin to clean values.
2343	setShadow(V: &I, SV: getCleanShadow(V: &I));
2344	setOrigin(V: &I, Origin: getCleanOrigin());
2345	return;
2346	}
2347
2348	Instructions.push_back(Elt: &I);
2349	}
2350
2351	/// Instrument LoadInst
2352	///
2353	/// Loads the corresponding shadow and (optionally) origin.
2354	/// Optionally, checks that the load address is fully defined.
2355	void visitLoadInst(LoadInst &I) {
2356	assert(I.getType()->isSized() && "Load type must have size");
2357	assert(!I.getMetadata(LLVMContext::MD_nosanitize));
2358	NextNodeIRBuilder IRB(&I);
2359	Type *ShadowTy = getShadowTy(V: &I);
2360	Value *Addr = I.getPointerOperand();
2361	Value ShadowPtr = nullptr, OriginPtr = nullptr;
2362	const Align Alignment = I.getAlign();
2363	if (PropagateShadow) {
2364	std::tie(args&: ShadowPtr, args&: OriginPtr) =
2365	getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /isStore/ false);
2366	setShadow(V: &I,
2367	SV: IRB.CreateAlignedLoad(Ty: ShadowTy, Ptr: ShadowPtr, Align: Alignment, Name: "_msld"));
2368	} else {
2369	setShadow(V: &I, SV: getCleanShadow(V: &I));
2370	}
2371
2372	if (ClCheckAccessAddress)
2373	insertCheckShadowOf(Val: I.getPointerOperand(), OrigIns: &I);
2374
2375	if (I.isAtomic())
2376	I.setOrdering(addAcquireOrdering(a: I.getOrdering()));
2377
2378	if (MS.TrackOrigins) {
2379	if (PropagateShadow) {
2380	const Align OriginAlignment = std::max(a: kMinOriginAlignment, b: Alignment);
2381	setOrigin(
2382	V: &I, Origin: IRB.CreateAlignedLoad(Ty: MS.OriginTy, Ptr: OriginPtr, Align: OriginAlignment));
2383	} else {
2384	setOrigin(V: &I, Origin: getCleanOrigin());
2385	}
2386	}
2387	}
2388
2389	/// Instrument StoreInst
2390	///
2391	/// Stores the corresponding shadow and (optionally) origin.
2392	/// Optionally, checks that the store address is fully defined.
2393	void visitStoreInst(StoreInst &I) {
2394	StoreList.push_back(Elt: &I);
2395	if (ClCheckAccessAddress)
2396	insertCheckShadowOf(Val: I.getPointerOperand(), OrigIns: &I);
2397	}
2398
2399	void handleCASOrRMW(Instruction &I) {
2400	assert(isa<AtomicRMWInst>(I) \|\| isa<AtomicCmpXchgInst>(I));
2401
2402	IRBuilder<> IRB(&I);
2403	Value *Addr = I.getOperand(i: `0`);
2404	Value *Val = I.getOperand(i: `1`);
2405	Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, ShadowTy: getShadowTy(V: Val), Alignment: Align (`1`),
2406	/isStore/ true)
2407	.first;
2408
2409	if (ClCheckAccessAddress)
2410	insertCheckShadowOf(Val: Addr, OrigIns: &I);
2411
2412	// Only test the conditional argument of cmpxchg instruction.
2413	// The other argument can potentially be uninitialized, but we can not
2414	// detect this situation reliably without possible false positives.
2415	if (isa<AtomicCmpXchgInst>(Val: I))
2416	insertCheckShadowOf(Val, OrigIns: &I);
2417
2418	IRB.CreateStore(Val: getCleanShadow(V: Val), Ptr: ShadowPtr);
2419
2420	setShadow(V: &I, SV: getCleanShadow(V: &I));
2421	setOrigin(V: &I, Origin: getCleanOrigin());
2422	}
2423
2424	void visitAtomicRMWInst(AtomicRMWInst &I) {
2425	handleCASOrRMW(I);
2426	I.setOrdering(addReleaseOrdering(a: I.getOrdering()));
2427	}
2428
2429	void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
2430	handleCASOrRMW(I);
2431	I.setSuccessOrdering(addReleaseOrdering(a: I.getSuccessOrdering()));
2432	}
2433
2434	/// Generic handler to compute shadow for == and != comparisons.
2435	///
2436	/// This function is used by handleEqualityComparison and visitSwitchInst.
2437	///
2438	/// Sometimes the comparison result is known even if some of the bits of the
2439	/// arguments are not.
2440	Value propagateEqualityComparison(IRBuilder<> &IRB, Value A, Value *B,
2441	Value Sa, Value Sb) {
2442	assert(getShadowTy(A) == Sa->getType());
2443	assert(getShadowTy(B) == Sb->getType());
2444
2445	// Get rid of pointers and vectors of pointers.
2446	// For ints (and vectors of ints), types of A and Sa match,
2447	// and this is a no-op.
2448	A = IRB.CreatePointerCast(V: A, DestTy: Sa->getType());
2449	B = IRB.CreatePointerCast(V: B, DestTy: Sb->getType());
2450
2451	// A == B <==> (C = A^B) == 0
2452	// A != B <==> (C = A^B) != 0
2453	// Sc = Sa \| Sb
2454	Value *C = IRB.CreateXor(LHS: A, RHS: B);
2455	Value *Sc = IRB.CreateOr(LHS: Sa, RHS: Sb);
2456	// Now dealing with i = (C == 0) comparison (or C != 0, does not matter now)
2457	// Result is defined if one of the following is true
2458	// there is a defined 1 bit in C*
2459	// C is fully defined*
2460	// Si = !(C & ~Sc) && Sc
2461	Value *Zero = Constant::getNullValue(Ty: Sc->getType());
2462	Value *MinusOne = Constant::getAllOnesValue(Ty: Sc->getType());
2463	Value *LHS = IRB.CreateICmpNE(LHS: Sc, RHS: Zero);
2464	Value *RHS =
2465	IRB.CreateICmpEQ(LHS: IRB.CreateAnd(LHS: IRB.CreateXor(LHS: Sc, RHS: MinusOne), RHS: C), RHS: Zero);
2466	Value *Si = IRB.CreateAnd(LHS, RHS);
2467	Si->setName("_msprop_icmp");
2468
2469	return Si;
2470	}
2471
2472	// Instrument:
2473	// switch i32 %Val, label %else [ i32 0, label %A
2474	// i32 1, label %B
2475	// i32 2, label %C ]
2476	//
2477	// Typically, the switch input value (%Val) is fully initialized.
2478	//
2479	// Sometimes the compiler may convert (icmp + br) into a switch statement.
2480	// MSan allows icmp eq/ne with partly initialized inputs to still result in a
2481	// fully initialized output, if there exists a bit that is initialized in
2482	// both inputs with a differing value. For compatibility, we support this in
2483	// the switch instrumentation as well. Note that this edge case only applies
2484	// if the switch input value does not match any* of the cases (matching any*
2485	// of the cases requires an exact, fully initialized match).
2486	//
2487	// ShadowCases = 0
2488	// \| propagateEqualityComparison(Val, 0)
2489	// \| propagateEqualityComparison(Val, 1)
2490	// \| propagateEqualityComparison(Val, 2))
2491	void visitSwitchInst(SwitchInst &SI) {
2492	IRBuilder<> IRB(&SI);
2493
2494	Value *Val = SI.getCondition();
2495	Value *ShadowVal = getShadow(V: Val);
2496	// TODO: add fast path - if the condition is fully initialized, we know
2497	// there is no UUM, without needing to consider the case values below.
2498
2499	// Some code (e.g., AMDGPUGenMCCodeEmitter.inc) has tens of thousands of
2500	// cases. This results in an extremely long chained expression for MSan's
2501	// switch instrumentation, which can cause the JumpThreadingPass to have a
2502	// stack overflow or excessive runtime. We limit the number of cases
2503	// considered, with the tradeoff of niche false negatives.
2504	// TODO: figure out a better solution.
2505	int casesToConsider = ClSwitchPrecision;
2506
2507	Value ShadowCases = nullptr*;
2508	for (auto Case : SI.cases()) {
2509	if (casesToConsider <= `0`)
2510	break;
2511
2512	Value *Comparator = Case.getCaseValue();
2513	// TODO: some simplification is possible when comparing multiple cases
2514	// simultaneously.
2515	Value *ComparisonShadow = propagateEqualityComparison(
2516	IRB, A: Val, B: Comparator, Sa: ShadowVal, Sb: getShadow(V: Comparator));
2517
2518	if (ShadowCases)
2519	ShadowCases = IRB.CreateOr(LHS: ShadowCases, RHS: ComparisonShadow);
2520	else
2521	ShadowCases = ComparisonShadow;
2522
2523	casesToConsider--;
2524	}
2525
2526	if (ShadowCases)
2527	insertCheckShadow(Shadow: ShadowCases, Origin: getOrigin(V: Val), OrigIns: &SI);
2528	}
2529
2530	// Vector manipulation.
2531	void visitExtractElementInst(ExtractElementInst &I) {
2532	insertCheckShadowOf(Val: I.getOperand(i_nocapture: `1`), OrigIns: &I);
2533	IRBuilder<> IRB(&I);
2534	setShadow(V: &I, SV: IRB.CreateExtractElement(Vec: getShadow(I: &I, i: `0`), Idx: I.getOperand(i_nocapture: `1`),
2535	Name: "_msprop"));
2536	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
2537	}
2538
2539	void visitInsertElementInst(InsertElementInst &I) {
2540	insertCheckShadowOf(Val: I.getOperand(i_nocapture: `2`), OrigIns: &I);
2541	IRBuilder<> IRB(&I);
2542	auto *Shadow0 = getShadow(I: &I, i: `0`);
2543	auto *Shadow1 = getShadow(I: &I, i: `1`);
2544	setShadow(V: &I, SV: IRB.CreateInsertElement(Vec: Shadow0, NewElt: Shadow1, Idx: I.getOperand(i_nocapture: `2`),
2545	Name: "_msprop"));
2546	setOriginForNaryOp(I);
2547	}
2548
2549	void visitShuffleVectorInst(ShuffleVectorInst &I) {
2550	IRBuilder<> IRB(&I);
2551	auto *Shadow0 = getShadow(I: &I, i: `0`);
2552	auto *Shadow1 = getShadow(I: &I, i: `1`);
2553	setShadow(V: &I, SV: IRB.CreateShuffleVector(V1: Shadow0, V2: Shadow1, Mask: I.getShuffleMask(),
2554	Name: "_msprop"));
2555	setOriginForNaryOp(I);
2556	}
2557
2558	// Casts.
2559	void visitSExtInst(SExtInst &I) {
2560	IRBuilder<> IRB(&I);
2561	setShadow(V: &I, SV: IRB.CreateSExt(V: getShadow(I: &I, i: `0`), DestTy: I.getType(), Name: "_msprop"));
2562	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
2563	}
2564
2565	void visitZExtInst(ZExtInst &I) {
2566	IRBuilder<> IRB(&I);
2567	setShadow(V: &I, SV: IRB.CreateZExt(V: getShadow(I: &I, i: `0`), DestTy: I.getType(), Name: "_msprop"));
2568	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
2569	}
2570
2571	void visitTruncInst(TruncInst &I) {
2572	IRBuilder<> IRB(&I);
2573	setShadow(V: &I, SV: IRB.CreateTrunc(V: getShadow(I: &I, i: `0`), DestTy: I.getType(), Name: "_msprop"));
2574	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
2575	}
2576
2577	void visitBitCastInst(BitCastInst &I) {
2578	// Special case: if this is the bitcast (there is exactly 1 allowed) between
2579	// a musttail call and a ret, don't instrument. New instructions are not
2580	// allowed after a musttail call.
2581	if (auto *CI = dyn_cast<CallInst>(Val: I.getOperand(i_nocapture: `0`)))
2582	if (CI->isMustTailCall())
2583	return;
2584	IRBuilder<> IRB(&I);
2585	setShadow(V: &I, SV: IRB.CreateBitCast(V: getShadow(I: &I, i: `0`), DestTy: getShadowTy(V: &I)));
2586	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
2587	}
2588
2589	void visitPtrToIntInst(PtrToIntInst &I) {
2590	IRBuilder<> IRB(&I);
2591	setShadow(V: &I, SV: IRB.CreateIntCast(V: getShadow(I: &I, i: `0`), DestTy: getShadowTy(V: &I), isSigned: false,
2592	Name: "_msprop_ptrtoint"));
2593	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
2594	}
2595
2596	void visitIntToPtrInst(IntToPtrInst &I) {
2597	IRBuilder<> IRB(&I);
2598	setShadow(V: &I, SV: IRB.CreateIntCast(V: getShadow(I: &I, i: `0`), DestTy: getShadowTy(V: &I), isSigned: false,
2599	Name: "_msprop_inttoptr"));
2600	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
2601	}
2602
2603	void visitFPToSIInst(CastInst &I) { handleShadowOr(I); }
2604	void visitFPToUIInst(CastInst &I) { handleShadowOr(I); }
2605	void visitSIToFPInst(CastInst &I) { handleShadowOr(I); }
2606	void visitUIToFPInst(CastInst &I) { handleShadowOr(I); }
2607	void visitFPExtInst(CastInst &I) { handleShadowOr(I); }
2608	void visitFPTruncInst(CastInst &I) { handleShadowOr(I); }
2609
2610	/// Generic handler to compute shadow for bitwise AND.
2611	///
2612	/// This is used by 'visitAnd' but also as a primitive for other handlers.
2613	///
2614	/// This code is precise: it implements the rule that "And" of an initialized
2615	/// zero bit always results in an initialized value:
2616	// 1&1 => 1; 0&1 => 0; p&1 => p;
2617	// 1&0 => 0; 0&0 => 0; p&0 => 0;
2618	// 1&p => p; 0&p => 0; p&p => p;
2619	//
2620	// S = (S1 & S2) \| (V1 & S2) \| (S1 & V2)
2621	Value handleBitwiseAnd(IRBuilder<> &IRB, Value V1, Value V2, Value S1,
2622	Value *S2) {
2623	// "The two arguments to the ‘and’ instruction must be integer or vector
2624	// of integer values. Both arguments must have identical types."
2625	//
2626	// We enforce this condition for all callers to handleBitwiseAnd(); callers
2627	// with non-integer types should call CreateAppToShadowCast() themselves.
2628	assert(V1->getType()->isIntOrIntVectorTy());
2629	assert(V1->getType() == V2->getType());
2630
2631	// Conveniently, getShadowTy() of Int/IntVector returns the original type.
2632	assert(V1->getType() == S1->getType());
2633	assert(V2->getType() == S2->getType());
2634
2635	Value *S1S2 = IRB.CreateAnd(LHS: S1, RHS: S2);
2636	Value *V1S2 = IRB.CreateAnd(LHS: V1, RHS: S2);
2637	Value *S1V2 = IRB.CreateAnd(LHS: S1, RHS: V2);
2638
2639	return IRB.CreateOr(Ops: {S1S2, V1S2, S1V2});
2640	}
2641
2642	/// Handler for bitwise AND operator.
2643	void visitAnd(BinaryOperator &I) {
2644	IRBuilder<> IRB(&I);
2645	Value *V1 = I.getOperand(i_nocapture: `0`);
2646	Value *V2 = I.getOperand(i_nocapture: `1`);
2647	Value *S1 = getShadow(I: &I, i: `0`);
2648	Value *S2 = getShadow(I: &I, i: `1`);
2649
2650	Value *OutShadow = handleBitwiseAnd(IRB, V1, V2, S1, S2);
2651
2652	setShadow(V: &I, SV: OutShadow);
2653	setOriginForNaryOp(I);
2654	}
2655
2656	void visitOr(BinaryOperator &I) {
2657	IRBuilder<> IRB(&I);
2658	// "Or" of 1 and a poisoned value results in unpoisoned value:
2659	// 1\|1 => 1; 0\|1 => 1; p\|1 => 1;
2660	// 1\|0 => 1; 0\|0 => 0; p\|0 => p;
2661	// 1\|p => 1; 0\|p => p; p\|p => p;
2662	//
2663	// S = (S1 & S2) \| (~V1 & S2) \| (S1 & ~V2)
2664	//
2665	// If the "disjoint OR" property is violated, the result is poison, and
2666	// hence the entire shadow is uninitialized:
2667	// S = S \| SignExt(V1 & V2 != 0)
2668	Value *S1 = getShadow(I: &I, i: `0`);
2669	Value *S2 = getShadow(I: &I, i: `1`);
2670	Value *V1 = I.getOperand(i_nocapture: `0`);
2671	Value *V2 = I.getOperand(i_nocapture: `1`);
2672
2673	// "The two arguments to the ‘or’ instruction must be integer or vector
2674	// of integer values. Both arguments must have identical types."
2675	assert(V1->getType()->isIntOrIntVectorTy());
2676	assert(V1->getType() == V2->getType());
2677
2678	// Conveniently, getShadowTy() of Int/IntVector returns the original type.
2679	assert(V1->getType() == S1->getType());
2680	assert(V2->getType() == S2->getType());
2681
2682	Value *NotV1 = IRB.CreateNot(V: V1);
2683	Value *NotV2 = IRB.CreateNot(V: V2);
2684
2685	Value *S1S2 = IRB.CreateAnd(LHS: S1, RHS: S2);
2686	Value *S2NotV1 = IRB.CreateAnd(LHS: NotV1, RHS: S2);
2687	Value *S1NotV2 = IRB.CreateAnd(LHS: S1, RHS: NotV2);
2688
2689	Value *S = IRB.CreateOr(Ops: {S1S2, S2NotV1, S1NotV2});
2690
2691	if (ClPreciseDisjointOr && cast<PossiblyDisjointInst>(Val: &I)->isDisjoint()) {
2692	Value *V1V2 = IRB.CreateAnd(LHS: V1, RHS: V2);
2693	Value *DisjointOrShadow = IRB.CreateSExt(
2694	V: IRB.CreateICmpNE(LHS: V1V2, RHS: getCleanShadow(V: V1V2)), DestTy: V1V2->getType());
2695	S = IRB.CreateOr(LHS: S, RHS: DisjointOrShadow, Name: "_ms_disjoint");
2696	}
2697
2698	setShadow(V: &I, SV: S);
2699	setOriginForNaryOp(I);
2700	}
2701
2702	/// Default propagation of shadow and/or origin.
2703	///
2704	/// This class implements the general case of shadow propagation, used in all
2705	/// cases where we don't know and/or don't care about what the operation
2706	/// actually does. It converts all input shadow values to a common type
2707	/// (extending or truncating as necessary), and bitwise OR's them.
2708	///
2709	/// This is much cheaper than inserting checks (i.e. requiring inputs to be
2710	/// fully initialized), and less prone to false positives.
2711	///
2712	/// This class also implements the general case of origin propagation. For a
2713	/// Nary operation, result origin is set to the origin of an argument that is
2714	/// not entirely initialized. If there is more than one such arguments, the
2715	/// rightmost of them is picked. It does not matter which one is picked if all
2716	/// arguments are initialized.
2717	template <bool CombineShadow> class Combiner {
2718	Value Shadow = nullptr*;
2719	Value Origin = nullptr*;
2720	IRBuilder<> &IRB;
2721	MemorySanitizerVisitor *MSV;
2722
2723	public:
2724	Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB)
2725	: IRB(IRB), MSV(MSV) {}
2726
2727	/// Add a pair of shadow and origin values to the mix.
2728	Combiner &Add(Value OpShadow, Value OpOrigin) {
2729	if (CombineShadow) {
2730	assert(OpShadow);
2731	if (!Shadow)
2732	Shadow = OpShadow;
2733	else {
2734	OpShadow = MSV->CreateShadowCast(IRB, V: OpShadow, dstTy: Shadow->getType());
2735	Shadow = IRB.CreateOr(LHS: Shadow, RHS: OpShadow, Name: "_msprop");
2736	}
2737	}
2738
2739	if (MSV->MS.TrackOrigins) {
2740	assert(OpOrigin);
2741	if (!Origin) {
2742	Origin = OpOrigin;
2743	} else {
2744	Constant *ConstOrigin = dyn_cast<Constant>(Val: OpOrigin);
2745	// No point in adding something that might result in 0 origin value.
2746	if (!ConstOrigin \|\| !ConstOrigin->isNullValue()) {
2747	Value *Cond = MSV->convertToBool(V: OpShadow, IRB);
2748	Origin = IRB.CreateSelect(C: Cond, True: OpOrigin, False: Origin);
2749	}
2750	}
2751	}
2752	return *this;
2753	}
2754
2755	/// Add an application value to the mix.
2756	Combiner &Add(Value *V) {
2757	Value *OpShadow = MSV->getShadow(V);
2758	Value OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : nullptr*;
2759	return Add(OpShadow, OpOrigin);
2760	}
2761
2762	/// Set the current combined values as the given instruction's shadow
2763	/// and origin.
2764	void Done(Instruction *I) {
2765	if (CombineShadow) {
2766	assert(Shadow);
2767	Shadow = MSV->CreateShadowCast(IRB, V: Shadow, dstTy: MSV->getShadowTy(V: I));
2768	MSV->setShadow(V: I, SV: Shadow);
2769	}
2770	if (MSV->MS.TrackOrigins) {
2771	assert(Origin);
2772	MSV->setOrigin(V: I, Origin);
2773	}
2774	}
2775
2776	/// Store the current combined value at the specified origin
2777	/// location.
2778	void DoneAndStoreOrigin(TypeSize TS, Value *OriginPtr) {
2779	if (MSV->MS.TrackOrigins) {
2780	assert(Origin);
2781	MSV->paintOrigin(IRB, Origin, OriginPtr, TS, Alignment: kMinOriginAlignment);
2782	}
2783	}
2784	};
2785
2786	using ShadowAndOriginCombiner = Combiner<true>;
2787	using OriginCombiner = Combiner<false>;
2788
2789	/// Propagate origin for arbitrary operation.
2790	void setOriginForNaryOp(Instruction &I) {
2791	if (!MS.TrackOrigins)
2792	return;
2793	IRBuilder<> IRB(&I);
2794	OriginCombiner OC(this, IRB);
2795	for (Use &Op : I.operands())
2796	OC.Add(V: Op.get());
2797	OC.Done(I: &I);
2798	}
2799
2800	size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) {
2801	assert(!(Ty->isVectorTy() && Ty->getScalarType()->isPointerTy()) &&
2802	"Vector of pointers is not a valid shadow type");
2803	return Ty->isVectorTy() ? cast<FixedVectorType>(Val: Ty)->getNumElements() *
2804	Ty->getScalarSizeInBits()
2805	: Ty->getPrimitiveSizeInBits();
2806	}
2807
2808	/// Cast between two shadow types, extending or truncating as
2809	/// necessary.
2810	Value CreateShadowCast(IRBuilder<> &IRB, Value V, Type *dstTy,
2811	bool Signed = false) {
2812	Type *srcTy = V->getType();
2813	if (srcTy == dstTy)
2814	return V;
2815	size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(Ty: srcTy);
2816	size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(Ty: dstTy);
2817	if (srcSizeInBits > `1` && dstSizeInBits == `1`)
2818	return IRB.CreateICmpNE(LHS: V, RHS: getCleanShadow(V));
2819
2820	if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
2821	return IRB.CreateIntCast(V, DestTy: dstTy, isSigned: Signed);
2822	if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
2823	cast<VectorType>(Val: dstTy)->getElementCount() ==
2824	cast<VectorType>(Val: srcTy)->getElementCount())
2825	return IRB.CreateIntCast(V, DestTy: dstTy, isSigned: Signed);
2826	Value V1 = IRB.CreateBitCast(V, DestTy: Type::getIntNTy(C&: MS.C, N: srcSizeInBits));
2827	Value *V2 =
2828	IRB.CreateIntCast(V: V1, DestTy: Type::getIntNTy(C&: *MS.C, N: dstSizeInBits), isSigned: Signed);
2829	return IRB.CreateBitCast(V: V2, DestTy: dstTy);
2830	// TODO: handle struct types.
2831	}
2832
2833	/// Cast an application value to the type of its own shadow.
2834	Value CreateAppToShadowCast(IRBuilder<> &IRB, Value V) {
2835	Type *ShadowTy = getShadowTy(V);
2836	if (V->getType() == ShadowTy)
2837	return V;
2838	if (V->getType()->isPtrOrPtrVectorTy())
2839	return IRB.CreatePtrToInt(V, DestTy: ShadowTy);
2840	else
2841	return IRB.CreateBitCast(V, DestTy: ShadowTy);
2842	}
2843
2844	/// Propagate shadow for arbitrary operation.
2845	void handleShadowOr(Instruction &I) {
2846	IRBuilder<> IRB(&I);
2847	ShadowAndOriginCombiner SC(this, IRB);
2848	for (Use &Op : I.operands())
2849	SC.Add(V: Op.get());
2850	SC.Done(I: &I);
2851	}
2852
2853	// Perform a bitwise OR on the horizontal pairs (or other specified grouping)
2854	// of elements.
2855	//
2856	// For example, suppose we have:
2857	// VectorA: <a0, a1, a2, a3, a4, a5>
2858	// VectorB: <b0, b1, b2, b3, b4, b5>
2859	// ReductionFactor: 3
2860	// Shards: 1
2861	// The output would be:
2862	// <a0\|a1\|a2, a3\|a4\|a5, b0\|b1\|b2, b3\|b4\|b5>
2863	//
2864	// If we have:
2865	// VectorA: <a0, a1, a2, a3, a4, a5, a6, a7>
2866	// VectorB: <b0, b1, b2, b3, b4, b5, b6, b7>
2867	// ReductionFactor: 2
2868	// Shards: 2
2869	// then a and be each have 2 "shards", resulting in the output being
2870	// interleaved:
2871	// <a0\|a1, a2\|a3, b0\|b1, b2\|b3, a4\|a5, a6\|a7, b4\|b5, b6\|b7>
2872	//
2873	// This is convenient for instrumenting horizontal add/sub.
2874	// For bitwise OR on "vertical" pairs, see maybeHandleSimpleNomemIntrinsic().
2875	Value horizontalReduce(IntrinsicInst &I, unsigned* ReductionFactor,
2876	unsigned Shards, Value VectorA, Value VectorB) {
2877	assert(isa<FixedVectorType>(VectorA->getType()));
2878	unsigned NumElems =
2879	cast<FixedVectorType>(Val: VectorA->getType())->getNumElements();
2880
2881	[[maybe_unused]] unsigned TotalNumElems = NumElems;
2882	if (VectorB) {
2883	assert(VectorA->getType() == VectorB->getType());
2884	TotalNumElems *= `2`;
2885	}
2886
2887	assert(NumElems % (ReductionFactor * Shards) == `0`);
2888
2889	Value Or = nullptr*;
2890
2891	IRBuilder<> IRB(&I);
2892	for (unsigned i = `0`; i < ReductionFactor; i++) {
2893	SmallVector<int, `16`> Mask;
2894
2895	for (unsigned j = `0`; j < Shards; j++) {
2896	unsigned Offset = NumElems / Shards * j;
2897
2898	for (unsigned X = `0`; X < NumElems / Shards; X += ReductionFactor)
2899	Mask.push_back(Elt: Offset + X + i);
2900
2901	if (VectorB) {
2902	for (unsigned X = `0`; X < NumElems / Shards; X += ReductionFactor)
2903	Mask.push_back(Elt: NumElems + Offset + X + i);
2904	}
2905	}
2906
2907	Value *Masked;
2908	if (VectorB)
2909	Masked = IRB.CreateShuffleVector(V1: VectorA, V2: VectorB, Mask);
2910	else
2911	Masked = IRB.CreateShuffleVector(V: VectorA, Mask);
2912
2913	if (Or)
2914	Or = IRB.CreateOr(LHS: Or, RHS: Masked);
2915	else
2916	Or = Masked;
2917	}
2918
2919	return Or;
2920	}
2921
2922	/// Propagate shadow for 1- or 2-vector intrinsics that combine adjacent
2923	/// fields.
2924	///
2925	/// e.g., <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>)
2926	/// <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>)
2927	void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I, unsigned Shards) {
2928	assert(I.arg_size() == `1` \|\| I.arg_size() == `2`);
2929
2930	assert(I.getType()->isVectorTy());
2931	assert(I.getArgOperand(`0`)->getType()->isVectorTy());
2932
2933	[[maybe_unused]] FixedVectorType *ParamType =
2934	cast<FixedVectorType>(Val: I.getArgOperand(i: `0`)->getType());
2935	assert((I.arg_size() != `2`) \|\|
2936	(ParamType == cast<FixedVectorType>(I.getArgOperand(`1`)->getType())));
2937	[[maybe_unused]] FixedVectorType *ReturnType =
2938	cast<FixedVectorType>(Val: I.getType());
2939	assert(ParamType->getNumElements() * I.arg_size() ==
2940	`2` * ReturnType->getNumElements());
2941
2942	IRBuilder<> IRB(&I);
2943
2944	// Horizontal OR of shadow
2945	Value *FirstArgShadow = getShadow(I: &I, i: `0`);
2946	Value SecondArgShadow = nullptr*;
2947	if (I.arg_size() == `2`)
2948	SecondArgShadow = getShadow(I: &I, i: `1`);
2949
2950	Value OrShadow = horizontalReduce(I, /ReductionFactor=/*`2`, Shards,
2951	VectorA: FirstArgShadow, VectorB: SecondArgShadow);
2952
2953	OrShadow = CreateShadowCast(IRB, V: OrShadow, dstTy: getShadowTy(V: &I));
2954
2955	setShadow(V: &I, SV: OrShadow);
2956	setOriginForNaryOp(I);
2957	}
2958
2959	/// Propagate shadow for 1- or 2-vector intrinsics that combine adjacent
2960	/// fields, with the parameters reinterpreted to have elements of a specified
2961	/// width. For example:
2962	/// @llvm.x86.ssse3.phadd.w(<1 x i64> [[VAR1]], <1 x i64> [[VAR2]])
2963	/// conceptually operates on
2964	/// (<4 x i16> [[VAR1]], <4 x i16> [[VAR2]])
2965	/// and can be handled with ReinterpretElemWidth == 16.
2966	void handlePairwiseShadowOrIntrinsic(IntrinsicInst &I, unsigned Shards,
2967	int ReinterpretElemWidth) {
2968	assert(I.arg_size() == `1` \|\| I.arg_size() == `2`);
2969
2970	assert(I.getType()->isVectorTy());
2971	assert(I.getArgOperand(`0`)->getType()->isVectorTy());
2972
2973	FixedVectorType *ParamType =
2974	cast<FixedVectorType>(Val: I.getArgOperand(i: `0`)->getType());
2975	assert((I.arg_size() != `2`) \|\|
2976	(ParamType == cast<FixedVectorType>(I.getArgOperand(`1`)->getType())));
2977
2978	[[maybe_unused]] FixedVectorType *ReturnType =
2979	cast<FixedVectorType>(Val: I.getType());
2980	assert(ParamType->getNumElements() * I.arg_size() ==
2981	`2` * ReturnType->getNumElements());
2982
2983	IRBuilder<> IRB(&I);
2984
2985	FixedVectorType ReinterpretShadowTy = nullptr*;
2986	assert(isAligned(Align(ReinterpretElemWidth),
2987	ParamType->getPrimitiveSizeInBits()));
2988	ReinterpretShadowTy = FixedVectorType::get(
2989	ElementType: IRB.getIntNTy(N: ReinterpretElemWidth),
2990	NumElts: ParamType->getPrimitiveSizeInBits() / ReinterpretElemWidth);
2991
2992	// Horizontal OR of shadow
2993	Value *FirstArgShadow = getShadow(I: &I, i: `0`);
2994	FirstArgShadow = IRB.CreateBitCast(V: FirstArgShadow, DestTy: ReinterpretShadowTy);
2995
2996	// If we had two parameters each with an odd number of elements, the total
2997	// number of elements is even, but we have never seen this in extant
2998	// instruction sets, so we enforce that each parameter must have an even
2999	// number of elements.
3000	assert(isAligned(
3001	Align(`2`),
3002	cast<FixedVectorType>(FirstArgShadow->getType())->getNumElements()));
3003
3004	Value SecondArgShadow = nullptr*;
3005	if (I.arg_size() == `2`) {
3006	SecondArgShadow = getShadow(I: &I, i: `1`);
3007	SecondArgShadow = IRB.CreateBitCast(V: SecondArgShadow, DestTy: ReinterpretShadowTy);
3008	}
3009
3010	Value OrShadow = horizontalReduce(I, /ReductionFactor=/*`2`, Shards,
3011	VectorA: FirstArgShadow, VectorB: SecondArgShadow);
3012
3013	OrShadow = CreateShadowCast(IRB, V: OrShadow, dstTy: getShadowTy(V: &I));
3014
3015	setShadow(V: &I, SV: OrShadow);
3016	setOriginForNaryOp(I);
3017	}
3018
3019	void visitFNeg(UnaryOperator &I) { handleShadowOr(I); }
3020
3021	// Handle multiplication by constant.
3022	//
3023	// Handle a special case of multiplication by constant that may have one or
3024	// more zeros in the lower bits. This makes corresponding number of lower bits
3025	// of the result zero as well. We model it by shifting the other operand
3026	// shadow left by the required number of bits. Effectively, we transform
3027	// (X (A * 2*B)) to ((X << B) A) and instrument (X << B) as (Sx << B).*
3028	// We use multiplication by 2N instead of shift to cover the case of
3029	// multiplication by 0, which may occur in some elements of a vector operand.
3030	void handleMulByConstant(BinaryOperator &I, Constant *ConstArg,
3031	Value *OtherArg) {
3032	Constant *ShadowMul;
3033	Type *Ty = ConstArg->getType();
3034	if (auto *VTy = dyn_cast<VectorType>(Val: Ty)) {
3035	unsigned NumElements = cast<FixedVectorType>(Val: VTy)->getNumElements();
3036	Type *EltTy = VTy->getElementType();
3037	SmallVector<Constant *, `16`> Elements;
3038	for (unsigned Idx = `0`; Idx < NumElements; ++Idx) {
3039	if (ConstantInt *Elt =
3040	dyn_cast<ConstantInt>(Val: ConstArg->getAggregateElement(Elt: Idx))) {
3041	const APInt &V = Elt->getValue();
3042	APInt V2 = APInt (V.getBitWidth(), `1`) << V.countr_zero();
3043	Elements.push_back(Elt: ConstantInt::get(Ty: EltTy, V: V2));
3044	} else {
3045	Elements.push_back(Elt: ConstantInt::get(Ty: EltTy, V: `1`));
3046	}
3047	}
3048	ShadowMul = ConstantVector::get(V: Elements);
3049	} else {
3050	if (ConstantInt *Elt = dyn_cast<ConstantInt>(Val: ConstArg)) {
3051	const APInt &V = Elt->getValue();
3052	APInt V2 = APInt (V.getBitWidth(), `1`) << V.countr_zero();
3053	ShadowMul = ConstantInt::get(Ty, V: V2);
3054	} else {
3055	ShadowMul = ConstantInt::get(Ty, V: `1`);
3056	}
3057	}
3058
3059	IRBuilder<> IRB(&I);
3060	setShadow(V: &I,
3061	SV: IRB.CreateMul(LHS: getShadow(V: OtherArg), RHS: ShadowMul, Name: "msprop_mul_cst"));
3062	setOrigin(V: &I, Origin: getOrigin(V: OtherArg));
3063	}
3064
3065	void visitMul(BinaryOperator &I) {
3066	Constant *constOp0 = dyn_cast<Constant>(Val: I.getOperand(i_nocapture: `0`));
3067	Constant *constOp1 = dyn_cast<Constant>(Val: I.getOperand(i_nocapture: `1`));
3068	if (constOp0 && !constOp1)
3069	handleMulByConstant(I, ConstArg: constOp0, OtherArg: I.getOperand(i_nocapture: `1`));
3070	else if (constOp1 && !constOp0)
3071	handleMulByConstant(I, ConstArg: constOp1, OtherArg: I.getOperand(i_nocapture: `0`));
3072	else
3073	handleShadowOr(I);
3074	}
3075
3076	void visitFAdd(BinaryOperator &I) { handleShadowOr(I); }
3077	void visitFSub(BinaryOperator &I) { handleShadowOr(I); }
3078	void visitFMul(BinaryOperator &I) { handleShadowOr(I); }
3079	void visitAdd(BinaryOperator &I) { handleShadowOr(I); }
3080	void visitSub(BinaryOperator &I) { handleShadowOr(I); }
3081	void visitXor(BinaryOperator &I) { handleShadowOr(I); }
3082
3083	void handleIntegerDiv(Instruction &I) {
3084	IRBuilder<> IRB(&I);
3085	// Strict on the second argument.
3086	insertCheckShadowOf(Val: I.getOperand(i: `1`), OrigIns: &I);
3087	setShadow(V: &I, SV: getShadow(I: &I, i: `0`));
3088	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
3089	}
3090
3091	void visitUDiv(BinaryOperator &I) { handleIntegerDiv(I); }
3092	void visitSDiv(BinaryOperator &I) { handleIntegerDiv(I); }
3093	void visitURem(BinaryOperator &I) { handleIntegerDiv(I); }
3094	void visitSRem(BinaryOperator &I) { handleIntegerDiv(I); }
3095
3096	// Floating point division is side-effect free. We can not require that the
3097	// divisor is fully initialized and must propagate shadow. See PR37523.
3098	void visitFDiv(BinaryOperator &I) { handleShadowOr(I); }
3099	void visitFRem(BinaryOperator &I) { handleShadowOr(I); }
3100
3101	/// Instrument == and != comparisons.
3102	///
3103	/// Sometimes the comparison result is known even if some of the bits of the
3104	/// arguments are not.
3105	void handleEqualityComparison(ICmpInst &I) {
3106	IRBuilder<> IRB(&I);
3107	Value *A = I.getOperand(i_nocapture: `0`);
3108	Value *B = I.getOperand(i_nocapture: `1`);
3109	Value *Sa = getShadow(V: A);
3110	Value *Sb = getShadow(V: B);
3111
3112	Value *Si = propagateEqualityComparison(IRB, A, B, Sa, Sb);
3113
3114	setShadow(V: &I, SV: Si);
3115	setOriginForNaryOp(I);
3116	}
3117
3118	/// Instrument relational comparisons.
3119	///
3120	/// This function does exact shadow propagation for all relational
3121	/// comparisons of integers, pointers and vectors of those.
3122	/// FIXME: output seems suboptimal when one of the operands is a constant
3123	void handleRelationalComparisonExact(ICmpInst &I) {
3124	IRBuilder<> IRB(&I);
3125	Value *A = I.getOperand(i_nocapture: `0`);
3126	Value *B = I.getOperand(i_nocapture: `1`);
3127	Value *Sa = getShadow(V: A);
3128	Value *Sb = getShadow(V: B);
3129
3130	// Get rid of pointers and vectors of pointers.
3131	// For ints (and vectors of ints), types of A and Sa match,
3132	// and this is a no-op.
3133	A = IRB.CreatePointerCast(V: A, DestTy: Sa->getType());
3134	B = IRB.CreatePointerCast(V: B, DestTy: Sb->getType());
3135
3136	// Let [a0, a1] be the interval of possible values of A, taking into account
3137	// its undefined bits. Let [b0, b1] be the interval of possible values of B.
3138	// Then (A cmp B) is defined iff (a0 cmp b1) == (a1 cmp b0).
3139	bool IsSigned = I.isSigned();
3140
3141	auto GetMinMaxUnsigned = [&](Value V, Value S) {
3142	if (IsSigned) {
3143	// Sign-flip to map from signed range to unsigned range. Relation A vs B
3144	// should be preserved, if checked with `getUnsignedPredicate()`.
3145	// Relationship between Amin, Amax, Bmin, Bmax also will not be
3146	// affected, as they are created by effectively adding/substructing from
3147	// A (or B) a value, derived from shadow, with no overflow, either
3148	// before or after sign flip.
3149	APInt MinVal =
3150	APInt::getSignedMinValue(numBits: V->getType()->getScalarSizeInBits());
3151	V = IRB.CreateXor(LHS: V, RHS: ConstantInt::get(Ty: V->getType(), V: MinVal));
3152	}
3153	// Minimize undefined bits.
3154	Value *Min = IRB.CreateAnd(LHS: V, RHS: IRB.CreateNot(V: S));
3155	Value *Max = IRB.CreateOr(LHS: V, RHS: S);
3156	return std::make_pair(x&: Min, y&: Max);
3157	};
3158
3159	auto [Amin, Amax] = GetMinMaxUnsigned(A, Sa);
3160	auto [Bmin, Bmax] = GetMinMaxUnsigned(B, Sb);
3161	Value *S1 = IRB.CreateICmp(P: I.getUnsignedPredicate(), LHS: Amin, RHS: Bmax);
3162	Value *S2 = IRB.CreateICmp(P: I.getUnsignedPredicate(), LHS: Amax, RHS: Bmin);
3163
3164	Value *Si = IRB.CreateXor(LHS: S1, RHS: S2);
3165	setShadow(V: &I, SV: Si);
3166	setOriginForNaryOp(I);
3167	}
3168
3169	/// Instrument signed relational comparisons.
3170	///
3171	/// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest
3172	/// bit of the shadow. Everything else is delegated to handleShadowOr().
3173	void handleSignedRelationalComparison(ICmpInst &I) {
3174	Constant *constOp;
3175	Value op = nullptr*;
3176	CmpInst::Predicate pre;
3177	if ((constOp = dyn_cast<Constant>(Val: I.getOperand(i_nocapture: `1`)))) {
3178	op = I.getOperand(i_nocapture: `0`);
3179	pre = I.getPredicate();
3180	} else if ((constOp = dyn_cast<Constant>(Val: I.getOperand(i_nocapture: `0`)))) {
3181	op = I.getOperand(i_nocapture: `1`);
3182	pre = I.getSwappedPredicate();
3183	} else {
3184	handleShadowOr(I);
3185	return;
3186	}
3187
3188	if ((constOp->isNullValue() &&
3189	(pre == CmpInst::ICMP_SLT \|\| pre == CmpInst::ICMP_SGE)) \|\|
3190	(constOp->isAllOnesValue() &&
3191	(pre == CmpInst::ICMP_SGT \|\| pre == CmpInst::ICMP_SLE))) {
3192	IRBuilder<> IRB(&I);
3193	Value *Shadow = IRB.CreateICmpSLT(LHS: getShadow(V: op), RHS: getCleanShadow(V: op),
3194	Name: "_msprop_icmp_s");
3195	setShadow(V: &I, SV: Shadow);
3196	setOrigin(V: &I, Origin: getOrigin(V: op));
3197	} else {
3198	handleShadowOr(I);
3199	}
3200	}
3201
3202	void visitICmpInst(ICmpInst &I) {
3203	if (!ClHandleICmp) {
3204	handleShadowOr(I);
3205	return;
3206	}
3207	if (I.isEquality()) {
3208	handleEqualityComparison(I);
3209	return;
3210	}
3211
3212	assert(I.isRelational());
3213	if (ClHandleICmpExact) {
3214	handleRelationalComparisonExact(I);
3215	return;
3216	}
3217	if (I.isSigned()) {
3218	handleSignedRelationalComparison(I);
3219	return;
3220	}
3221
3222	assert(I.isUnsigned());
3223	if ((isa<Constant>(Val: I.getOperand(i_nocapture: `0`)) \|\| isa<Constant>(Val: I.getOperand(i_nocapture: `1`)))) {
3224	handleRelationalComparisonExact(I);
3225	return;
3226	}
3227
3228	handleShadowOr(I);
3229	}
3230
3231	void visitFCmpInst(FCmpInst &I) { handleShadowOr(I); }
3232
3233	void handleShift(BinaryOperator &I) {
3234	IRBuilder<> IRB(&I);
3235	// If any of the S2 bits are poisoned, the whole thing is poisoned.
3236	// Otherwise perform the same shift on S1.
3237	Value *S1 = getShadow(I: &I, i: `0`);
3238	Value *S2 = getShadow(I: &I, i: `1`);
3239	Value *S2Conv =
3240	IRB.CreateSExt(V: IRB.CreateICmpNE(LHS: S2, RHS: getCleanShadow(V: S2)), DestTy: S2->getType());
3241	Value *V2 = I.getOperand(i_nocapture: `1`);
3242	Value *Shift = IRB.CreateBinOp(Opc: I.getOpcode(), LHS: S1, RHS: V2);
3243	setShadow(V: &I, SV: IRB.CreateOr(LHS: Shift, RHS: S2Conv));
3244	setOriginForNaryOp(I);
3245	}
3246
3247	void visitShl(BinaryOperator &I) { handleShift(I); }
3248	void visitAShr(BinaryOperator &I) { handleShift(I); }
3249	void visitLShr(BinaryOperator &I) { handleShift(I); }
3250
3251	void handleFunnelShift(IntrinsicInst &I) {
3252	IRBuilder<> IRB(&I);
3253	// If any of the S2 bits are poisoned, the whole thing is poisoned.
3254	// Otherwise perform the same shift on S0 and S1.
3255	Value *S0 = getShadow(I: &I, i: `0`);
3256	Value *S1 = getShadow(I: &I, i: `1`);
3257	Value *S2 = getShadow(I: &I, i: `2`);
3258	Value *S2Conv =
3259	IRB.CreateSExt(V: IRB.CreateICmpNE(LHS: S2, RHS: getCleanShadow(V: S2)), DestTy: S2->getType());
3260	Value *V2 = I.getOperand(i_nocapture: `2`);
3261	Value *Shift = IRB.CreateIntrinsic(ID: I.getIntrinsicID(), Types: S2Conv->getType(),
3262	Args: {S0, S1, V2});
3263	setShadow(V: &I, SV: IRB.CreateOr(LHS: Shift, RHS: S2Conv));
3264	setOriginForNaryOp(I);
3265	}
3266
3267	/// Instrument llvm.memmove
3268	///
3269	/// At this point we don't know if llvm.memmove will be inlined or not.
3270	/// If we don't instrument it and it gets inlined,
3271	/// our interceptor will not kick in and we will lose the memmove.
3272	/// If we instrument the call here, but it does not get inlined,
3273	/// we will memmove the shadow twice: which is bad in case
3274	/// of overlapping regions. So, we simply lower the intrinsic to a call.
3275	///
3276	/// Similar situation exists for memcpy and memset.
3277	void visitMemMoveInst(MemMoveInst &I) {
3278	getShadow(V: I.getArgOperand(i: `1`)); // Ensure shadow initialized
3279	IRBuilder<> IRB(&I);
3280	IRB.CreateCall(Callee: MS.MemmoveFn,
3281	Args: {I.getArgOperand(i: `0`), I.getArgOperand(i: `1`),
3282	IRB.CreateIntCast(V: I.getArgOperand(i: `2`), DestTy: MS.IntptrTy, isSigned: false)});
3283	I.eraseFromParent();
3284	}
3285
3286	/// Instrument memcpy
3287	///
3288	/// Similar to memmove: avoid copying shadow twice. This is somewhat
3289	/// unfortunate as it may slowdown small constant memcpys.
3290	/// FIXME: consider doing manual inline for small constant sizes and proper
3291	/// alignment.
3292	///
3293	/// Note: This also handles memcpy.inline, which promises no calls to external
3294	/// functions as an optimization. However, with instrumentation enabled this
3295	/// is difficult to promise; additionally, we know that the MSan runtime
3296	/// exists and provides __msan_memcpy(). Therefore, we assume that with
3297	/// instrumentation it's safe to turn memcpy.inline into a call to
3298	/// __msan_memcpy(). Should this be wrong, such as when implementing memcpy()
3299	/// itself, instrumentation should be disabled with the no_sanitize attribute.
3300	void visitMemCpyInst(MemCpyInst &I) {
3301	getShadow(V: I.getArgOperand(i: `1`)); // Ensure shadow initialized
3302	IRBuilder<> IRB(&I);
3303	IRB.CreateCall(Callee: MS.MemcpyFn,
3304	Args: {I.getArgOperand(i: `0`), I.getArgOperand(i: `1`),
3305	IRB.CreateIntCast(V: I.getArgOperand(i: `2`), DestTy: MS.IntptrTy, isSigned: false)});
3306	I.eraseFromParent();
3307	}
3308
3309	// Same as memcpy.
3310	void visitMemSetInst(MemSetInst &I) {
3311	IRBuilder<> IRB(&I);
3312	IRB.CreateCall(
3313	Callee: MS.MemsetFn,
3314	Args: {I.getArgOperand(i: `0`),
3315	IRB.CreateIntCast(V: I.getArgOperand(i: `1`), DestTy: IRB.getInt32Ty(), isSigned: false),
3316	IRB.CreateIntCast(V: I.getArgOperand(i: `2`), DestTy: MS.IntptrTy, isSigned: false)});
3317	I.eraseFromParent();
3318	}
3319
3320	void visitVAStartInst(VAStartInst &I) { VAHelper ->visitVAStartInst(I); }
3321
3322	void visitVACopyInst(VACopyInst &I) { VAHelper ->visitVACopyInst(I); }
3323
3324	/// Handle vector store-like intrinsics.
3325	///
3326	/// Instrument intrinsics that look like a simple SIMD store: writes memory,
3327	/// has 1 pointer argument and 1 vector argument, returns void.
3328	bool handleVectorStoreIntrinsic(IntrinsicInst &I) {
3329	assert(I.arg_size() == `2`);
3330
3331	IRBuilder<> IRB(&I);
3332	Value *Addr = I.getArgOperand(i: `0`);
3333	Value *Shadow = getShadow(I: &I, i: `1`);
3334	Value ShadowPtr, OriginPtr;
3335
3336	// We don't know the pointer alignment (could be unaligned SSE store!).
3337	// Have to assume to worst case.
3338	std::tie(args&: ShadowPtr, args&: OriginPtr) = getShadowOriginPtr(
3339	Addr, IRB, ShadowTy: Shadow->getType(), Alignment: Align (`1`), /isStore/ true);
3340	IRB.CreateAlignedStore(Val: Shadow, Ptr: ShadowPtr, Align: Align (`1`));
3341
3342	if (ClCheckAccessAddress)
3343	insertCheckShadowOf(Val: Addr, OrigIns: &I);
3344
3345	// FIXME: factor out common code from materializeStores
3346	if (MS.TrackOrigins)
3347	IRB.CreateStore(Val: getOrigin(I: &I, i: `1`), Ptr: OriginPtr);
3348	return true;
3349	}
3350
3351	/// Handle vector load-like intrinsics.
3352	///
3353	/// Instrument intrinsics that look like a simple SIMD load: reads memory,
3354	/// has 1 pointer argument, returns a vector.
3355	bool handleVectorLoadIntrinsic(IntrinsicInst &I) {
3356	assert(I.arg_size() == `1`);
3357
3358	IRBuilder<> IRB(&I);
3359	Value *Addr = I.getArgOperand(i: `0`);
3360
3361	Type *ShadowTy = getShadowTy(V: &I);
3362	Value ShadowPtr = nullptr, OriginPtr = nullptr;
3363	if (PropagateShadow) {
3364	// We don't know the pointer alignment (could be unaligned SSE load!).
3365	// Have to assume to worst case.
3366	const Align Alignment = Align (`1`);
3367	std::tie(args&: ShadowPtr, args&: OriginPtr) =
3368	getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /isStore/ false);
3369	setShadow(V: &I,
3370	SV: IRB.CreateAlignedLoad(Ty: ShadowTy, Ptr: ShadowPtr, Align: Alignment, Name: "_msld"));
3371	} else {
3372	setShadow(V: &I, SV: getCleanShadow(V: &I));
3373	}
3374
3375	if (ClCheckAccessAddress)
3376	insertCheckShadowOf(Val: Addr, OrigIns: &I);
3377
3378	if (MS.TrackOrigins) {
3379	if (PropagateShadow)
3380	setOrigin(V: &I, Origin: IRB.CreateLoad(Ty: MS.OriginTy, Ptr: OriginPtr));
3381	else
3382	setOrigin(V: &I, Origin: getCleanOrigin());
3383	}
3384	return true;
3385	}
3386
3387	/// Handle (SIMD arithmetic)-like intrinsics.
3388	///
3389	/// Instrument intrinsics with any number of arguments of the same type [],*
3390	/// equal to the return type, plus a specified number of trailing flags of
3391	/// any type.
3392	///
3393	/// [] The type should be simple (no aggregates or pointers; vectors are*
3394	/// fine).
3395	///
3396	/// Caller guarantees that this intrinsic does not access memory.
3397	///
3398	/// TODO: "horizontal"/"pairwise" intrinsics are often incorrectly matched by
3399	/// by this handler. See horizontalReduce().
3400	///
3401	/// TODO: permutation intrinsics are also often incorrectly matched.
3402	[[maybe_unused]] bool
3403	maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I,
3404	unsigned int trailingFlags) {
3405	Type *RetTy = I.getType();
3406	if (!(RetTy->isIntOrIntVectorTy() \|\| RetTy->isFPOrFPVectorTy()))
3407	return false;
3408
3409	unsigned NumArgOperands = I.arg_size();
3410	assert(NumArgOperands >= trailingFlags);
3411	for (unsigned i = `0`; i < NumArgOperands - trailingFlags; ++i) {
3412	Type *Ty = I.getArgOperand(i)->getType();
3413	if (Ty != RetTy)
3414	return false;
3415	}
3416
3417	IRBuilder<> IRB(&I);
3418	ShadowAndOriginCombiner SC(this, IRB);
3419	for (unsigned i = `0`; i < NumArgOperands; ++i)
3420	SC.Add(V: I.getArgOperand(i));
3421	SC.Done(I: &I);
3422
3423	return true;
3424	}
3425
3426	/// Returns whether it was able to heuristically instrument unknown
3427	/// intrinsics.
3428	///
3429	/// The main purpose of this code is to do something reasonable with all
3430	/// random intrinsics we might encounter, most importantly - SIMD intrinsics.
3431	/// We recognize several classes of intrinsics by their argument types and
3432	/// ModRefBehaviour and apply special instrumentation when we are reasonably
3433	/// sure that we know what the intrinsic does.
3434	///
3435	/// We special-case intrinsics where this approach fails. See llvm.bswap
3436	/// handling as an example of that.
3437	bool maybeHandleUnknownIntrinsicUnlogged(IntrinsicInst &I) {
3438	unsigned NumArgOperands = I.arg_size();
3439	if (NumArgOperands == `0`)
3440	return false;
3441
3442	if (NumArgOperands == `2` && I.getArgOperand(i: `0`)->getType()->isPointerTy() &&
3443	I.getArgOperand(i: `1`)->getType()->isVectorTy() &&
3444	I.getType()->isVoidTy() && !I.onlyReadsMemory()) {
3445	// This looks like a vector store.
3446	return handleVectorStoreIntrinsic(I);
3447	}
3448
3449	if (NumArgOperands == `1` && I.getArgOperand(i: `0`)->getType()->isPointerTy() &&
3450	I.getType()->isVectorTy() && I.onlyReadsMemory()) {
3451	// This looks like a vector load.
3452	return handleVectorLoadIntrinsic(I);
3453	}
3454
3455	if (I.doesNotAccessMemory())
3456	if (maybeHandleSimpleNomemIntrinsic(I, /trailingFlags=/`0`))
3457	return true;
3458
3459	// FIXME: detect and handle SSE maskstore/maskload?
3460	// Some cases are now handled in handleAVXMasked{Load,Store}.
3461	return false;
3462	}
3463
3464	bool maybeHandleUnknownIntrinsic(IntrinsicInst &I) {
3465	if (maybeHandleUnknownIntrinsicUnlogged(I)) {
3466	if (ClDumpHeuristicInstructions)
3467	dumpInst(I);
3468
3469	LLVM_DEBUG(dbgs() << "UNKNOWN INSTRUCTION HANDLED HEURISTICALLY: " << I
3470	<< "\n");
3471	return true;
3472	} else
3473	return false;
3474	}
3475
3476	void handleInvariantGroup(IntrinsicInst &I) {
3477	setShadow(V: &I, SV: getShadow(I: &I, i: `0`));
3478	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
3479	}
3480
3481	void handleLifetimeStart(IntrinsicInst &I) {
3482	if (!PoisonStack)
3483	return;
3484	AllocaInst *AI = dyn_cast<AllocaInst>(Val: I.getArgOperand(i: `0`));
3485	if (AI)
3486	LifetimeStartList.push_back(Elt: std::make_pair(x: &I, y&: AI));
3487	}
3488
3489	void handleBswap(IntrinsicInst &I) {
3490	IRBuilder<> IRB(&I);
3491	Value *Op = I.getArgOperand(i: `0`);
3492	Type *OpType = Op->getType();
3493	setShadow(V: &I, SV: IRB.CreateIntrinsic(ID: Intrinsic::bswap, Types: ArrayRef(&OpType, `1`),
3494	Args: getShadow(V: Op)));
3495	setOrigin(V: &I, Origin: getOrigin(V: Op));
3496	}
3497
3498	// Uninitialized bits are ok if they appear after the leading/trailing 0's
3499	// and a 1. If the input is all zero, it is fully initialized iff
3500	// !is_zero_poison.
3501	//
3502	// e.g., for ctlz, with little-endian, if 0/1 are initialized bits with
3503	// concrete value 0/1, and ? is an uninitialized bit:
3504	// - 0001 0??? is fully initialized
3505	// - 000? ???? is fully uninitialized ()*
3506	// - ???? ???? is fully uninitialized
3507	// - 0000 0000 is fully uninitialized if is_zero_poison,
3508	// fully initialized otherwise
3509	//
3510	// () TODO: arguably, since the number of zeros is in the range [3, 8], we*
3511	// only need to poison 4 bits.
3512	//
3513	// OutputShadow =
3514	// ((ConcreteZerosCount >= ShadowZerosCount) && !AllZeroShadow)
3515	// \|\| (is_zero_poison && AllZeroSrc)
3516	void handleCountLeadingTrailingZeros(IntrinsicInst &I) {
3517	IRBuilder<> IRB(&I);
3518	Value *Src = I.getArgOperand(i: `0`);
3519	Value *SrcShadow = getShadow(V: Src);
3520
3521	Value False = IRB.getInt1(V: false*);
3522	Value *ConcreteZerosCount = IRB.CreateIntrinsic(
3523	RetTy: I.getType(), ID: I.getIntrinsicID(), Args: {Src, /is_zero_poison=/False});
3524	Value *ShadowZerosCount = IRB.CreateIntrinsic(
3525	RetTy: I.getType(), ID: I.getIntrinsicID(), Args: {SrcShadow, /is_zero_poison=/False});
3526
3527	Value *CompareConcreteZeros = IRB.CreateICmpUGE(
3528	LHS: ConcreteZerosCount, RHS: ShadowZerosCount, Name: "_mscz_cmp_zeros");
3529
3530	Value *NotAllZeroShadow =
3531	IRB.CreateIsNotNull(Arg: SrcShadow, Name: "_mscz_shadow_not_null");
3532	Value *OutputShadow =
3533	IRB.CreateAnd(LHS: CompareConcreteZeros, RHS: NotAllZeroShadow, Name: "_mscz_main");
3534
3535	// If zero poison is requested, mix in with the shadow
3536	Constant *IsZeroPoison = cast<Constant>(Val: I.getOperand(i_nocapture: `1`));
3537	if (!IsZeroPoison->isNullValue()) {
3538	Value *BoolZeroPoison = IRB.CreateIsNull(Arg: Src, Name: "_mscz_bzp");
3539	OutputShadow = IRB.CreateOr(LHS: OutputShadow, RHS: BoolZeroPoison, Name: "_mscz_bs");
3540	}
3541
3542	OutputShadow = IRB.CreateSExt(V: OutputShadow, DestTy: getShadowTy(V: Src), Name: "_mscz_os");
3543
3544	setShadow(V: &I, SV: OutputShadow);
3545	setOriginForNaryOp(I);
3546	}
3547
3548	/// Handle Arm NEON vector convert intrinsics.
3549	///
3550	/// e.g., <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>)
3551	/// i32 @llvm.aarch64.neon.fcvtms.i32.f64 (double)
3552	///
3553	/// For conversions to or from fixed-point, there is a trailing argument to
3554	/// indicate the fixed-point precision:
3555	/// - <4 x float> llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32)
3556	/// - <4 x i32> llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32)
3557	///
3558	/// For x86 SSE vector convert intrinsics, see
3559	/// handleSSEVectorConvertIntrinsic().
3560	void handleNEONVectorConvertIntrinsic(IntrinsicInst &I, bool FixedPoint) {
3561	if (FixedPoint)
3562	assert(I.arg_size() == `2`);
3563	else
3564	assert(I.arg_size() == `1`);
3565
3566	IRBuilder<> IRB(&I);
3567	Value *S0 = getShadow(I: &I, i: `0`);
3568
3569	if (FixedPoint) {
3570	Value *Precision = I.getOperand(i_nocapture: `1`);
3571	insertCheckShadowOf(Val: Precision, OrigIns: &I);
3572	}
3573
3574	/// For scalars:
3575	/// Since they are converting from floating-point to integer, the output is
3576	/// - fully uninitialized if any* bit of the input is uninitialized*
3577	/// - fully ininitialized if all bits of the input are ininitialized
3578	/// We apply the same principle on a per-field basis for vectors.
3579	Value *OutShadow = IRB.CreateSExt(V: IRB.CreateICmpNE(LHS: S0, RHS: getCleanShadow(V: S0)),
3580	DestTy: getShadowTy(V: &I));
3581	setShadow(V: &I, SV: OutShadow);
3582	setOriginForNaryOp(I);
3583	}
3584
3585	/// Some instructions have additional zero-elements in the return type
3586	/// e.g., <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, ...)
3587	///
3588	/// This function will return a vector type with the same number of elements
3589	/// as the input, but same per-element width as the return value e.g.,
3590	/// <8 x i8>.
3591	FixedVectorType maybeShrinkVectorShadowType(Value Src, IntrinsicInst &I) {
3592	assert(isa<FixedVectorType>(getShadowTy(&I)));
3593	FixedVectorType *ShadowType = cast<FixedVectorType>(Val: getShadowTy(V: &I));
3594
3595	// TODO: generalize beyond 2x?
3596	if (ShadowType->getElementCount() ==
3597	cast<VectorType>(Val: Src->getType())->getElementCount() * `2`)
3598	ShadowType = FixedVectorType::getHalfElementsVectorType(VTy: ShadowType);
3599
3600	assert(ShadowType->getElementCount() ==
3601	cast<VectorType>(Src->getType())->getElementCount());
3602
3603	return ShadowType;
3604	}
3605
3606	/// Doubles the length of a vector shadow (extending with zeros) if necessary
3607	/// to match the length of the shadow for the instruction.
3608	/// If scalar types of the vectors are different, it will use the type of the
3609	/// input vector.
3610	/// This is more type-safe than CreateShadowCast().
3611	Value maybeExtendVectorShadowWithZeros(Value Shadow, IntrinsicInst &I) {
3612	IRBuilder<> IRB(&I);
3613	assert(isa<FixedVectorType>(Shadow->getType()));
3614	assert(isa<FixedVectorType>(I.getType()));
3615
3616	Value *FullShadow = getCleanShadow(V: &I);
3617	unsigned ShadowNumElems =
3618	cast<FixedVectorType>(Val: Shadow->getType())->getNumElements();
3619	unsigned FullShadowNumElems =
3620	cast<FixedVectorType>(Val: FullShadow->getType())->getNumElements();
3621
3622	assert((ShadowNumElems == FullShadowNumElems) \|\|
3623	(ShadowNumElems * `2` == FullShadowNumElems));
3624
3625	if (ShadowNumElems == FullShadowNumElems) {
3626	FullShadow = Shadow;
3627	} else {
3628	// TODO: generalize beyond 2x?
3629	SmallVector<int, `32`> ShadowMask(FullShadowNumElems);
3630	std::iota(first: ShadowMask.begin(), last: ShadowMask.end(), value: `0`);
3631
3632	// Append zeros
3633	FullShadow =
3634	IRB.CreateShuffleVector(V1: Shadow, V2: getCleanShadow(V: Shadow), Mask: ShadowMask);
3635	}
3636
3637	return FullShadow;
3638	}
3639
3640	/// Handle x86 SSE vector conversion.
3641	///
3642	/// e.g., single-precision to half-precision conversion:
3643	/// <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
3644	/// <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0)
3645	///
3646	/// floating-point to integer:
3647	/// <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
3648	/// <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)
3649	///
3650	/// Note: if the output has more elements, they are zero-initialized (and
3651	/// therefore the shadow will also be initialized).
3652	///
3653	/// This differs from handleSSEVectorConvertIntrinsic() because it
3654	/// propagates uninitialized shadow (instead of checking the shadow).
3655	void handleSSEVectorConvertIntrinsicByProp(IntrinsicInst &I,
3656	bool HasRoundingMode) {
3657	if (HasRoundingMode) {
3658	assert(I.arg_size() == `2`);
3659	[[maybe_unused]] Value *RoundingMode = I.getArgOperand(i: `1`);
3660	assert(RoundingMode->getType()->isIntegerTy());
3661	} else {
3662	assert(I.arg_size() == `1`);
3663	}
3664
3665	Value *Src = I.getArgOperand(i: `0`);
3666	assert(Src->getType()->isVectorTy());
3667
3668	// The return type might have more elements than the input.
3669	// Temporarily shrink the return type's number of elements.
3670	VectorType *ShadowType = maybeShrinkVectorShadowType(Src, I);
3671
3672	IRBuilder<> IRB(&I);
3673	Value *S0 = getShadow(I: &I, i: `0`);
3674
3675	/// For scalars:
3676	/// Since they are converting to and/or from floating-point, the output is:
3677	/// - fully uninitialized if any* bit of the input is uninitialized*
3678	/// - fully ininitialized if all bits of the input are ininitialized
3679	/// We apply the same principle on a per-field basis for vectors.
3680	Value *Shadow =
3681	IRB.CreateSExt(V: IRB.CreateICmpNE(LHS: S0, RHS: getCleanShadow(V: S0)), DestTy: ShadowType);
3682
3683	// The return type might have more elements than the input.
3684	// Extend the return type back to its original width if necessary.
3685	Value *FullShadow = maybeExtendVectorShadowWithZeros(Shadow, I);
3686
3687	setShadow(V: &I, SV: FullShadow);
3688	setOriginForNaryOp(I);
3689	}
3690
3691	// Instrument x86 SSE vector convert intrinsic.
3692	//
3693	// This function instruments intrinsics like cvtsi2ss:
3694	// %Out = int_xxx_cvtyyy(%ConvertOp)
3695	// or
3696	// %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
3697	// Intrinsic converts \p NumUsedElements elements of \p ConvertOp to the same
3698	// number \p Out elements, and (if has 2 arguments) copies the rest of the
3699	// elements from \p CopyOp.
3700	// In most cases conversion involves floating-point value which may trigger a
3701	// hardware exception when not fully initialized. For this reason we require
3702	// \p ConvertOp[0:NumUsedElements] to be fully initialized and trap otherwise.
3703	// We copy the shadow of \p CopyOp[NumUsedElements:] to \p
3704	// Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
3705	// return a fully initialized value.
3706	//
3707	// For Arm NEON vector convert intrinsics, see
3708	// handleNEONVectorConvertIntrinsic().
3709	void handleSSEVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements,
3710	bool HasRoundingMode = false) {
3711	IRBuilder<> IRB(&I);
3712	Value CopyOp, ConvertOp;
3713
3714	assert((!HasRoundingMode \|\|
3715	isa<ConstantInt>(I.getArgOperand(I.arg_size() - `1`))) &&
3716	"Invalid rounding mode");
3717
3718	switch (I.arg_size() - HasRoundingMode) {
3719	case `2`:
3720	CopyOp = I.getArgOperand(i: `0`);
3721	ConvertOp = I.getArgOperand(i: `1`);
3722	break;
3723	case `1`:
3724	ConvertOp = I.getArgOperand(i: `0`);
3725	CopyOp = nullptr;
3726	break;
3727	default:
3728	llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
3729	}
3730
3731	// The first NumUsedElements* elements of ConvertOp are converted to the*
3732	// same number of output elements. The rest of the output is copied from
3733	// CopyOp, or (if not available) filled with zeroes.
3734	// Combine shadow for elements of ConvertOp that are used in this operation,
3735	// and insert a check.
3736	// FIXME: consider propagating shadow of ConvertOp, at least in the case of
3737	// int->any conversion.
3738	Value *ConvertShadow = getShadow(V: ConvertOp);
3739	Value AggShadow = nullptr*;
3740	if (ConvertOp->getType()->isVectorTy()) {
3741	AggShadow = IRB.CreateExtractElement(
3742	Vec: ConvertShadow, Idx: ConstantInt::get(Ty: IRB.getInt32Ty(), V: `0`));
3743	for (int i = `1`; i < NumUsedElements; ++i) {
3744	Value *MoreShadow = IRB.CreateExtractElement(
3745	Vec: ConvertShadow, Idx: ConstantInt::get(Ty: IRB.getInt32Ty(), V: i));
3746	AggShadow = IRB.CreateOr(LHS: AggShadow, RHS: MoreShadow);
3747	}
3748	} else {
3749	AggShadow = ConvertShadow;
3750	}
3751	assert(AggShadow->getType()->isIntegerTy());
3752	insertCheckShadow(Shadow: AggShadow, Origin: getOrigin(V: ConvertOp), OrigIns: &I);
3753
3754	// Build result shadow by zero-filling parts of CopyOp shadow that come from
3755	// ConvertOp.
3756	if (CopyOp) {
3757	assert(CopyOp->getType() == I.getType());
3758	assert(CopyOp->getType()->isVectorTy());
3759	Value *ResultShadow = getShadow(V: CopyOp);
3760	Type *EltTy = cast<VectorType>(Val: ResultShadow->getType())->getElementType();
3761	for (int i = `0`; i < NumUsedElements; ++i) {
3762	ResultShadow = IRB.CreateInsertElement(
3763	Vec: ResultShadow, NewElt: ConstantInt::getNullValue(Ty: EltTy),
3764	Idx: ConstantInt::get(Ty: IRB.getInt32Ty(), V: i));
3765	}
3766	setShadow(V: &I, SV: ResultShadow);
3767	setOrigin(V: &I, Origin: getOrigin(V: CopyOp));
3768	} else {
3769	setShadow(V: &I, SV: getCleanShadow(V: &I));
3770	setOrigin(V: &I, Origin: getCleanOrigin());
3771	}
3772	}
3773
3774	// Given a scalar or vector, extract lower 64 bits (or less), and return all
3775	// zeroes if it is zero, and all ones otherwise.
3776	Value Lower64ShadowExtend(IRBuilder<> &IRB, Value S, Type *T) {
3777	if (S->getType()->isVectorTy())
3778	S = CreateShadowCast(IRB, V: S, dstTy: IRB.getInt64Ty(), / Signed / true);
3779	assert(S->getType()->getPrimitiveSizeInBits() <= `64`);
3780	Value *S2 = IRB.CreateICmpNE(LHS: S, RHS: getCleanShadow(V: S));
3781	return CreateShadowCast(IRB, V: S2, dstTy: T, / Signed / true);
3782	}
3783
3784	// Given a vector, extract its first element, and return all
3785	// zeroes if it is zero, and all ones otherwise.
3786	Value LowerElementShadowExtend(IRBuilder<> &IRB, Value S, Type *T) {
3787	Value *S1 = IRB.CreateExtractElement(Vec: S, Idx: (uint64_t)`0`);
3788	Value *S2 = IRB.CreateICmpNE(LHS: S1, RHS: getCleanShadow(V: S1));
3789	return CreateShadowCast(IRB, V: S2, dstTy: T, / Signed / true);
3790	}
3791
3792	Value VariableShadowExtend(IRBuilder<> &IRB, Value S) {
3793	Type *T = S->getType();
3794	assert(T->isVectorTy());
3795	Value *S2 = IRB.CreateICmpNE(LHS: S, RHS: getCleanShadow(V: S));
3796	return IRB.CreateSExt(V: S2, DestTy: T);
3797	}
3798
3799	// Instrument vector shift intrinsic.
3800	//
3801	// This function instruments intrinsics like int_x86_avx2_psll_w.
3802	// Intrinsic shifts %In by %ShiftSize bits.
3803	// %ShiftSize may be a vector. In that case the lower 64 bits determine shift
3804	// size, and the rest is ignored. Behavior is defined even if shift size is
3805	// greater than register (or field) width.
3806	void handleVectorShiftIntrinsic(IntrinsicInst &I, bool Variable) {
3807	assert(I.arg_size() == `2`);
3808	IRBuilder<> IRB(&I);
3809	// If any of the S2 bits are poisoned, the whole thing is poisoned.
3810	// Otherwise perform the same shift on S1.
3811	Value *S1 = getShadow(I: &I, i: `0`);
3812	Value *S2 = getShadow(I: &I, i: `1`);
3813	Value *S2Conv = Variable ? VariableShadowExtend(IRB, S: S2)
3814	: Lower64ShadowExtend(IRB, S: S2, T: getShadowTy(V: &I));
3815	Value *V1 = I.getOperand(i_nocapture: `0`);
3816	Value *V2 = I.getOperand(i_nocapture: `1`);
3817	Value *Shift = IRB.CreateCall(FTy: I.getFunctionType(), Callee: I.getCalledOperand(),
3818	Args: {IRB.CreateBitCast(V: S1, DestTy: V1->getType()), V2});
3819	Shift = IRB.CreateBitCast(V: Shift, DestTy: getShadowTy(V: &I));
3820	setShadow(V: &I, SV: IRB.CreateOr(LHS: Shift, RHS: S2Conv));
3821	setOriginForNaryOp(I);
3822	}
3823
3824	// Get an MMX-sized (64-bit) vector type, or optionally, other sized
3825	// vectors.
3826	Type getMMXVectorTy(unsigned* EltSizeInBits,
3827	unsigned X86_MMXSizeInBits = `64`) {
3828	assert(EltSizeInBits != `0` && (X86_MMXSizeInBits % EltSizeInBits) == `0` &&
3829	"Illegal MMX vector element size");
3830	return FixedVectorType::get(ElementType: IntegerType::get(C&: *MS.C, NumBits: EltSizeInBits),
3831	NumElts: X86_MMXSizeInBits / EltSizeInBits);
3832	}
3833
3834	// Returns a signed counterpart for an (un)signed-saturate-and-pack
3835	// intrinsic.
3836	Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
3837	switch (id) {
3838	case Intrinsic::x86_sse2_packsswb_128:
3839	case Intrinsic::x86_sse2_packuswb_128:
3840	return Intrinsic::x86_sse2_packsswb_128;
3841
3842	case Intrinsic::x86_sse2_packssdw_128:
3843	case Intrinsic::x86_sse41_packusdw:
3844	return Intrinsic::x86_sse2_packssdw_128;
3845
3846	case Intrinsic::x86_avx2_packsswb:
3847	case Intrinsic::x86_avx2_packuswb:
3848	return Intrinsic::x86_avx2_packsswb;
3849
3850	case Intrinsic::x86_avx2_packssdw:
3851	case Intrinsic::x86_avx2_packusdw:
3852	return Intrinsic::x86_avx2_packssdw;
3853
3854	case Intrinsic::x86_mmx_packsswb:
3855	case Intrinsic::x86_mmx_packuswb:
3856	return Intrinsic::x86_mmx_packsswb;
3857
3858	case Intrinsic::x86_mmx_packssdw:
3859	return Intrinsic::x86_mmx_packssdw;
3860
3861	case Intrinsic::x86_avx512_packssdw_512:
3862	case Intrinsic::x86_avx512_packusdw_512:
3863	return Intrinsic::x86_avx512_packssdw_512;
3864
3865	case Intrinsic::x86_avx512_packsswb_512:
3866	case Intrinsic::x86_avx512_packuswb_512:
3867	return Intrinsic::x86_avx512_packsswb_512;
3868
3869	default:
3870	llvm_unreachable("unexpected intrinsic id");
3871	}
3872	}
3873
3874	// Instrument vector pack intrinsic.
3875	//
3876	// This function instruments intrinsics like x86_mmx_packsswb, that
3877	// packs elements of 2 input vectors into half as many bits with saturation.
3878	// Shadow is propagated with the signed variant of the same intrinsic applied
3879	// to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
3880	// MMXEltSizeInBits is used only for x86mmx arguments.
3881	//
3882	// TODO: consider using GetMinMaxUnsigned() to handle saturation precisely
3883	void handleVectorPackIntrinsic(IntrinsicInst &I,
3884	unsigned MMXEltSizeInBits = `0`) {
3885	assert(I.arg_size() == `2`);
3886	IRBuilder<> IRB(&I);
3887	Value *S1 = getShadow(I: &I, i: `0`);
3888	Value *S2 = getShadow(I: &I, i: `1`);
3889	assert(S1->getType()->isVectorTy());
3890
3891	// SExt and ICmpNE below must apply to individual elements of input vectors.
3892	// In case of x86mmx arguments, cast them to appropriate vector types and
3893	// back.
3894	Type *T =
3895	MMXEltSizeInBits ? getMMXVectorTy(EltSizeInBits: MMXEltSizeInBits) : S1->getType();
3896	if (MMXEltSizeInBits) {
3897	S1 = IRB.CreateBitCast(V: S1, DestTy: T);
3898	S2 = IRB.CreateBitCast(V: S2, DestTy: T);
3899	}
3900	Value *S1_ext =
3901	IRB.CreateSExt(V: IRB.CreateICmpNE(LHS: S1, RHS: Constant::getNullValue(Ty: T)), DestTy: T);
3902	Value *S2_ext =
3903	IRB.CreateSExt(V: IRB.CreateICmpNE(LHS: S2, RHS: Constant::getNullValue(Ty: T)), DestTy: T);
3904	if (MMXEltSizeInBits) {
3905	S1_ext = IRB.CreateBitCast(V: S1_ext, DestTy: getMMXVectorTy(EltSizeInBits: `64`));
3906	S2_ext = IRB.CreateBitCast(V: S2_ext, DestTy: getMMXVectorTy(EltSizeInBits: `64`));
3907	}
3908
3909	Value *S = IRB.CreateIntrinsic(ID: getSignedPackIntrinsic(id: I.getIntrinsicID()),
3910	Args: {S1_ext, S2_ext}, /FMFSource=/nullptr,
3911	Name: "_msprop_vector_pack");
3912	if (MMXEltSizeInBits)
3913	S = IRB.CreateBitCast(V: S, DestTy: getShadowTy(V: &I));
3914	setShadow(V: &I, SV: S);
3915	setOriginForNaryOp(I);
3916	}
3917
3918	// Convert `Mask` into `<n x i1>`.
3919	Constant createDppMask(unsigned* Width, unsigned Mask) {
3920	SmallVector<Constant *, `4`> R(Width);
3921	for (auto &M : R) {
3922	M = ConstantInt::getBool(Context&: F.getContext(), V: Mask & `1`);
3923	Mask >>= `1`;
3924	}
3925	return ConstantVector::get(V: R);
3926	}
3927
3928	// Calculate output shadow as array of booleans `<n x i1>`, assuming if any
3929	// arg is poisoned, entire dot product is poisoned.
3930	Value findDppPoisonedOutput(IRBuilder<> &IRB, Value S, unsigned SrcMask,
3931	unsigned DstMask) {
3932	const unsigned Width =
3933	cast<FixedVectorType>(Val: S->getType())->getNumElements();
3934
3935	S = IRB.CreateSelect(C: createDppMask(Width, Mask: SrcMask), True: S,
3936	False: Constant::getNullValue(Ty: S->getType()));
3937	Value *SElem = IRB.CreateOrReduce(Src: S);
3938	Value *IsClean = IRB.CreateIsNull(Arg: SElem, Name: "_msdpp");
3939	Value *DstMaskV = createDppMask(Width, Mask: DstMask);
3940
3941	return IRB.CreateSelect(
3942	C: IsClean, True: Constant::getNullValue(Ty: DstMaskV->getType()), False: DstMaskV);
3943	}
3944
// See `Intel Intrinsics Guide` for `_dp_p*` instructions.
3946	//
3947	// 2 and 4 element versions produce single scalar of dot product, and then
3948	// puts it into elements of output vector, selected by 4 lowest bits of the
3949	// mask. Top 4 bits of the mask control which elements of input to use for dot
3950	// product.
3951	//
3952	// 8 element version mask still has only 4 bit for input, and 4 bit for output
3953	// mask. According to the spec it just operates as 4 element version on first
3954	// 4 elements of inputs and output, and then on last 4 elements of inputs and
3955	// output.
3956	void handleDppIntrinsic(IntrinsicInst &I) {
3957	IRBuilder<> IRB(&I);
3958
3959	Value *S0 = getShadow(I: &I, i: `0`);
3960	Value *S1 = getShadow(I: &I, i: `1`);
3961	Value *S = IRB.CreateOr(LHS: S0, RHS: S1);
3962
3963	const unsigned Width =
3964	cast<FixedVectorType>(Val: S->getType())->getNumElements();
3965	assert(Width == `2` \|\| Width == `4` \|\| Width == `8`);
3966
3967	const unsigned Mask = cast<ConstantInt>(Val: I.getArgOperand(i: `2`))->getZExtValue();
3968	const unsigned SrcMask = Mask >> `4`;
3969	const unsigned DstMask = Mask & `0xf`;
3970
3971	// Calculate shadow as `<n x i1>`.
3972	Value *SI1 = findDppPoisonedOutput(IRB, S, SrcMask, DstMask);
3973	if (Width == `8`) {
3974	// First 4 elements of shadow are already calculated. `makeDppShadow`
3975	// operats on 32 bit masks, so we can just shift masks, and repeat.
3976	SI1 = IRB.CreateOr(
3977	LHS: SI1, RHS: findDppPoisonedOutput(IRB, S, SrcMask: SrcMask << `4`, DstMask: DstMask << `4`));
3978	}
3979	// Extend to real size of shadow, poisoning either all or none bits of an
3980	// element.
3981	S = IRB.CreateSExt(V: SI1, DestTy: S->getType(), Name: "_msdpp");
3982
3983	setShadow(V: &I, SV: S);
3984	setOriginForNaryOp(I);
3985	}
3986
3987	Value convertBlendvToSelectMask(IRBuilder<> &IRB, Value C) {
3988	C = CreateAppToShadowCast(IRB, V: C);
3989	FixedVectorType *FVT = cast<FixedVectorType>(Val: C->getType());
3990	unsigned ElSize = FVT->getElementType()->getPrimitiveSizeInBits();
3991	C = IRB.CreateAShr(LHS: C, RHS: ElSize - `1`);
3992	FVT = FixedVectorType::get(ElementType: IRB.getInt1Ty(), NumElts: FVT->getNumElements());
3993	return IRB.CreateTrunc(V: C, DestTy: FVT);
3994	}
3995
3996	// `blendv(f, t, c)` is effectively `select(c[top_bit], t, f)`.
3997	void handleBlendvIntrinsic(IntrinsicInst &I) {
3998	Value *C = I.getOperand(i_nocapture: `2`);
3999	Value *T = I.getOperand(i_nocapture: `1`);
4000	Value *F = I.getOperand(i_nocapture: `0`);
4001
4002	Value *Sc = getShadow(I: &I, i: `2`);
4003	Value Oc = MS.TrackOrigins ? getOrigin(V: C) : nullptr*;
4004
4005	{
4006	IRBuilder<> IRB(&I);
4007	// Extract top bit from condition and its shadow.
4008	C = convertBlendvToSelectMask(IRB, C);
4009	Sc = convertBlendvToSelectMask(IRB, C: Sc);
4010
4011	setShadow(V: C, SV: Sc);
4012	setOrigin(V: C, Origin: Oc);
4013	}
4014
4015	handleSelectLikeInst(I, B: C, C: T, D: F);
4016	}
4017
4018	// Instrument sum-of-absolute-differences intrinsic.
4019	void handleVectorSadIntrinsic(IntrinsicInst &I, bool IsMMX = false) {
4020	const unsigned SignificantBitsPerResultElement = `16`;
4021	Type ResTy = IsMMX ? IntegerType::get(C&: MS.C, NumBits: `64`) : I.getType();
4022	unsigned ZeroBitsPerResultElement =
4023	ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement;
4024
4025	IRBuilder<> IRB(&I);
4026	auto *Shadow0 = getShadow(I: &I, i: `0`);
4027	auto *Shadow1 = getShadow(I: &I, i: `1`);
4028	Value *S = IRB.CreateOr(LHS: Shadow0, RHS: Shadow1);
4029	S = IRB.CreateBitCast(V: S, DestTy: ResTy);
4030	S = IRB.CreateSExt(V: IRB.CreateICmpNE(LHS: S, RHS: Constant::getNullValue(Ty: ResTy)),
4031	DestTy: ResTy);
4032	S = IRB.CreateLShr(LHS: S, RHS: ZeroBitsPerResultElement);
4033	S = IRB.CreateBitCast(V: S, DestTy: getShadowTy(V: &I));
4034	setShadow(V: &I, SV: S);
4035	setOriginForNaryOp(I);
4036	}
4037
4038	// Instrument dot-product / multiply-add(-accumulate)? intrinsics.
4039	//
4040	// e.g., Two operands:
4041	// <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
4042	//
4043	// Two operands which require an EltSizeInBits override:
4044	// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b)
4045	//
4046	// Three operands:
4047	// <4 x i32> @llvm.x86.avx512.vpdpbusd.128
4048	// (<4 x i32> %s, <16 x i8> %a, <16 x i8> %b)
4049	// <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16
4050	// (<2 x float> %acc, <4 x bfloat> %a, <4 x bfloat> %b)
4051	// (these are equivalent to multiply-add on %a and %b, followed by
4052	// adding/"accumulating" %s. "Accumulation" stores the result in one
4053	// of the source registers, but this accumulate vs. add distinction
4054	// is lost when dealing with LLVM intrinsics.)
4055	//
4056	// ZeroPurifies means that multiplying a known-zero with an uninitialized
4057	// value results in an initialized value. This is applicable for integer
4058	// multiplication, but not floating-point (counter-example: NaN).
4059	void handleVectorDotProductIntrinsic(IntrinsicInst &I,
4060	unsigned ReductionFactor,
4061	bool ZeroPurifies,
4062	unsigned EltSizeInBits,
4063	enum OddOrEvenLanes Lanes) {
4064	IRBuilder<> IRB(&I);
4065
4066	[[maybe_unused]] FixedVectorType *ReturnType =
4067	cast<FixedVectorType>(Val: I.getType());
4068	assert(isa<FixedVectorType>(ReturnType));
4069
4070	// Vectors A and B, and shadows
4071	Value Va = nullptr*;
4072	Value Vb = nullptr*;
4073	Value Sa = nullptr*;
4074	Value Sb = nullptr*;
4075
4076	assert(I.arg_size() == `2` \|\| I.arg_size() == `3`);
4077	if (I.arg_size() == `2`) {
4078	assert(Lanes == kBothLanes);
4079
4080	Va = I.getOperand(i_nocapture: `0`);
4081	Vb = I.getOperand(i_nocapture: `1`);
4082
4083	Sa = getShadow(I: &I, i: `0`);
4084	Sb = getShadow(I: &I, i: `1`);
4085	} else if (I.arg_size() == `3`) {
4086	// Operand 0 is the accumulator. We will deal with that below.
4087	Va = I.getOperand(i_nocapture: `1`);
4088	Vb = I.getOperand(i_nocapture: `2`);
4089
4090	Sa = getShadow(I: &I, i: `1`);
4091	Sb = getShadow(I: &I, i: `2`);
4092
4093	if (Lanes == kEvenLanes \|\| Lanes == kOddLanes) {
4094	// Convert < S0, S1, S2, S3, S4, S5, S6, S7 >
4095	// to < S0, S0, S2, S2, S4, S4, S6, S6 > (if even)
4096	// to < S1, S1, S3, S3, S5, S5, S7, S7 > (if odd)
4097	//
4098	// Note: for aarch64.neon.bfmlalb/t, the odd/even-indexed values are
4099	// zeroed, not duplicated. However, for shadow propagation, this
4100	// distinction is unimportant because Step 1 below will squeeze
4101	// each pair of elements (e.g., [S0, S0]) into a single bit, and
4102	// we only care if it is fully initialized.
4103
4104	FixedVectorType *InputShadowType = cast<FixedVectorType>(Val: Sa->getType());
4105	unsigned Width = InputShadowType->getNumElements();
4106
4107	Sa = IRB.CreateShuffleVector(
4108	V: Sa, Mask: getPclmulMask(Width, /OddElements=/Lanes == kOddLanes));
4109	Sb = IRB.CreateShuffleVector(
4110	V: Sb, Mask: getPclmulMask(Width, /OddElements=/Lanes == kOddLanes));
4111	}
4112	}
4113
4114	FixedVectorType *ParamType = cast<FixedVectorType>(Val: Va->getType());
4115	assert(ParamType == Vb->getType());
4116
4117	assert(ParamType->getPrimitiveSizeInBits() ==
4118	ReturnType->getPrimitiveSizeInBits());
4119
4120	if (I.arg_size() == `3`) {
4121	[[maybe_unused]] auto *AccumulatorType =
4122	cast<FixedVectorType>(Val: I.getOperand(i_nocapture: `0`)->getType());
4123	assert(AccumulatorType == ReturnType);
4124	}
4125
4126	FixedVectorType *ImplicitReturnType =
4127	cast<FixedVectorType>(Val: getShadowTy(OrigTy: ReturnType));
4128	// Step 1: instrument multiplication of corresponding vector elements
4129	if (EltSizeInBits) {
4130	ImplicitReturnType = cast<FixedVectorType>(
4131	Val: getMMXVectorTy(EltSizeInBits: EltSizeInBits * ReductionFactor,
4132	X86_MMXSizeInBits: ParamType->getPrimitiveSizeInBits()));
4133	ParamType = cast<FixedVectorType>(
4134	Val: getMMXVectorTy(EltSizeInBits, X86_MMXSizeInBits: ParamType->getPrimitiveSizeInBits()));
4135
4136	Va = IRB.CreateBitCast(V: Va, DestTy: ParamType);
4137	Vb = IRB.CreateBitCast(V: Vb, DestTy: ParamType);
4138
4139	Sa = IRB.CreateBitCast(V: Sa, DestTy: getShadowTy(OrigTy: ParamType));
4140	Sb = IRB.CreateBitCast(V: Sb, DestTy: getShadowTy(OrigTy: ParamType));
4141	} else {
4142	assert(ParamType->getNumElements() ==
4143	ReturnType->getNumElements() * ReductionFactor);
4144	}
4145
4146	// Each element of the vector is represented by a single bit (poisoned or
4147	// not) e.g., <8 x i1>.
4148	Value *SaNonZero = IRB.CreateIsNotNull(Arg: Sa);
4149	Value *SbNonZero = IRB.CreateIsNotNull(Arg: Sb);
4150	Value *And;
4151	if (ZeroPurifies) {
4152	// Multiplying an initialized* zero by an uninitialized element results*
4153	// in an initialized zero element.
4154	//
4155	// This is analogous to bitwise AND, where "AND" of 0 and a poisoned value
4156	// results in an unpoisoned value.
4157	Value *VaInt = Va;
4158	Value *VbInt = Vb;
4159	if (!Va->getType()->isIntegerTy()) {
4160	VaInt = CreateAppToShadowCast(IRB, V: Va);
4161	VbInt = CreateAppToShadowCast(IRB, V: Vb);
4162	}
4163
4164	// We check for non-zero on a per-element basis, not per-bit.
4165	Value *VaNonZero = IRB.CreateIsNotNull(Arg: VaInt);
4166	Value *VbNonZero = IRB.CreateIsNotNull(Arg: VbInt);
4167
4168	And = handleBitwiseAnd(IRB, V1: VaNonZero, V2: VbNonZero, S1: SaNonZero, S2: SbNonZero);
4169	} else {
4170	And = IRB.CreateOr(Ops: {SaNonZero, SbNonZero});
4171	}
4172
4173	// Extend <8 x i1> to <8 x i16>.
4174	// (The real pmadd intrinsic would have computed intermediate values of
4175	// <8 x i32>, but that is irrelevant for our shadow purposes because we
4176	// consider each element to be either fully initialized or fully
4177	// uninitialized.)
4178	And = IRB.CreateSExt(V: And, DestTy: Sa->getType());
4179
4180	// Step 2: instrument horizontal add
4181	// We don't need bit-precise horizontalReduce because we only want to check
4182	// if each pair/quad of elements is fully zero.
4183	// Cast to <4 x i32>.
4184	Value *Horizontal = IRB.CreateBitCast(V: And, DestTy: ImplicitReturnType);
4185
4186	// Compute <4 x i1>, then extend back to <4 x i32>.
4187	Value *OutShadow = IRB.CreateSExt(
4188	V: IRB.CreateICmpNE(LHS: Horizontal,
4189	RHS: Constant::getNullValue(Ty: Horizontal->getType())),
4190	DestTy: ImplicitReturnType);
4191
4192	// Cast it back to the required fake return type (if MMX: <1 x i64>; for
4193	// AVX, it is already correct).
4194	if (EltSizeInBits)
4195	OutShadow = CreateShadowCast(IRB, V: OutShadow, dstTy: getShadowTy(V: &I));
4196
4197	// Step 3 (if applicable): instrument accumulator
4198	if (I.arg_size() == `3`)
4199	OutShadow = IRB.CreateOr(LHS: OutShadow, RHS: getShadow(I: &I, i: `0`));
4200
4201	setShadow(V: &I, SV: OutShadow);
4202	setOriginForNaryOp(I);
4203	}
4204
4205	// Instrument compare-packed intrinsic.
4206	//
4207	// x86 has the predicate as the third operand, which is ImmArg e.g.,
4208	// - <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8)
4209	// - <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8)
4210	//
4211	// while Arm has separate intrinsics for >= and > e.g.,
4212	// - <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32
4213	// (<2 x float> %A, <2 x float>)
4214	// - <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32
4215	// (<2 x float> %A, <2 x float>)
4216	//
4217	// Bonus: this also handles scalar cases e.g.,
4218	// - i32 @llvm.aarch64.neon.facgt.i32.f32(float %A, float %B)
4219	void handleVectorComparePackedIntrinsic(IntrinsicInst &I,
4220	bool PredicateAsOperand) {
4221	if (PredicateAsOperand) {
4222	assert(I.arg_size() == `3`);
4223	assert(I.paramHasAttr(`2`, Attribute::ImmArg));
4224	} else
4225	assert(I.arg_size() == `2`);
4226
4227	IRBuilder<> IRB(&I);
4228
4229	// Basically, an or followed by sext(icmp ne 0) to end up with all-zeros or
4230	// all-ones shadow.
4231	Type *ResTy = getShadowTy(V: &I);
4232	auto *Shadow0 = getShadow(I: &I, i: `0`);
4233	auto *Shadow1 = getShadow(I: &I, i: `1`);
4234	Value *S0 = IRB.CreateOr(LHS: Shadow0, RHS: Shadow1);
4235	Value *S = IRB.CreateSExt(
4236	V: IRB.CreateICmpNE(LHS: S0, RHS: Constant::getNullValue(Ty: ResTy)), DestTy: ResTy);
4237	setShadow(V: &I, SV: S);
4238	setOriginForNaryOp(I);
4239	}
4240
4241	// Instrument compare-scalar intrinsic.
// This handles both cmp* intrinsics which return the result in the first
// element of a vector, and comi* which return the result as i32.
4244	void handleVectorCompareScalarIntrinsic(IntrinsicInst &I) {
4245	IRBuilder<> IRB(&I);
4246	auto *Shadow0 = getShadow(I: &I, i: `0`);
4247	auto *Shadow1 = getShadow(I: &I, i: `1`);
4248	Value *S0 = IRB.CreateOr(LHS: Shadow0, RHS: Shadow1);
4249	Value *S = LowerElementShadowExtend(IRB, S: S0, T: getShadowTy(V: &I));
4250	setShadow(V: &I, SV: S);
4251	setOriginForNaryOp(I);
4252	}
4253
4254	// Instrument generic vector reduction intrinsics
4255	// by ORing together all their fields.
4256	//
4257	// If AllowShadowCast is true, the return type does not need to be the same
4258	// type as the fields
4259	// e.g., declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>)
4260	void handleVectorReduceIntrinsic(IntrinsicInst &I, bool AllowShadowCast) {
4261	assert(I.arg_size() == `1`);
4262
4263	IRBuilder<> IRB(&I);
4264	Value *S = IRB.CreateOrReduce(Src: getShadow(I: &I, i: `0`));
4265	if (AllowShadowCast)
4266	S = CreateShadowCast(IRB, V: S, dstTy: getShadowTy(V: &I));
4267	else
4268	assert(S->getType() == getShadowTy(&I));
4269	setShadow(V: &I, SV: S);
4270	setOriginForNaryOp(I);
4271	}
4272
4273	// Similar to handleVectorReduceIntrinsic but with an initial starting value.
4274	// e.g., call float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float>
4275	// %a1)
4276	// shadow = shadow[a0] \| shadow[a1.0] \| shadow[a1.1]
4277	//
4278	// The type of the return value, initial starting value, and elements of the
4279	// vector must be identical.
4280	void handleVectorReduceWithStarterIntrinsic(IntrinsicInst &I) {
4281	assert(I.arg_size() == `2`);
4282
4283	IRBuilder<> IRB(&I);
4284	Value *Shadow0 = getShadow(I: &I, i: `0`);
4285	Value *Shadow1 = IRB.CreateOrReduce(Src: getShadow(I: &I, i: `1`));
4286	assert(Shadow0->getType() == Shadow1->getType());
4287	Value *S = IRB.CreateOr(LHS: Shadow0, RHS: Shadow1);
4288	assert(S->getType() == getShadowTy(&I));
4289	setShadow(V: &I, SV: S);
4290	setOriginForNaryOp(I);
4291	}
4292
4293	// Instrument vector.reduce.or intrinsic.
4294	// Valid (non-poisoned) set bits in the operand pull low the
4295	// corresponding shadow bits.
4296	void handleVectorReduceOrIntrinsic(IntrinsicInst &I) {
4297	assert(I.arg_size() == `1`);
4298
4299	IRBuilder<> IRB(&I);
4300	Value *OperandShadow = getShadow(I: &I, i: `0`);
4301	Value *OperandUnsetBits = IRB.CreateNot(V: I.getOperand(i_nocapture: `0`));
4302	Value *OperandUnsetOrPoison = IRB.CreateOr(LHS: OperandUnsetBits, RHS: OperandShadow);
4303	// Bit N is clean if any field's bit N is 1 and unpoison
4304	Value *OutShadowMask = IRB.CreateAndReduce(Src: OperandUnsetOrPoison);
4305	// Otherwise, it is clean if every field's bit N is unpoison
4306	Value *OrShadow = IRB.CreateOrReduce(Src: OperandShadow);
4307	Value *S = IRB.CreateAnd(LHS: OutShadowMask, RHS: OrShadow);
4308
4309	setShadow(V: &I, SV: S);
4310	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
4311	}
4312
4313	// Instrument vector.reduce.and intrinsic.
4314	// Valid (non-poisoned) unset bits in the operand pull down the
4315	// corresponding shadow bits.
4316	void handleVectorReduceAndIntrinsic(IntrinsicInst &I) {
4317	assert(I.arg_size() == `1`);
4318
4319	IRBuilder<> IRB(&I);
4320	Value *OperandShadow = getShadow(I: &I, i: `0`);
4321	Value *OperandSetOrPoison = IRB.CreateOr(LHS: I.getOperand(i_nocapture: `0`), RHS: OperandShadow);
4322	// Bit N is clean if any field's bit N is 0 and unpoison
4323	Value *OutShadowMask = IRB.CreateAndReduce(Src: OperandSetOrPoison);
4324	// Otherwise, it is clean if every field's bit N is unpoison
4325	Value *OrShadow = IRB.CreateOrReduce(Src: OperandShadow);
4326	Value *S = IRB.CreateAnd(LHS: OutShadowMask, RHS: OrShadow);
4327
4328	setShadow(V: &I, SV: S);
4329	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
4330	}
4331
4332	void handleStmxcsr(IntrinsicInst &I) {
4333	IRBuilder<> IRB(&I);
4334	Value *Addr = I.getArgOperand(i: `0`);
4335	Type *Ty = IRB.getInt32Ty();
4336	Value *ShadowPtr =
4337	getShadowOriginPtr(Addr, IRB, ShadowTy: Ty, Alignment: Align (`1`), /isStore/ true).first;
4338
4339	IRB.CreateStore(Val: getCleanShadow(OrigTy: Ty), Ptr: ShadowPtr);
4340
4341	if (ClCheckAccessAddress)
4342	insertCheckShadowOf(Val: Addr, OrigIns: &I);
4343	}
4344
4345	void handleLdmxcsr(IntrinsicInst &I) {
4346	if (!InsertChecks)
4347	return;
4348
4349	IRBuilder<> IRB(&I);
4350	Value *Addr = I.getArgOperand(i: `0`);
4351	Type *Ty = IRB.getInt32Ty();
4352	const Align Alignment = Align (`1`);
4353	Value ShadowPtr, OriginPtr;
4354	std::tie(args&: ShadowPtr, args&: OriginPtr) =
4355	getShadowOriginPtr(Addr, IRB, ShadowTy: Ty, Alignment, /isStore/ false);
4356
4357	if (ClCheckAccessAddress)
4358	insertCheckShadowOf(Val: Addr, OrigIns: &I);
4359
4360	Value *Shadow = IRB.CreateAlignedLoad(Ty, Ptr: ShadowPtr, Align: Alignment, Name: "_ldmxcsr");
4361	Value *Origin = MS.TrackOrigins ? IRB.CreateLoad(Ty: MS.OriginTy, Ptr: OriginPtr)
4362	: getCleanOrigin();
4363	insertCheckShadow(Shadow, Origin, OrigIns: &I);
4364	}
4365
4366	void handleMaskedExpandLoad(IntrinsicInst &I) {
4367	IRBuilder<> IRB(&I);
4368	Value *Ptr = I.getArgOperand(i: `0`);
4369	MaybeAlign Align = I.getParamAlign(ArgNo: `0`);
4370	Value *Mask = I.getArgOperand(i: `1`);
4371	Value *PassThru = I.getArgOperand(i: `2`);
4372
4373	if (ClCheckAccessAddress) {
4374	insertCheckShadowOf(Val: Ptr, OrigIns: &I);
4375	insertCheckShadowOf(Val: Mask, OrigIns: &I);
4376	}
4377
4378	if (!PropagateShadow) {
4379	setShadow(V: &I, SV: getCleanShadow(V: &I));
4380	setOrigin(V: &I, Origin: getCleanOrigin());
4381	return;
4382	}
4383
4384	Type *ShadowTy = getShadowTy(V: &I);
4385	Type *ElementShadowTy = cast<VectorType>(Val: ShadowTy)->getElementType();
4386	auto [ShadowPtr, OriginPtr] =
4387	getShadowOriginPtr(Addr: Ptr, IRB, ShadowTy: ElementShadowTy, Alignment: Align, /isStore/ false);
4388
4389	Value *Shadow =
4390	IRB.CreateMaskedExpandLoad(Ty: ShadowTy, Ptr: ShadowPtr, Align, Mask,
4391	PassThru: getShadow(V: PassThru), Name: "_msmaskedexpload");
4392
4393	setShadow(V: &I, SV: Shadow);
4394
4395	// TODO: Store origins.
4396	setOrigin(V: &I, Origin: getCleanOrigin());
4397	}
4398
4399	void handleMaskedCompressStore(IntrinsicInst &I) {
4400	IRBuilder<> IRB(&I);
4401	Value *Values = I.getArgOperand(i: `0`);
4402	Value *Ptr = I.getArgOperand(i: `1`);
4403	MaybeAlign Align = I.getParamAlign(ArgNo: `1`);
4404	Value *Mask = I.getArgOperand(i: `2`);
4405
4406	if (ClCheckAccessAddress) {
4407	insertCheckShadowOf(Val: Ptr, OrigIns: &I);
4408	insertCheckShadowOf(Val: Mask, OrigIns: &I);
4409	}
4410
4411	Value *Shadow = getShadow(V: Values);
4412	Type *ElementShadowTy =
4413	getShadowTy(OrigTy: cast<VectorType>(Val: Values->getType())->getElementType());
4414	auto [ShadowPtr, OriginPtrs] =
4415	getShadowOriginPtr(Addr: Ptr, IRB, ShadowTy: ElementShadowTy, Alignment: Align, /isStore/ true);
4416
4417	IRB.CreateMaskedCompressStore(Val: Shadow, Ptr: ShadowPtr, Align, Mask);
4418
4419	// TODO: Store origins.
4420	}
4421
4422	void handleMaskedGather(IntrinsicInst &I) {
4423	IRBuilder<> IRB(&I);
4424	Value *Ptrs = I.getArgOperand(i: `0`);
4425	const Align Alignment = I.getParamAlign(ArgNo: `0`).valueOrOne();
4426	Value *Mask = I.getArgOperand(i: `1`);
4427	Value *PassThru = I.getArgOperand(i: `2`);
4428
4429	Type *PtrsShadowTy = getShadowTy(V: Ptrs);
4430	if (ClCheckAccessAddress) {
4431	insertCheckShadowOf(Val: Mask, OrigIns: &I);
4432	Value *MaskedPtrShadow = IRB.CreateSelect(
4433	C: Mask, True: getShadow(V: Ptrs), False: Constant::getNullValue(Ty: (PtrsShadowTy)),
4434	Name: "_msmaskedptrs");
4435	insertCheckShadow(Shadow: MaskedPtrShadow, Origin: getOrigin(V: Ptrs), OrigIns: &I);
4436	}
4437
4438	if (!PropagateShadow) {
4439	setShadow(V: &I, SV: getCleanShadow(V: &I));
4440	setOrigin(V: &I, Origin: getCleanOrigin());
4441	return;
4442	}
4443
4444	Type *ShadowTy = getShadowTy(V: &I);
4445	Type *ElementShadowTy = cast<VectorType>(Val: ShadowTy)->getElementType();
4446	auto [ShadowPtrs, OriginPtrs] = getShadowOriginPtr(
4447	Addr: Ptrs, IRB, ShadowTy: ElementShadowTy, Alignment, /isStore/ false);
4448
4449	Value *Shadow =
4450	IRB.CreateMaskedGather(Ty: ShadowTy, Ptrs: ShadowPtrs, Alignment, Mask,
4451	PassThru: getShadow(V: PassThru), Name: "_msmaskedgather");
4452
4453	setShadow(V: &I, SV: Shadow);
4454
4455	// TODO: Store origins.
4456	setOrigin(V: &I, Origin: getCleanOrigin());
4457	}
4458
4459	void handleMaskedScatter(IntrinsicInst &I) {
4460	IRBuilder<> IRB(&I);
4461	Value *Values = I.getArgOperand(i: `0`);
4462	Value *Ptrs = I.getArgOperand(i: `1`);
4463	const Align Alignment = I.getParamAlign(ArgNo: `1`).valueOrOne();
4464	Value *Mask = I.getArgOperand(i: `2`);
4465
4466	Type *PtrsShadowTy = getShadowTy(V: Ptrs);
4467	if (ClCheckAccessAddress) {
4468	insertCheckShadowOf(Val: Mask, OrigIns: &I);
4469	Value *MaskedPtrShadow = IRB.CreateSelect(
4470	C: Mask, True: getShadow(V: Ptrs), False: Constant::getNullValue(Ty: (PtrsShadowTy)),
4471	Name: "_msmaskedptrs");
4472	insertCheckShadow(Shadow: MaskedPtrShadow, Origin: getOrigin(V: Ptrs), OrigIns: &I);
4473	}
4474
4475	Value *Shadow = getShadow(V: Values);
4476	Type *ElementShadowTy =
4477	getShadowTy(OrigTy: cast<VectorType>(Val: Values->getType())->getElementType());
4478	auto [ShadowPtrs, OriginPtrs] = getShadowOriginPtr(
4479	Addr: Ptrs, IRB, ShadowTy: ElementShadowTy, Alignment, /isStore/ true);
4480
4481	IRB.CreateMaskedScatter(Val: Shadow, Ptrs: ShadowPtrs, Alignment, Mask);
4482
4483	// TODO: Store origin.
4484	}
4485
4486	// Intrinsic::masked_store
4487	//
4488	// Note: handleAVXMaskedStore handles AVX/AVX2 variants, though AVX512 masked
4489	// stores are lowered to Intrinsic::masked_store.
4490	void handleMaskedStore(IntrinsicInst &I) {
4491	IRBuilder<> IRB(&I);
4492	Value *V = I.getArgOperand(i: `0`);
4493	Value *Ptr = I.getArgOperand(i: `1`);
4494	const Align Alignment = I.getParamAlign(ArgNo: `1`).valueOrOne();
4495	Value *Mask = I.getArgOperand(i: `2`);
4496	Value *Shadow = getShadow(V);
4497
4498	if (ClCheckAccessAddress) {
4499	insertCheckShadowOf(Val: Ptr, OrigIns: &I);
4500	insertCheckShadowOf(Val: Mask, OrigIns: &I);
4501	}
4502
4503	Value *ShadowPtr;
4504	Value *OriginPtr;
4505	std::tie(args&: ShadowPtr, args&: OriginPtr) = getShadowOriginPtr(
4506	Addr: Ptr, IRB, ShadowTy: Shadow->getType(), Alignment, /isStore/ true);
4507
4508	IRB.CreateMaskedStore(Val: Shadow, Ptr: ShadowPtr, Alignment, Mask);
4509
4510	if (!MS.TrackOrigins)
4511	return;
4512
4513	auto &DL = F.getDataLayout();
4514	paintOrigin(IRB, Origin: getOrigin(V), OriginPtr,
4515	TS: DL.getTypeStoreSize(Ty: Shadow->getType()),
4516	Alignment: std::max(a: Alignment, b: kMinOriginAlignment));
4517	}
4518
4519	// Intrinsic::masked_load
4520	//
4521	// Note: handleAVXMaskedLoad handles AVX/AVX2 variants, though AVX512 masked
4522	// loads are lowered to Intrinsic::masked_load.
4523	void handleMaskedLoad(IntrinsicInst &I) {
4524	IRBuilder<> IRB(&I);
4525	Value *Ptr = I.getArgOperand(i: `0`);
4526	const Align Alignment = I.getParamAlign(ArgNo: `0`).valueOrOne();
4527	Value *Mask = I.getArgOperand(i: `1`);
4528	Value *PassThru = I.getArgOperand(i: `2`);
4529
4530	if (ClCheckAccessAddress) {
4531	insertCheckShadowOf(Val: Ptr, OrigIns: &I);
4532	insertCheckShadowOf(Val: Mask, OrigIns: &I);
4533	}
4534
4535	if (!PropagateShadow) {
4536	setShadow(V: &I, SV: getCleanShadow(V: &I));
4537	setOrigin(V: &I, Origin: getCleanOrigin());
4538	return;
4539	}
4540
4541	Type *ShadowTy = getShadowTy(V: &I);
4542	Value ShadowPtr, OriginPtr;
4543	std::tie(args&: ShadowPtr, args&: OriginPtr) =
4544	getShadowOriginPtr(Addr: Ptr, IRB, ShadowTy, Alignment, /isStore/ false);
4545	setShadow(V: &I, SV: IRB.CreateMaskedLoad(Ty: ShadowTy, Ptr: ShadowPtr, Alignment, Mask,
4546	PassThru: getShadow(V: PassThru), Name: "_msmaskedld"));
4547
4548	if (!MS.TrackOrigins)
4549	return;
4550
4551	// Choose between PassThru's and the loaded value's origins.
4552	Value *MaskedPassThruShadow = IRB.CreateAnd(
4553	LHS: getShadow(V: PassThru), RHS: IRB.CreateSExt(V: IRB.CreateNeg(V: Mask), DestTy: ShadowTy));
4554
4555	Value *NotNull = convertToBool(V: MaskedPassThruShadow, IRB, name: "_mscmp");
4556
4557	Value *PtrOrigin = IRB.CreateLoad(Ty: MS.OriginTy, Ptr: OriginPtr);
4558	Value *Origin = IRB.CreateSelect(C: NotNull, True: getOrigin(V: PassThru), False: PtrOrigin);
4559
4560	setOrigin(V: &I, Origin);
4561	}
4562
4563	// e.g., void @llvm.x86.avx.maskstore.ps.256(ptr, <8 x i32>, <8 x float>)
4564	// dst mask src
4565	//
// AVX512 masked stores are lowered to Intrinsic::masked_store and are handled
// by handleMaskedStore.
4568	//
4569	// This function handles AVX and AVX2 masked stores; these use the MSBs of a
4570	// vector of integers, unlike the LLVM masked intrinsics, which require a
4571	// vector of booleans. X86InstCombineIntrinsic.cpp::simplifyX86MaskedLoad
4572	// mentions that the x86 backend does not know how to efficiently convert
4573	// from a vector of booleans back into the AVX mask format; therefore, they
4574	// (and we) do not reduce AVX/AVX2 masked intrinsics into LLVM masked
4575	// intrinsics.
4576	void handleAVXMaskedStore(IntrinsicInst &I) {
4577	assert(I.arg_size() == `3`);
4578
4579	IRBuilder<> IRB(&I);
4580
4581	Value *Dst = I.getArgOperand(i: `0`);
4582	assert(Dst->getType()->isPointerTy() && "Destination is not a pointer!");
4583
4584	Value *Mask = I.getArgOperand(i: `1`);
4585	assert(isa<VectorType>(Mask->getType()) && "Mask is not a vector!");
4586
4587	Value *Src = I.getArgOperand(i: `2`);
4588	assert(isa<VectorType>(Src->getType()) && "Source is not a vector!");
4589
4590	const Align Alignment = Align (`1`);
4591
4592	Value *SrcShadow = getShadow(V: Src);
4593
4594	if (ClCheckAccessAddress) {
4595	insertCheckShadowOf(Val: Dst, OrigIns: &I);
4596	insertCheckShadowOf(Val: Mask, OrigIns: &I);
4597	}
4598
4599	Value *DstShadowPtr;
4600	Value *DstOriginPtr;
4601	std::tie(args&: DstShadowPtr, args&: DstOriginPtr) = getShadowOriginPtr(
4602	Addr: Dst, IRB, ShadowTy: SrcShadow->getType(), Alignment, /isStore/ true);
4603
4604	SmallVector<Value *, `2`> ShadowArgs;
4605	ShadowArgs.append(NumInputs: `1`, Elt: DstShadowPtr);
4606	ShadowArgs.append(NumInputs: `1`, Elt: Mask);
4607	// The intrinsic may require floating-point but shadows can be arbitrary
4608	// bit patterns, of which some would be interpreted as "invalid"
4609	// floating-point values (NaN etc.); we assume the intrinsic will happily
4610	// copy them.
4611	ShadowArgs.append(NumInputs: `1`, Elt: IRB.CreateBitCast(V: SrcShadow, DestTy: Src->getType()));
4612
4613	CallInst *CI =
4614	IRB.CreateIntrinsic(RetTy: IRB.getVoidTy(), ID: I.getIntrinsicID(), Args: ShadowArgs);
4615	setShadow(V: &I, SV: CI);
4616
4617	if (!MS.TrackOrigins)
4618	return;
4619
4620	// Approximation only
4621	auto &DL = F.getDataLayout();
4622	paintOrigin(IRB, Origin: getOrigin(V: Src), OriginPtr: DstOriginPtr,
4623	TS: DL.getTypeStoreSize(Ty: SrcShadow->getType()),
4624	Alignment: std::max(a: Alignment, b: kMinOriginAlignment));
4625	}
4626
4627	// e.g., <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>)
4628	// return src mask
4629	//
4630	// Masked-off values are replaced with 0, which conveniently also represents
4631	// initialized memory.
4632	//
// AVX512 masked loads are lowered to Intrinsic::masked_load and are handled
// by handleMaskedLoad.
4635	//
4636	// We do not combine this with handleMaskedLoad; see comment in
4637	// handleAVXMaskedStore for the rationale.
4638	//
4639	// This is subtly different than handleIntrinsicByApplyingToShadow(I, 1)
4640	// because we need to apply getShadowOriginPtr, not getShadow, to the first
4641	// parameter.
4642	void handleAVXMaskedLoad(IntrinsicInst &I) {
4643	assert(I.arg_size() == `2`);
4644
4645	IRBuilder<> IRB(&I);
4646
4647	Value *Src = I.getArgOperand(i: `0`);
4648	assert(Src->getType()->isPointerTy() && "Source is not a pointer!");
4649
4650	Value *Mask = I.getArgOperand(i: `1`);
4651	assert(isa<VectorType>(Mask->getType()) && "Mask is not a vector!");
4652
4653	const Align Alignment = Align (`1`);
4654
4655	if (ClCheckAccessAddress) {
4656	insertCheckShadowOf(Val: Mask, OrigIns: &I);
4657	}
4658
4659	Type *SrcShadowTy = getShadowTy(V: Src);
4660	Value SrcShadowPtr, SrcOriginPtr;
4661	std::tie(args&: SrcShadowPtr, args&: SrcOriginPtr) =
4662	getShadowOriginPtr(Addr: Src, IRB, ShadowTy: SrcShadowTy, Alignment, /isStore/ false);
4663
4664	SmallVector<Value *, `2`> ShadowArgs;
4665	ShadowArgs.append(NumInputs: `1`, Elt: SrcShadowPtr);
4666	ShadowArgs.append(NumInputs: `1`, Elt: Mask);
4667
4668	CallInst *CI =
4669	IRB.CreateIntrinsic(RetTy: I.getType(), ID: I.getIntrinsicID(), Args: ShadowArgs);
4670	// The AVX masked load intrinsics do not have integer variants. We use the
4671	// floating-point variants, which will happily copy the shadows even if
4672	// they are interpreted as "invalid" floating-point values (NaN etc.).
4673	setShadow(V: &I, SV: IRB.CreateBitCast(V: CI, DestTy: getShadowTy(V: &I)));
4674
4675	if (!MS.TrackOrigins)
4676	return;
4677
4678	// The "pass-through" value is always zero (initialized). To the extent
4679	// that that results in initialized aligned 4-byte chunks, the origin value
4680	// is ignored. It is therefore correct to simply copy the origin from src.
4681	Value *PtrSrcOrigin = IRB.CreateLoad(Ty: MS.OriginTy, Ptr: SrcOriginPtr);
4682	setOrigin(V: &I, Origin: PtrSrcOrigin);
4683	}
4684
4685	// Test whether the mask indices are initialized, only checking the bits that
4686	// are actually used.
4687	//
4688	// e.g., if Idx is <32 x i16>, only (log2(32) == 5) bits of each index are
4689	// used/checked.
4690	void maskedCheckAVXIndexShadow(IRBuilder<> &IRB, Value Idx, Instruction I) {
4691	assert(isFixedIntVector(Idx));
4692	auto IdxVectorSize =
4693	cast<FixedVectorType>(Val: Idx->getType())->getNumElements();
4694	assert(isPowerOf2_64(IdxVectorSize));
4695
4696	// Compiler isn't smart enough, let's help it
4697	if (isa<Constant>(Val: Idx))
4698	return;
4699
4700	auto *IdxShadow = getShadow(V: Idx);
4701	Value *Truncated = IRB.CreateTrunc(
4702	V: IdxShadow,
4703	DestTy: FixedVectorType::get(ElementType: Type::getIntNTy(C&: *MS.C, N: Log2_64(Value: IdxVectorSize)),
4704	NumElts: IdxVectorSize));
4705	insertCheckShadow(Shadow: Truncated, Origin: getOrigin(V: Idx), OrigIns: I);
4706	}
4707
4708	// Instrument AVX permutation intrinsic.
4709	// We apply the same permutation (argument index 1) to the shadow.
4710	void handleAVXVpermilvar(IntrinsicInst &I) {
4711	IRBuilder<> IRB(&I);
4712	Value *Shadow = getShadow(I: &I, i: `0`);
4713	maskedCheckAVXIndexShadow(IRB, Idx: I.getArgOperand(i: `1`), I: &I);
4714
4715	// Shadows are integer-ish types but some intrinsics require a
4716	// different (e.g., floating-point) type.
4717	Shadow = IRB.CreateBitCast(V: Shadow, DestTy: I.getArgOperand(i: `0`)->getType());
4718	CallInst *CI = IRB.CreateIntrinsic(RetTy: I.getType(), ID: I.getIntrinsicID(),
4719	Args: {Shadow, I.getArgOperand(i: `1`)});
4720
4721	setShadow(V: &I, SV: IRB.CreateBitCast(V: CI, DestTy: getShadowTy(V: &I)));
4722	setOriginForNaryOp(I);
4723	}
4724
4725	// Instrument AVX permutation intrinsic.
4726	// We apply the same permutation (argument index 1) to the shadows.
4727	void handleAVXVpermi2var(IntrinsicInst &I) {
4728	assert(I.arg_size() == `3`);
4729	assert(isa<FixedVectorType>(I.getArgOperand(`0`)->getType()));
4730	assert(isa<FixedVectorType>(I.getArgOperand(`1`)->getType()));
4731	assert(isa<FixedVectorType>(I.getArgOperand(`2`)->getType()));
4732	[[maybe_unused]] auto ArgVectorSize =
4733	cast<FixedVectorType>(Val: I.getArgOperand(i: `0`)->getType())->getNumElements();
4734	assert(cast<FixedVectorType>(I.getArgOperand(`1`)->getType())
4735	->getNumElements() == ArgVectorSize);
4736	assert(cast<FixedVectorType>(I.getArgOperand(`2`)->getType())
4737	->getNumElements() == ArgVectorSize);
4738	assert(I.getArgOperand(`0`)->getType() == I.getArgOperand(`2`)->getType());
4739	assert(I.getType() == I.getArgOperand(`0`)->getType());
4740	assert(I.getArgOperand(`1`)->getType()->isIntOrIntVectorTy());
4741	IRBuilder<> IRB(&I);
4742	Value *AShadow = getShadow(I: &I, i: `0`);
4743	Value *Idx = I.getArgOperand(i: `1`);
4744	Value *BShadow = getShadow(I: &I, i: `2`);
4745
4746	maskedCheckAVXIndexShadow(IRB, Idx, I: &I);
4747
4748	// Shadows are integer-ish types but some intrinsics require a
4749	// different (e.g., floating-point) type.
4750	AShadow = IRB.CreateBitCast(V: AShadow, DestTy: I.getArgOperand(i: `0`)->getType());
4751	BShadow = IRB.CreateBitCast(V: BShadow, DestTy: I.getArgOperand(i: `2`)->getType());
4752	CallInst *CI = IRB.CreateIntrinsic(RetTy: I.getType(), ID: I.getIntrinsicID(),
4753	Args: {AShadow, Idx, BShadow});
4754	setShadow(V: &I, SV: IRB.CreateBitCast(V: CI, DestTy: getShadowTy(V: &I)));
4755	setOriginForNaryOp(I);
4756	}
4757
4758	[[maybe_unused]] static bool isFixedIntVectorTy(const Type *T) {
4759	return isa<FixedVectorType>(Val: T) && T->isIntOrIntVectorTy();
4760	}
4761
4762	[[maybe_unused]] static bool isFixedFPVectorTy(const Type *T) {
4763	return isa<FixedVectorType>(Val: T) && T->isFPOrFPVectorTy();
4764	}
4765
4766	[[maybe_unused]] static bool isFixedIntVector(const Value *V) {
4767	return isFixedIntVectorTy(T: V->getType());
4768	}
4769
4770	[[maybe_unused]] static bool isFixedFPVector(const Value *V) {
4771	return isFixedFPVectorTy(T: V->getType());
4772	}
4773
4774	// e.g., <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512
4775	// (<16 x float> a, <16 x i32> writethru, i16 mask,
4776	// i32 rounding)
4777	//
4778	// Inconveniently, some similar intrinsics have a different operand order:
4779	// <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512
4780	// (<16 x float> a, i32 rounding, <16 x i16> writethru,
4781	// i16 mask)
4782	//
4783	// If the return type has more elements than A, the excess elements are
4784	// zeroed (and the corresponding shadow is initialized).
4785	// <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128
4786	// (<4 x float> a, i32 rounding, <8 x i16> writethru,
4787	// i8 mask)
4788	//
4789	// dst[i] = mask[i] ? convert(a[i]) : writethru[i]
4790	// dst_shadow[i] = mask[i] ? all_or_nothing(a_shadow[i]) : writethru_shadow[i]
4791	// where all_or_nothing(x) is fully uninitialized if x has any
4792	// uninitialized bits
4793	void handleAVX512VectorConvertFPToInt(IntrinsicInst &I, bool LastMask) {
4794	IRBuilder<> IRB(&I);
4795
4796	assert(I.arg_size() == `4`);
4797	Value *A = I.getOperand(i_nocapture: `0`);
4798	Value *WriteThrough;
4799	Value *Mask;
4800	Value *RoundingMode;
4801	if (LastMask) {
4802	WriteThrough = I.getOperand(i_nocapture: `2`);
4803	Mask = I.getOperand(i_nocapture: `3`);
4804	RoundingMode = I.getOperand(i_nocapture: `1`);
4805	} else {
4806	WriteThrough = I.getOperand(i_nocapture: `1`);
4807	Mask = I.getOperand(i_nocapture: `2`);
4808	RoundingMode = I.getOperand(i_nocapture: `3`);
4809	}
4810
4811	assert(isFixedFPVector(A));
4812	assert(isFixedIntVector(WriteThrough));
4813
4814	unsigned ANumElements =
4815	cast<FixedVectorType>(Val: A->getType())->getNumElements();
4816	[[maybe_unused]] unsigned WriteThruNumElements =
4817	cast<FixedVectorType>(Val: WriteThrough->getType())->getNumElements();
4818	assert(ANumElements == WriteThruNumElements \|\|
4819	ANumElements * `2` == WriteThruNumElements);
4820
4821	assert(Mask->getType()->isIntegerTy());
4822	unsigned MaskNumElements = Mask->getType()->getScalarSizeInBits();
4823	assert(ANumElements == MaskNumElements \|\|
4824	ANumElements * `2` == MaskNumElements);
4825
4826	assert(WriteThruNumElements == MaskNumElements);
4827
4828	// Some bits of the mask may be unused, though it's unusual to have partly
4829	// uninitialized bits.
4830	insertCheckShadowOf(Val: Mask, OrigIns: &I);
4831
4832	assert(RoundingMode->getType()->isIntegerTy());
4833	// Only some bits of the rounding mode are used, though it's very
4834	// unusual to have uninitialized bits there (more commonly, it's a
4835	// constant).
4836	insertCheckShadowOf(Val: RoundingMode, OrigIns: &I);
4837
4838	assert(I.getType() == WriteThrough->getType());
4839
4840	Value *AShadow = getShadow(V: A);
4841	AShadow = maybeExtendVectorShadowWithZeros(Shadow: AShadow, I);
4842
4843	if (ANumElements * `2` == MaskNumElements) {
4844	// Ensure that the irrelevant bits of the mask are zero, hence selecting
4845	// from the zeroed shadow instead of the writethrough's shadow.
4846	Mask =
4847	IRB.CreateTrunc(V: Mask, DestTy: IRB.getIntNTy(N: ANumElements), Name: "_ms_mask_trunc");
4848	Mask =
4849	IRB.CreateZExt(V: Mask, DestTy: IRB.getIntNTy(N: MaskNumElements), Name: "_ms_mask_zext");
4850	}
4851
4852	// Convert i16 mask to <16 x i1>
4853	Mask = IRB.CreateBitCast(
4854	V: Mask, DestTy: FixedVectorType::get(ElementType: IRB.getInt1Ty(), NumElts: MaskNumElements),
4855	Name: "_ms_mask_bitcast");
4856
4857	/// For floating-point to integer conversion, the output is:
4858	/// - fully uninitialized if any* bit of the input is uninitialized*
4859	/// - fully ininitialized if all bits of the input are ininitialized
4860	/// We apply the same principle on a per-element basis for vectors.
4861	///
4862	/// We use the scalar width of the return type instead of A's.
4863	AShadow = IRB.CreateSExt(
4864	V: IRB.CreateICmpNE(LHS: AShadow, RHS: getCleanShadow(OrigTy: AShadow->getType())),
4865	DestTy: getShadowTy(V: &I), Name: "_ms_a_shadow");
4866
4867	Value *WriteThroughShadow = getShadow(V: WriteThrough);
4868	Value *Shadow = IRB.CreateSelect(C: Mask, True: AShadow, False: WriteThroughShadow,
4869	Name: "_ms_writethru_select");
4870
4871	setShadow(V: &I, SV: Shadow);
4872	setOriginForNaryOp(I);
4873	}
4874
4875	// Instrument BMI / BMI2 intrinsics.
4876	// All of these intrinsics are Z = I(X, Y)
4877	// where the types of all operands and the result match, and are either i32 or
4878	// i64. The following instrumentation happens to work for all of them:
4879	// Sz = I(Sx, Y) \| (sext (Sy != 0))
4880	void handleBmiIntrinsic(IntrinsicInst &I) {
4881	IRBuilder<> IRB(&I);
4882	Type *ShadowTy = getShadowTy(V: &I);
4883
4884	// If any bit of the mask operand is poisoned, then the whole thing is.
4885	Value *SMask = getShadow(I: &I, i: `1`);
4886	SMask = IRB.CreateSExt(V: IRB.CreateICmpNE(LHS: SMask, RHS: getCleanShadow(OrigTy: ShadowTy)),
4887	DestTy: ShadowTy);
4888	// Apply the same intrinsic to the shadow of the first operand.
4889	Value *S = IRB.CreateCall(Callee: I.getCalledFunction(),
4890	Args: {getShadow(I: &I, i: `0`), I.getOperand(i_nocapture: `1`)});
4891	S = IRB.CreateOr(LHS: SMask, RHS: S);
4892	setShadow(V: &I, SV: S);
4893	setOriginForNaryOp(I);
4894	}
4895
4896	static SmallVector<int, `8`> getPclmulMask(unsigned Width, bool OddElements) {
4897	SmallVector<int, `8`> Mask;
4898	for (unsigned X = OddElements ? `1` : `0`; X < Width; X += `2`) {
4899	Mask.append(NumInputs: `2`, Elt: X);
4900	}
4901	return Mask;
4902	}
4903
4904	// Instrument pclmul intrinsics.
4905	// These intrinsics operate either on odd or on even elements of the input
4906	// vectors, depending on the constant in the 3rd argument, ignoring the rest.
4907	// Replace the unused elements with copies of the used ones, ex:
4908	// (0, 1, 2, 3) -> (0, 0, 2, 2) (even case)
4909	// or
4910	// (0, 1, 2, 3) -> (1, 1, 3, 3) (odd case)
4911	// and then apply the usual shadow combining logic.
4912	void handlePclmulIntrinsic(IntrinsicInst &I) {
4913	IRBuilder<> IRB(&I);
4914	unsigned Width =
4915	cast<FixedVectorType>(Val: I.getArgOperand(i: `0`)->getType())->getNumElements();
4916	assert(isa<ConstantInt>(I.getArgOperand(`2`)) &&
4917	"pclmul 3rd operand must be a constant");
4918	unsigned Imm = cast<ConstantInt>(Val: I.getArgOperand(i: `2`))->getZExtValue();
4919	Value *Shuf0 = IRB.CreateShuffleVector(V: getShadow(I: &I, i: `0`),
4920	Mask: getPclmulMask(Width, OddElements: Imm & `0x01`));
4921	Value *Shuf1 = IRB.CreateShuffleVector(V: getShadow(I: &I, i: `1`),
4922	Mask: getPclmulMask(Width, OddElements: Imm & `0x10`));
4923	ShadowAndOriginCombiner SOC(this, IRB);
4924	SOC.Add(OpShadow: Shuf0, OpOrigin: getOrigin(I: &I, i: `0`));
4925	SOC.Add(OpShadow: Shuf1, OpOrigin: getOrigin(I: &I, i: `1`));
4926	SOC.Done(I: &I);
4927	}
4928
4929	// Instrument _mm__sd\|ss intrinsics*
4930	void handleUnarySdSsIntrinsic(IntrinsicInst &I) {
4931	IRBuilder<> IRB(&I);
4932	unsigned Width =
4933	cast<FixedVectorType>(Val: I.getArgOperand(i: `0`)->getType())->getNumElements();
4934	Value *First = getShadow(I: &I, i: `0`);
4935	Value *Second = getShadow(I: &I, i: `1`);
4936	// First element of second operand, remaining elements of first operand
4937	SmallVector<int, `16`> Mask;
4938	Mask.push_back(Elt: Width);
4939	for (unsigned i = `1`; i < Width; i++)
4940	Mask.push_back(Elt: i);
4941	Value *Shadow = IRB.CreateShuffleVector(V1: First, V2: Second, Mask);
4942
4943	setShadow(V: &I, SV: Shadow);
4944	setOriginForNaryOp(I);
4945	}
4946
4947	void handleVtestIntrinsic(IntrinsicInst &I) {
4948	IRBuilder<> IRB(&I);
4949	Value *Shadow0 = getShadow(I: &I, i: `0`);
4950	Value *Shadow1 = getShadow(I: &I, i: `1`);
4951	Value *Or = IRB.CreateOr(LHS: Shadow0, RHS: Shadow1);
4952	Value *NZ = IRB.CreateICmpNE(LHS: Or, RHS: Constant::getNullValue(Ty: Or->getType()));
4953	Value *Scalar = convertShadowToScalar(V: NZ, IRB);
4954	Value *Shadow = IRB.CreateZExt(V: Scalar, DestTy: getShadowTy(V: &I));
4955
4956	setShadow(V: &I, SV: Shadow);
4957	setOriginForNaryOp(I);
4958	}
4959
4960	void handleBinarySdSsIntrinsic(IntrinsicInst &I) {
4961	IRBuilder<> IRB(&I);
4962	unsigned Width =
4963	cast<FixedVectorType>(Val: I.getArgOperand(i: `0`)->getType())->getNumElements();
4964	Value *First = getShadow(I: &I, i: `0`);
4965	Value *Second = getShadow(I: &I, i: `1`);
4966	Value *OrShadow = IRB.CreateOr(LHS: First, RHS: Second);
4967	// First element of both OR'd together, remaining elements of first operand
4968	SmallVector<int, `16`> Mask;
4969	Mask.push_back(Elt: Width);
4970	for (unsigned i = `1`; i < Width; i++)
4971	Mask.push_back(Elt: i);
4972	Value *Shadow = IRB.CreateShuffleVector(V1: First, V2: OrShadow, Mask);
4973
4974	setShadow(V: &I, SV: Shadow);
4975	setOriginForNaryOp(I);
4976	}
4977
4978	// _mm_round_ps / _mm_round_ps.
4979	// Similar to maybeHandleSimpleNomemIntrinsic except
4980	// the second argument is guaranteed to be a constant integer.
4981	void handleRoundPdPsIntrinsic(IntrinsicInst &I) {
4982	assert(I.getArgOperand(`0`)->getType() == I.getType());
4983	assert(I.arg_size() == `2`);
4984	assert(isa<ConstantInt>(I.getArgOperand(`1`)));
4985
4986	IRBuilder<> IRB(&I);
4987	ShadowAndOriginCombiner SC(this, IRB);
4988	SC.Add(V: I.getArgOperand(i: `0`));
4989	SC.Done(I: &I);
4990	}
4991
4992	// Instrument @llvm.abs intrinsic.
4993	//
4994	// e.g., i32 @llvm.abs.i32 (i32 <Src>, i1 <is_int_min_poison>)
4995	// <4 x i32> @llvm.abs.v4i32(<4 x i32> <Src>, i1 <is_int_min_poison>)
4996	void handleAbsIntrinsic(IntrinsicInst &I) {
4997	assert(I.arg_size() == `2`);
4998	Value *Src = I.getArgOperand(i: `0`);
4999	Value *IsIntMinPoison = I.getArgOperand(i: `1`);
5000
5001	assert(I.getType()->isIntOrIntVectorTy());
5002
5003	assert(Src->getType() == I.getType());
5004
5005	assert(IsIntMinPoison->getType()->isIntegerTy());
5006	assert(IsIntMinPoison->getType()->getIntegerBitWidth() == `1`);
5007
5008	IRBuilder<> IRB(&I);
5009	Value *SrcShadow = getShadow(V: Src);
5010
5011	APInt MinVal =
5012	APInt::getSignedMinValue(numBits: Src->getType()->getScalarSizeInBits());
5013	Value *MinValVec = ConstantInt::get(Ty: Src->getType(), V: MinVal);
5014	Value *SrcIsMin = IRB.CreateICmp(P: CmpInst::ICMP_EQ, LHS: Src, RHS: MinValVec);
5015
5016	Value *PoisonedShadow = getPoisonedShadow(V: Src);
5017	Value *PoisonedIfIntMinShadow =
5018	IRB.CreateSelect(C: SrcIsMin, True: PoisonedShadow, False: SrcShadow);
5019	Value *Shadow =
5020	IRB.CreateSelect(C: IsIntMinPoison, True: PoisonedIfIntMinShadow, False: SrcShadow);
5021
5022	setShadow(V: &I, SV: Shadow);
5023	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
5024	}
5025
5026	void handleIsFpClass(IntrinsicInst &I) {
5027	IRBuilder<> IRB(&I);
5028	Value *Shadow = getShadow(I: &I, i: `0`);
5029	setShadow(V: &I, SV: IRB.CreateICmpNE(LHS: Shadow, RHS: getCleanShadow(V: Shadow)));
5030	setOrigin(V: &I, Origin: getOrigin(I: &I, i: `0`));
5031	}
5032
5033	void handleArithmeticWithOverflow(IntrinsicInst &I) {
5034	IRBuilder<> IRB(&I);
5035	Value *Shadow0 = getShadow(I: &I, i: `0`);
5036	Value *Shadow1 = getShadow(I: &I, i: `1`);
5037	Value *ShadowElt0 = IRB.CreateOr(LHS: Shadow0, RHS: Shadow1);
5038	Value *ShadowElt1 =
5039	IRB.CreateICmpNE(LHS: ShadowElt0, RHS: getCleanShadow(V: ShadowElt0));
5040
5041	Value *Shadow = PoisonValue::get(T: getShadowTy(V: &I));
5042	Shadow = IRB.CreateInsertValue(Agg: Shadow, Val: ShadowElt0, Idxs: `0`);
5043	Shadow = IRB.CreateInsertValue(Agg: Shadow, Val: ShadowElt1, Idxs: `1`);
5044
5045	setShadow(V: &I, SV: Shadow);
5046	setOriginForNaryOp(I);
5047	}
5048
5049	Value extractLowerShadow(IRBuilder<> &IRB, Value V) {
5050	assert(isa<FixedVectorType>(V->getType()));
5051	assert(cast<FixedVectorType>(V->getType())->getNumElements() > `0`);
5052	Value *Shadow = getShadow(V);
5053	return IRB.CreateExtractElement(Vec: Shadow,
5054	Idx: ConstantInt::get(Ty: IRB.getInt32Ty(), V: `0`));
5055	}
5056
5057	// Handle llvm.x86.avx512.mask.pmov{,s,us}..512*
5058	//
5059	// e.g., call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512
5060	// (<8 x i64>, <16 x i8>, i8)
5061	// A WriteThru Mask
5062	//
5063	// call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512
5064	// (<16 x i32>, <16 x i8>, i16)
5065	//
5066	// Dst[i] = Mask[i] ? truncate_or_saturate(A[i]) : WriteThru[i]
5067	// Dst_shadow[i] = Mask[i] ? truncate(A_shadow[i]) : WriteThru_shadow[i]
5068	//
5069	// If Dst has more elements than A, the excess elements are zeroed (and the
5070	// corresponding shadow is initialized).
5071	//
5072	// Note: for PMOV (truncation), handleIntrinsicByApplyingToShadow is precise
5073	// and is much faster than this handler.
5074	void handleAVX512VectorDownConvert(IntrinsicInst &I) {
5075	IRBuilder<> IRB(&I);
5076
5077	assert(I.arg_size() == `3`);
5078	Value *A = I.getOperand(i_nocapture: `0`);
5079	Value *WriteThrough = I.getOperand(i_nocapture: `1`);
5080	Value *Mask = I.getOperand(i_nocapture: `2`);
5081
5082	assert(isFixedIntVector(A));
5083	assert(isFixedIntVector(WriteThrough));
5084
5085	unsigned ANumElements =
5086	cast<FixedVectorType>(Val: A->getType())->getNumElements();
5087	unsigned OutputNumElements =
5088	cast<FixedVectorType>(Val: WriteThrough->getType())->getNumElements();
5089	assert(ANumElements == OutputNumElements \|\|
5090	ANumElements * `2` == OutputNumElements);
5091
5092	assert(Mask->getType()->isIntegerTy());
5093	assert(Mask->getType()->getScalarSizeInBits() == ANumElements);
5094	insertCheckShadowOf(Val: Mask, OrigIns: &I);
5095
5096	assert(I.getType() == WriteThrough->getType());
5097
5098	// Widen the mask, if necessary, to have one bit per element of the output
5099	// vector.
5100	// We want the extra bits to have '1's, so that the CreateSelect will
5101	// select the values from AShadow instead of WriteThroughShadow ("maskless"
5102	// versions of the intrinsics are sometimes implemented using an all-1's
5103	// mask and an undefined value for WriteThroughShadow). We accomplish this
5104	// by using bitwise NOT before and after the ZExt.
5105	if (ANumElements != OutputNumElements) {
5106	Mask = IRB.CreateNot(V: Mask);
5107	Mask = IRB.CreateZExt(V: Mask, DestTy: Type::getIntNTy(C&: *MS.C, N: OutputNumElements),
5108	Name: "_ms_widen_mask");
5109	Mask = IRB.CreateNot(V: Mask);
5110	}
5111	Mask = IRB.CreateBitCast(
5112	V: Mask, DestTy: FixedVectorType::get(ElementType: IRB.getInt1Ty(), NumElts: OutputNumElements));
5113
5114	Value *AShadow = getShadow(V: A);
5115
5116	// The return type might have more elements than the input.
5117	// Temporarily shrink the return type's number of elements.
5118	VectorType *ShadowType = maybeShrinkVectorShadowType(Src: A, I);
5119
5120	// PMOV truncates; PMOVS/PMOVUS uses signed/unsigned saturation.
5121	// This handler treats them all as truncation, which leads to some rare
5122	// false positives in the cases where the truncated bytes could
5123	// unambiguously saturate the value e.g., if A = ??????10 ????????
5124	// (big-endian), the unsigned saturated byte conversion is 11111111 i.e.,
5125	// fully defined, but the truncated byte is ????????.
5126	//
5127	// TODO: use GetMinMaxUnsigned() to handle saturation precisely.
5128	AShadow = IRB.CreateTrunc(V: AShadow, DestTy: ShadowType, Name: "_ms_trunc_shadow");
5129	AShadow = maybeExtendVectorShadowWithZeros(Shadow: AShadow, I);
5130
5131	Value *WriteThroughShadow = getShadow(V: WriteThrough);
5132
5133	Value *Shadow = IRB.CreateSelect(C: Mask, True: AShadow, False: WriteThroughShadow);
5134	setShadow(V: &I, SV: Shadow);
5135	setOriginForNaryOp(I);
5136	}
5137
5138	// Handle llvm.x86.avx512. instructions that take vector(s) of floating-point*
5139	// values and perform an operation whose shadow propagation should be handled
5140	// as all-or-nothing [], with masking provided by a vector and a mask*
5141	// supplied as an integer.
5142	//
5143	// [] if all bits of a vector element are initialized, the output is fully*
5144	// initialized; otherwise, the output is fully uninitialized
5145	//
5146	// e.g., <16 x float> @llvm.x86.avx512.rsqrt14.ps.512
5147	// (<16 x float>, <16 x float>, i16)
5148	// A WriteThru Mask
5149	//
5150	// <2 x double> @llvm.x86.avx512.rcp14.pd.128
5151	// (<2 x double>, <2 x double>, i8)
5152	// A WriteThru Mask
5153	//
5154	// <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512
5155	// (<8 x double>, i32, <8 x double>, i8, i32)
5156	// A Imm WriteThru Mask Rounding
5157	//
5158	// <16 x float> @llvm.x86.avx512.mask.scalef.ps.512
5159	// (<16 x float>, <16 x float>, <16 x float>, i16, i32)
5160	// WriteThru A B Mask Rnd
5161	//
5162	// All operands other than A, B, ..., and WriteThru (e.g., Mask, Imm,
5163	// Rounding) must be fully initialized.
5164	//
5165	// Dst[i] = Mask[i] ? some_op(A[i], B[i], ...)
5166	// : WriteThru[i]
5167	// Dst_shadow[i] = Mask[i] ? all_or_nothing(A_shadow[i] \| B_shadow[i] \| ...)
5168	// : WriteThru_shadow[i]
5169	void handleAVX512VectorGenericMaskedFP(IntrinsicInst &I,
5170	SmallVector<unsigned, `4`> DataIndices,
5171	unsigned WriteThruIndex,
5172	unsigned MaskIndex) {
5173	IRBuilder<> IRB(&I);
5174
5175	unsigned NumArgs = I.arg_size();
5176
5177	assert(WriteThruIndex < NumArgs);
5178	assert(MaskIndex < NumArgs);
5179	assert(WriteThruIndex != MaskIndex);
5180	Value *WriteThru = I.getOperand(i_nocapture: WriteThruIndex);
5181
5182	unsigned OutputNumElements =
5183	cast<FixedVectorType>(Val: WriteThru->getType())->getNumElements();
5184
5185	assert(DataIndices.size() > `0`);
5186
5187	bool isData[`16`] = {false};
5188	assert(NumArgs <= `16`);
5189	for (unsigned i : DataIndices) {
5190	assert(i < NumArgs);
5191	assert(i != WriteThruIndex);
5192	assert(i != MaskIndex);
5193
5194	isData[i] = true;
5195
5196	Value *A = I.getOperand(i_nocapture: i);
5197	assert(isFixedFPVector(A));
5198	[[maybe_unused]] unsigned ANumElements =
5199	cast<FixedVectorType>(Val: A->getType())->getNumElements();
5200	assert(ANumElements == OutputNumElements);
5201	}
5202
5203	Value *Mask = I.getOperand(i_nocapture: MaskIndex);
5204
5205	assert(isFixedFPVector(WriteThru));
5206
5207	for (unsigned i = `0`; i < NumArgs; ++i) {
5208	if (!isData[i] && i != WriteThruIndex) {
5209	// Imm, Mask, Rounding etc. are "control" data, hence we require that
5210	// they be fully initialized.
5211	assert(I.getOperand(i)->getType()->isIntegerTy());
5212	insertCheckShadowOf(Val: I.getOperand(i_nocapture: i), OrigIns: &I);
5213	}
5214	}
5215
5216	// The mask has 1 bit per element of A, but a minimum of 8 bits.
5217	if (Mask->getType()->getScalarSizeInBits() == `8` && OutputNumElements < `8`)
5218	Mask = IRB.CreateTrunc(V: Mask, DestTy: Type::getIntNTy(C&: *MS.C, N: OutputNumElements));
5219	assert(Mask->getType()->getScalarSizeInBits() == OutputNumElements);
5220
5221	assert(I.getType() == WriteThru->getType());
5222
5223	Mask = IRB.CreateBitCast(
5224	V: Mask, DestTy: FixedVectorType::get(ElementType: IRB.getInt1Ty(), NumElts: OutputNumElements));
5225
5226	Value DataShadow = nullptr*;
5227	for (unsigned i : DataIndices) {
5228	Value *A = I.getOperand(i_nocapture: i);
5229	if (DataShadow)
5230	DataShadow = IRB.CreateOr(LHS: DataShadow, RHS: getShadow(V: A));
5231	else
5232	DataShadow = getShadow(V: A);
5233	}
5234
5235	// All-or-nothing shadow
5236	DataShadow =
5237	IRB.CreateSExt(V: IRB.CreateICmpNE(LHS: DataShadow, RHS: getCleanShadow(V: DataShadow)),
5238	DestTy: DataShadow->getType());
5239
5240	Value *WriteThruShadow = getShadow(V: WriteThru);
5241
5242	Value *Shadow = IRB.CreateSelect(C: Mask, True: DataShadow, False: WriteThruShadow);
5243	setShadow(V: &I, SV: Shadow);
5244
5245	setOriginForNaryOp(I);
5246	}
5247
5248	// For sh. compiler intrinsics:*
5249	// llvm.x86.avx512fp16.mask.{add/sub/mul/div/max/min}.sh.round
5250	// (<8 x half>, <8 x half>, <8 x half>, i8, i32)
5251	// A B WriteThru Mask RoundingMode
5252	//
5253	// DstShadow[0] = Mask[0] ? (AShadow[0] \| BShadow[0]) : WriteThruShadow[0]
5254	// DstShadow[1..7] = AShadow[1..7]
5255	void visitGenericScalarHalfwordInst(IntrinsicInst &I) {
5256	IRBuilder<> IRB(&I);
5257
5258	assert(I.arg_size() == `5`);
5259	Value *A = I.getOperand(i_nocapture: `0`);
5260	Value *B = I.getOperand(i_nocapture: `1`);
5261	Value *WriteThrough = I.getOperand(i_nocapture: `2`);
5262	Value *Mask = I.getOperand(i_nocapture: `3`);
5263	Value *RoundingMode = I.getOperand(i_nocapture: `4`);
5264
5265	// Technically, we could probably just check whether the LSB is
5266	// initialized, but intuitively it feels like a partly uninitialized mask
5267	// is unintended, and we should warn the user immediately.
5268	insertCheckShadowOf(Val: Mask, OrigIns: &I);
5269	insertCheckShadowOf(Val: RoundingMode, OrigIns: &I);
5270
5271	assert(isa<FixedVectorType>(A->getType()));
5272	unsigned NumElements =
5273	cast<FixedVectorType>(Val: A->getType())->getNumElements();
5274	assert(NumElements == `8`);
5275	assert(A->getType() == B->getType());
5276	assert(B->getType() == WriteThrough->getType());
5277	assert(Mask->getType()->getPrimitiveSizeInBits() == NumElements);
5278	assert(RoundingMode->getType()->isIntegerTy());
5279
5280	Value *ALowerShadow = extractLowerShadow(IRB, V: A);
5281	Value *BLowerShadow = extractLowerShadow(IRB, V: B);
5282
5283	Value *ABLowerShadow = IRB.CreateOr(LHS: ALowerShadow, RHS: BLowerShadow);
5284
5285	Value *WriteThroughLowerShadow = extractLowerShadow(IRB, V: WriteThrough);
5286
5287	Mask = IRB.CreateBitCast(
5288	V: Mask, DestTy: FixedVectorType::get(ElementType: IRB.getInt1Ty(), NumElts: NumElements));
5289	Value *MaskLower =
5290	IRB.CreateExtractElement(Vec: Mask, Idx: ConstantInt::get(Ty: IRB.getInt32Ty(), V: `0`));
5291
5292	Value *AShadow = getShadow(V: A);
5293	Value *DstLowerShadow =
5294	IRB.CreateSelect(C: MaskLower, True: ABLowerShadow, False: WriteThroughLowerShadow);
5295	Value *DstShadow = IRB.CreateInsertElement(
5296	Vec: AShadow, NewElt: DstLowerShadow, Idx: ConstantInt::get(Ty: IRB.getInt32Ty(), V: `0`),
5297	Name: "_msprop");
5298
5299	setShadow(V: &I, SV: DstShadow);
5300	setOriginForNaryOp(I);
5301	}
5302
5303	// Approximately handle AVX Galois Field Affine Transformation
5304	//
5305	// e.g.,
5306	// <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
5307	// <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
5308	// <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
5309	// Out A x b
5310	// where A and x are packed matrices, b is a vector,
5311	// Out = A x + b in GF(2)*
5312	//
5313	// Multiplication in GF(2) is equivalent to bitwise AND. However, the matrix
5314	// computation also includes a parity calculation.
5315	//
5316	// For the bitwise AND of bits V1 and V2, the exact shadow is:
5317	// Out_Shadow = (V1_Shadow & V2_Shadow)
5318	// \| (V1 & V2_Shadow)
5319	// \| (V1_Shadow & V2 )
5320	//
5321	// We approximate the shadow of gf2p8affineqb using:
5322	// Out_Shadow = gf2p8affineqb(x_Shadow, A_shadow, 0)
5323	// \| gf2p8affineqb(x, A_shadow, 0)
5324	// \| gf2p8affineqb(x_Shadow, A, 0)
5325	// \| set1_epi8(b_Shadow)
5326	//
5327	// This approximation has false negatives: if an intermediate dot-product
5328	// contains an even number of 1's, the parity is 0.
5329	// It has no false positives.
5330	void handleAVXGF2P8Affine(IntrinsicInst &I) {
5331	IRBuilder<> IRB(&I);
5332
5333	assert(I.arg_size() == `3`);
5334	Value *A = I.getOperand(i_nocapture: `0`);
5335	Value *X = I.getOperand(i_nocapture: `1`);
5336	Value *B = I.getOperand(i_nocapture: `2`);
5337
5338	assert(isFixedIntVector(A));
5339	assert(cast<VectorType>(A->getType())
5340	->getElementType()
5341	->getScalarSizeInBits() == `8`);
5342
5343	assert(A->getType() == X->getType());
5344
5345	assert(B->getType()->isIntegerTy());
5346	assert(B->getType()->getScalarSizeInBits() == `8`);
5347
5348	assert(I.getType() == A->getType());
5349
5350	Value *AShadow = getShadow(V: A);
5351	Value *XShadow = getShadow(V: X);
5352	Value *BZeroShadow = getCleanShadow(V: B);
5353
5354	CallInst *AShadowXShadow = IRB.CreateIntrinsic(
5355	RetTy: I.getType(), ID: I.getIntrinsicID(), Args: {XShadow, AShadow, BZeroShadow});
5356	CallInst *AShadowX = IRB.CreateIntrinsic(RetTy: I.getType(), ID: I.getIntrinsicID(),
5357	Args: {X, AShadow, BZeroShadow});
5358	CallInst *XShadowA = IRB.CreateIntrinsic(RetTy: I.getType(), ID: I.getIntrinsicID(),
5359	Args: {XShadow, A, BZeroShadow});
5360
5361	unsigned NumElements = cast<FixedVectorType>(Val: I.getType())->getNumElements();
5362	Value *BShadow = getShadow(V: B);
5363	Value *BBroadcastShadow = getCleanShadow(V: AShadow);
5364	// There is no LLVM IR intrinsic for _mm512_set1_epi8.
5365	// This loop generates a lot of LLVM IR, which we expect that CodeGen will
5366	// lower appropriately (e.g., VPBROADCASTB).
5367	// Besides, b is often a constant, in which case it is fully initialized.
5368	for (unsigned i = `0`; i < NumElements; i++)
5369	BBroadcastShadow = IRB.CreateInsertElement(Vec: BBroadcastShadow, NewElt: BShadow, Idx: i);
5370
5371	setShadow(V: &I, SV: IRB.CreateOr(
5372	Ops: {AShadowXShadow, AShadowX, XShadowA, BBroadcastShadow}));
5373	setOriginForNaryOp(I);
5374	}
5375
5376	// Handle Arm NEON vector load intrinsics (vld).*
5377	//
5378	// The WithLane instructions (ld[234]lane) are similar to:
5379	// call {<4 x i32>, <4 x i32>, <4 x i32>}
5380	// @llvm.aarch64.neon.ld3lane.v4i32.p0
5381	// (<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 %lane, ptr
5382	// %A)
5383	//
5384	// The non-WithLane instructions (ld[234], ld1x[234], ld[234]r) are similar
5385	// to:
5386	// call {<8 x i8>, <8 x i8>} @llvm.aarch64.neon.ld2.v8i8.p0(ptr %A)
5387	void handleNEONVectorLoad(IntrinsicInst &I, bool WithLane) {
5388	unsigned int numArgs = I.arg_size();
5389
5390	// Return type is a struct of vectors of integers or floating-point
5391	assert(I.getType()->isStructTy());
5392	[[maybe_unused]] StructType *RetTy = cast<StructType>(Val: I.getType());
5393	assert(RetTy->getNumElements() > `0`);
5394	assert(RetTy->getElementType(`0`)->isIntOrIntVectorTy() \|\|
5395	RetTy->getElementType(`0`)->isFPOrFPVectorTy());
5396	for (unsigned int i = `0`; i < RetTy->getNumElements(); i++)
5397	assert(RetTy->getElementType(i) == RetTy->getElementType(`0`));
5398
5399	if (WithLane) {
5400	// 2, 3 or 4 vectors, plus lane number, plus input pointer
5401	assert(`4` <= numArgs && numArgs <= `6`);
5402
5403	// Return type is a struct of the input vectors
5404	assert(RetTy->getNumElements() + `2` == numArgs);
5405	for (unsigned int i = `0`; i < RetTy->getNumElements(); i++)
5406	assert(I.getArgOperand(i)->getType() == RetTy->getElementType(`0`));
5407	} else {
5408	assert(numArgs == `1`);
5409	}
5410
5411	IRBuilder<> IRB(&I);
5412
5413	SmallVector<Value *, `6`> ShadowArgs;
5414	if (WithLane) {
5415	for (unsigned int i = `0`; i < numArgs - `2`; i++)
5416	ShadowArgs.push_back(Elt: getShadow(V: I.getArgOperand(i)));
5417
5418	// Lane number, passed verbatim
5419	Value *LaneNumber = I.getArgOperand(i: numArgs - `2`);
5420	ShadowArgs.push_back(Elt: LaneNumber);
5421
5422	// TODO: blend shadow of lane number into output shadow?
5423	insertCheckShadowOf(Val: LaneNumber, OrigIns: &I);
5424	}
5425
5426	Value *Src = I.getArgOperand(i: numArgs - `1`);
5427	assert(Src->getType()->isPointerTy() && "Source is not a pointer!");
5428
5429	Type *SrcShadowTy = getShadowTy(V: Src);
5430	auto [SrcShadowPtr, SrcOriginPtr] =
5431	getShadowOriginPtr(Addr: Src, IRB, ShadowTy: SrcShadowTy, Alignment: Align (`1`), /isStore/ false);
5432	ShadowArgs.push_back(Elt: SrcShadowPtr);
5433
5434	// The NEON vector load instructions handled by this function all have
5435	// integer variants. It is easier to use those rather than trying to cast
5436	// a struct of vectors of floats into a struct of vectors of integers.
5437	CallInst *CI =
5438	IRB.CreateIntrinsic(RetTy: getShadowTy(V: &I), ID: I.getIntrinsicID(), Args: ShadowArgs);
5439	setShadow(V: &I, SV: CI);
5440
5441	if (!MS.TrackOrigins)
5442	return;
5443
5444	Value *PtrSrcOrigin = IRB.CreateLoad(Ty: MS.OriginTy, Ptr: SrcOriginPtr);
5445	setOrigin(V: &I, Origin: PtrSrcOrigin);
5446	}
5447
5448	/// Handle Arm NEON vector store intrinsics (vst{2,3,4}, vst1x_{2,3,4},
5449	/// and vst{2,3,4}lane).
5450	///
5451	/// Arm NEON vector store intrinsics have the output address (pointer) as the
5452	/// last argument, with the initial arguments being the inputs (and lane
5453	/// number for vst{2,3,4}lane). They return void.
5454	///
5455	/// - st4 interleaves the output e.g., st4 (inA, inB, inC, inD, outP) writes
5456	/// abcdabcdabcdabcd... into outP*
5457	/// - st1_x4 is non-interleaved e.g., st1_x4 (inA, inB, inC, inD, outP)
5458	/// writes aaaa...bbbb...cccc...dddd... into outP*
5459	/// - st4lane has arguments of (inA, inB, inC, inD, lane, outP)
5460	/// These instructions can all be instrumented with essentially the same
5461	/// MSan logic, simply by applying the corresponding intrinsic to the shadow.
5462	void handleNEONVectorStoreIntrinsic(IntrinsicInst &I, bool useLane) {
5463	IRBuilder<> IRB(&I);
5464
5465	// Don't use getNumOperands() because it includes the callee
5466	int numArgOperands = I.arg_size();
5467
5468	// The last arg operand is the output (pointer)
5469	assert(numArgOperands >= `1`);
5470	Value *Addr = I.getArgOperand(i: numArgOperands - `1`);
5471	assert(Addr->getType()->isPointerTy());
5472	int skipTrailingOperands = `1`;
5473
5474	if (ClCheckAccessAddress)
5475	insertCheckShadowOf(Val: Addr, OrigIns: &I);
5476
5477	// Second-last operand is the lane number (for vst{2,3,4}lane)
5478	if (useLane) {
5479	skipTrailingOperands++;
5480	assert(numArgOperands >= static_cast<int>(skipTrailingOperands));
5481	assert(isa<IntegerType>(
5482	I.getArgOperand(numArgOperands - skipTrailingOperands)->getType()));
5483	}
5484
5485	SmallVector<Value *, `8`> ShadowArgs;
5486	// All the initial operands are the inputs
5487	for (int i = `0`; i < numArgOperands - skipTrailingOperands; i++) {
5488	assert(isa<FixedVectorType>(I.getArgOperand(i)->getType()));
5489	Value *Shadow = getShadow(I: &I, i);
5490	ShadowArgs.append(NumInputs: `1`, Elt: Shadow);
5491	}
5492
5493	// MSan's GetShadowTy assumes the LHS is the type we want the shadow for
5494	// e.g., for:
5495	// [[TMP5:%.]] = bitcast <16 x i8> [[TMP2]] to i128*
5496	// we know the type of the output (and its shadow) is <16 x i8>.
5497	//
5498	// Arm NEON VST is unusual because the last argument is the output address:
5499	// define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) {
5500	// call void @llvm.aarch64.neon.st2.v16i8.p0
5501	// (<16 x i8> [[A]], <16 x i8> [[B]], ptr [[P]])
5502	// and we have no type information about P's operand. We must manually
5503	// compute the type (<16 x i8> x 2).
5504	FixedVectorType *OutputVectorTy = FixedVectorType::get(
5505	ElementType: cast<FixedVectorType>(Val: I.getArgOperand(i: `0`)->getType())->getElementType(),
5506	NumElts: cast<FixedVectorType>(Val: I.getArgOperand(i: `0`)->getType())->getNumElements() *
5507	(numArgOperands - skipTrailingOperands));
5508	Type *OutputShadowTy = getShadowTy(OrigTy: OutputVectorTy);
5509
5510	if (useLane)
5511	ShadowArgs.append(NumInputs: `1`,
5512	Elt: I.getArgOperand(i: numArgOperands - skipTrailingOperands));
5513
5514	Value OutputShadowPtr, OutputOriginPtr;
5515	// AArch64 NEON does not need alignment (unless OS requires it)
5516	std::tie(args&: OutputShadowPtr, args&: OutputOriginPtr) = getShadowOriginPtr(
5517	Addr, IRB, ShadowTy: OutputShadowTy, Alignment: Align (`1`), /isStore/ true);
5518	ShadowArgs.append(NumInputs: `1`, Elt: OutputShadowPtr);
5519
5520	CallInst *CI =
5521	IRB.CreateIntrinsic(RetTy: IRB.getVoidTy(), ID: I.getIntrinsicID(), Args: ShadowArgs);
5522	setShadow(V: &I, SV: CI);
5523
5524	if (MS.TrackOrigins) {
5525	// TODO: if we modelled the vst instruction more precisely, we could*
5526	// more accurately track the origins (e.g., if both inputs are
5527	// uninitialized for vst2, we currently blame the second input, even
5528	// though part of the output depends only on the first input).
5529	//
5530	// This is particularly imprecise for vst{2,3,4}lane, since only one
5531	// lane of each input is actually copied to the output.
5532	OriginCombiner OC(this, IRB);
5533	for (int i = `0`; i < numArgOperands - skipTrailingOperands; i++)
5534	OC.Add(V: I.getArgOperand(i));
5535
5536	const DataLayout &DL = F.getDataLayout();
5537	OC.DoneAndStoreOrigin(TS: DL.getTypeStoreSize(Ty: OutputVectorTy),
5538	OriginPtr: OutputOriginPtr);
5539	}
5540	}
5541
5542	// Integer matrix multiplication:
5543	// - <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8
5544	// (<4 x i32> %R, <16 x i8> %X, <16 x i8> %Y)
5545	// - <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8
5546	// (<4 x i32> %R, <16 x i8> %X, <16 x i8> %Y)
5547	// - <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8
5548	// (<4 x i32> %R, <16 x i8> %X, <16 x i8> %Y)
5549	//
5550	// Note:
5551	// - <4 x i32> is a 2x2 matrix
5552	// - <16 x i8> %X and %Y are 2x8 and 8x2 matrices respectively
5553	//
5554	// 2x8 %X 8x2 %Y
5555	// [ X01 X02 X03 X04 X05 X06 X07 X08 ] [ Y01 Y09 ]
5556	// [ X09 X10 X11 X12 X13 X14 X15 X16 ] x [ Y02 Y10 ]
5557	// [ Y03 Y11 ]
5558	// [ Y04 Y12 ]
5559	// [ Y05 Y13 ]
5560	// [ Y06 Y14 ]
5561	// [ Y07 Y15 ]
5562	// [ Y08 Y16 ]
5563	//
5564	// The general shadow propagation approach is:
5565	// 1) get the shadows of the input matrices %X and %Y
5566	// 2) change the shadow values to 0x1 if the corresponding value is fully
5567	// initialized, and 0x0 otherwise
5568	// 3) perform a matrix multiplication on the shadows of %X and %Y. The output
5569	// will be a 2x2 matrix; for each element, a value of 0x8 means all the
5570	// corresponding inputs were clean.
5571	// 4) blend in the shadow of %R
5572	//
5573	// TODO: consider allowing multiplication of zero with an uninitialized value
5574	// to result in an initialized value.
5575	//
5576	// Floating-point matrix multiplication:
5577	// - <4 x float> @llvm.aarch64.neon.bfmmla
5578	// (<4 x float> %R, <8 x bfloat> %X, <8 x bfloat> %Y)
5579	// %X and %Y are 2x4 and 4x2 matrices respectively
5580	//
5581	// Although there are half as many elements of %X and %Y compared to the
5582	// integer case, each element is twice the bit-width. Thus, we can reuse the
5583	// shadow propagation logic if we cast the shadows to the same type as the
5584	// integer case, and apply ummla to the shadows:
5585	//
5586	// 2x4 %X 4x2 %Y
5587	// [ A01:A02 A03:A04 A05:A06 A07:A08 ] [ B01:B02 B09:B10 ]
5588	// [ A09:A10 A11:A12 A13:A14 A15:A16 ] x [ B03:B04 B11:B12 ]
5589	// [ B05:B06 B13:B14 ]
5590	// [ B07:B08 B15:B16 ]
5591	//
5592	// For example, consider multiplying the first row of %X with the first
5593	// column of Y. We want to know if
5594	// A01:A02B01:B02 + A03:A04B03:B04 + A05:A06B06:B06 + A07:A08B07:B08 is
5595	// fully initialized, which will be true if and only if (A01, A02, ..., A08)
5596	// and (B01, B02, ..., B08) are each fully initialized. This latter condition
5597	// is equivalent to what is tested by the instrumentation for the integer
5598	// form.
5599	void handleNEONMatrixMultiply(IntrinsicInst &I) {
5600	IRBuilder<> IRB(&I);
5601
5602	assert(I.arg_size() == `3`);
5603	Value *R = I.getArgOperand(i: `0`);
5604	Value *A = I.getArgOperand(i: `1`);
5605	Value *B = I.getArgOperand(i: `2`);
5606
5607	assert(I.getType() == R->getType());
5608
5609	assert(isa<FixedVectorType>(R->getType()));
5610	assert(isa<FixedVectorType>(A->getType()));
5611	assert(isa<FixedVectorType>(B->getType()));
5612
5613	[[maybe_unused]] FixedVectorType *RTy = cast<FixedVectorType>(Val: R->getType());
5614	[[maybe_unused]] FixedVectorType *ATy = cast<FixedVectorType>(Val: A->getType());
5615	[[maybe_unused]] FixedVectorType *BTy = cast<FixedVectorType>(Val: B->getType());
5616
5617	Value *ShadowR = getShadow(I: &I, i: `0`);
5618	Value *ShadowA = getShadow(I: &I, i: `1`);
5619	Value *ShadowB = getShadow(I: &I, i: `2`);
5620
5621	// We will use ummla to compute the shadow. These are the types it expects.
5622	// These are also the types of the corresponding shadows.
5623	FixedVectorType *ExpectedRTy =
5624	FixedVectorType::get(ElementType: IntegerType::get(C&: *MS.C, NumBits: `32`), NumElts: `4`);
5625	FixedVectorType *ExpectedATy =
5626	FixedVectorType::get(ElementType: IntegerType::get(C&: *MS.C, NumBits: `8`), NumElts: `16`);
5627	FixedVectorType *ExpectedBTy =
5628	FixedVectorType::get(ElementType: IntegerType::get(C&: *MS.C, NumBits: `8`), NumElts: `16`);
5629
5630	if (RTy->getElementType()->isIntegerTy()) {
5631	// Types of R and A/B are not identical e.g., <4 x i32> %R, <16 x i8> %A
5632	assert(ATy->getElementType()->isIntegerTy());
5633
5634	assert(RTy == ExpectedRTy);
5635	assert(ATy == ExpectedATy);
5636	assert(BTy == ExpectedBTy);
5637	} else {
5638	assert(ATy->getElementType()->isFloatingPointTy());
5639	assert(BTy->getElementType()->isFloatingPointTy());
5640
5641	// Technically, what we care about is that:
5642	// getShadowTy(RTy)->canLosslesslyBitCastTo(ExpectedRTy)) etc.
5643	// but that is equivalent.
5644	assert(RTy->canLosslesslyBitCastTo(ExpectedRTy));
5645	assert(ATy->canLosslesslyBitCastTo(ExpectedATy));
5646	assert(BTy->canLosslesslyBitCastTo(ExpectedBTy));
5647
5648	ShadowA = IRB.CreateBitCast(V: ShadowA, DestTy: getShadowTy(OrigTy: ExpectedATy));
5649	ShadowB = IRB.CreateBitCast(V: ShadowB, DestTy: getShadowTy(OrigTy: ExpectedBTy));
5650	}
5651	assert(ATy->getElementType() == BTy->getElementType());
5652
5653	// From this point on, use Expected{R,A,B}Type.
5654
5655	// If the value is fully initialized, the shadow will be 000...001.
5656	// Otherwise, the shadow will be all zero.
5657	// (This is the opposite of how we typically handle shadows.)
5658	ShadowA =
5659	IRB.CreateZExt(V: IRB.CreateICmpEQ(LHS: ShadowA, RHS: getCleanShadow(OrigTy: ExpectedATy)),
5660	DestTy: getShadowTy(OrigTy: ExpectedATy));
5661	ShadowB =
5662	IRB.CreateZExt(V: IRB.CreateICmpEQ(LHS: ShadowB, RHS: getCleanShadow(OrigTy: ExpectedBTy)),
5663	DestTy: getShadowTy(OrigTy: ExpectedBTy));
5664
5665	Value *ShadowAB =
5666	IRB.CreateIntrinsic(RetTy: ExpectedRTy, ID: Intrinsic::aarch64_neon_ummla,
5667	Args: {getCleanShadow(OrigTy: ExpectedRTy), ShadowA, ShadowB});
5668
5669	// ummla multiplies a 2x8 matrix with an 8x2 matrix. If all entries of the
5670	// input matrices are equal to 0x1, all entries of the output matrix will
5671	// be 0x8.
5672	Value *FullyInit = ConstantVector::getSplat(
5673	EC: ExpectedRTy->getElementCount(),
5674	Elt: ConstantInt::get(Ty: ExpectedRTy->getElementType(), V: `0x8`));
5675
5676	ShadowAB = IRB.CreateSExt(V: IRB.CreateICmpNE(LHS: ShadowAB, RHS: FullyInit),
5677	DestTy: ShadowAB->getType());
5678
5679	ShadowR = IRB.CreateSExt(
5680	V: IRB.CreateICmpNE(LHS: ShadowR, RHS: getCleanShadow(OrigTy: ExpectedRTy)), DestTy: ExpectedRTy);
5681
5682	setShadow(V: &I, SV: IRB.CreateOr(LHS: ShadowAB, RHS: ShadowR));
5683	setOriginForNaryOp(I);
5684	}
5685
5686	/// Handle intrinsics by applying the intrinsic to the shadows.
5687	///
5688	/// The trailing arguments are passed verbatim to the intrinsic, though any
5689	/// uninitialized trailing arguments can also taint the shadow e.g., for an
5690	/// intrinsic with one trailing verbatim argument:
5691	/// out = intrinsic(var1, var2, opType)
5692	/// we compute:
5693	/// shadow[out] =
5694	/// intrinsic(shadow[var1], shadow[var2], opType) \| shadow[opType]
5695	///
5696	/// Typically, shadowIntrinsicID will be specified by the caller to be
5697	/// I.getIntrinsicID(), but the caller can choose to replace it with another
5698	/// intrinsic of the same type.
5699	///
5700	/// CAUTION: this assumes that the intrinsic will handle arbitrary
5701	/// bit-patterns (for example, if the intrinsic accepts floats for
5702	/// var1, we require that it doesn't care if inputs are NaNs).
5703	///
5704	/// For example, this can be applied to the Arm NEON vector table intrinsics
5705	/// (tbl{1,2,3,4}).
5706	///
5707	/// The origin is approximated using setOriginForNaryOp.
5708	void handleIntrinsicByApplyingToShadow(IntrinsicInst &I,
5709	Intrinsic::ID shadowIntrinsicID,
5710	unsigned int trailingVerbatimArgs) {
5711	IRBuilder<> IRB(&I);
5712
5713	assert(trailingVerbatimArgs < I.arg_size());
5714
5715	SmallVector<Value *, `8`> ShadowArgs;
5716	// Don't use getNumOperands() because it includes the callee
5717	for (unsigned int i = `0`; i < I.arg_size() - trailingVerbatimArgs; i++) {
5718	Value *Shadow = getShadow(I: &I, i);
5719
5720	// Shadows are integer-ish types but some intrinsics require a
5721	// different (e.g., floating-point) type.
5722	ShadowArgs.push_back(
5723	Elt: IRB.CreateBitCast(V: Shadow, DestTy: I.getArgOperand(i)->getType()));
5724	}
5725
5726	for (unsigned int i = I.arg_size() - trailingVerbatimArgs; i < I.arg_size();
5727	i++) {
5728	Value *Arg = I.getArgOperand(i);
5729	ShadowArgs.push_back(Elt: Arg);
5730	}
5731
5732	CallInst *CI =
5733	IRB.CreateIntrinsic(RetTy: I.getType(), ID: shadowIntrinsicID, Args: ShadowArgs);
5734	Value *CombinedShadow = CI;
5735
5736	// Combine the computed shadow with the shadow of trailing args
5737	for (unsigned int i = I.arg_size() - trailingVerbatimArgs; i < I.arg_size();
5738	i++) {
5739	Value *Shadow =
5740	CreateShadowCast(IRB, V: getShadow(I: &I, i), dstTy: CombinedShadow->getType());
5741	CombinedShadow = IRB.CreateOr(LHS: Shadow, RHS: CombinedShadow, Name: "_msprop");
5742	}
5743
5744	setShadow(V: &I, SV: IRB.CreateBitCast(V: CombinedShadow, DestTy: getShadowTy(V: &I)));
5745
5746	setOriginForNaryOp(I);
5747	}
5748
5749	// Approximation only
5750	//
5751	// e.g., <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
5752	void handleNEONVectorMultiplyIntrinsic(IntrinsicInst &I) {
5753	assert(I.arg_size() == `2`);
5754
5755	handleShadowOr(I);
5756	}
5757
5758	bool maybeHandleCrossPlatformIntrinsic(IntrinsicInst &I) {
5759	switch (I.getIntrinsicID()) {
5760	case Intrinsic::uadd_with_overflow:
5761	case Intrinsic::sadd_with_overflow:
5762	case Intrinsic::usub_with_overflow:
5763	case Intrinsic::ssub_with_overflow:
5764	case Intrinsic::umul_with_overflow:
5765	case Intrinsic::smul_with_overflow:
5766	handleArithmeticWithOverflow(I);
5767	break;
5768	case Intrinsic::abs:
5769	handleAbsIntrinsic(I);
5770	break;
5771	case Intrinsic::bitreverse:
5772	handleIntrinsicByApplyingToShadow(I, shadowIntrinsicID: I.getIntrinsicID(),
5773	/trailingVerbatimArgs/ `0`);
5774	break;
5775	case Intrinsic::is_fpclass:
5776	handleIsFpClass(I);
5777	break;
5778	case Intrinsic::lifetime_start:
5779	handleLifetimeStart(I);
5780	break;
5781	case Intrinsic::launder_invariant_group:
5782	case Intrinsic::strip_invariant_group:
5783	handleInvariantGroup(I);
5784	break;
5785	case Intrinsic::bswap:
5786	handleBswap(I);
5787	break;
5788	case Intrinsic::ctlz:
5789	case Intrinsic::cttz:
5790	handleCountLeadingTrailingZeros(I);
5791	break;
5792	case Intrinsic::masked_compressstore:
5793	handleMaskedCompressStore(I);
5794	break;
5795	case Intrinsic::masked_expandload:
5796	handleMaskedExpandLoad(I);
5797	break;
5798	case Intrinsic::masked_gather:
5799	handleMaskedGather(I);
5800	break;
5801	case Intrinsic::masked_scatter:
5802	handleMaskedScatter(I);
5803	break;
5804	case Intrinsic::masked_store:
5805	handleMaskedStore(I);
5806	break;
5807	case Intrinsic::masked_load:
5808	handleMaskedLoad(I);
5809	break;
5810	case Intrinsic::vector_reduce_and:
5811	handleVectorReduceAndIntrinsic(I);
5812	break;
5813	case Intrinsic::vector_reduce_or:
5814	handleVectorReduceOrIntrinsic(I);
5815	break;
5816
5817	case Intrinsic::vector_reduce_add:
5818	case Intrinsic::vector_reduce_xor:
5819	case Intrinsic::vector_reduce_mul:
5820	// Signed/Unsigned Min/Max
5821	// TODO: handling similarly to AND/OR may be more precise.
5822	case Intrinsic::vector_reduce_smax:
5823	case Intrinsic::vector_reduce_smin:
5824	case Intrinsic::vector_reduce_umax:
5825	case Intrinsic::vector_reduce_umin:
5826	// TODO: this has no false positives, but arguably we should check that all
5827	// the bits are initialized.
5828	case Intrinsic::vector_reduce_fmax:
5829	case Intrinsic::vector_reduce_fmin:
5830	handleVectorReduceIntrinsic(I, /AllowShadowCast=/false);
5831	break;
5832
5833	case Intrinsic::vector_reduce_fadd:
5834	case Intrinsic::vector_reduce_fmul:
5835	handleVectorReduceWithStarterIntrinsic(I);
5836	break;
5837
5838	case Intrinsic::scmp:
5839	case Intrinsic::ucmp: {
5840	handleShadowOr(I);
5841	break;
5842	}
5843
5844	case Intrinsic::fshl:
5845	case Intrinsic::fshr:
5846	handleFunnelShift(I);
5847	break;
5848
5849	case Intrinsic::is_constant:
5850	// The result of llvm.is.constant() is always defined.
5851	setShadow(V: &I, SV: getCleanShadow(V: &I));
5852	setOrigin(V: &I, Origin: getCleanOrigin());
5853	break;
5854
5855	default:
5856	return false;
5857	}
5858
5859	return true;
5860	}
5861
5862	bool maybeHandleX86SIMDIntrinsic(IntrinsicInst &I) {
5863	switch (I.getIntrinsicID()) {
5864	case Intrinsic::x86_sse_stmxcsr:
5865	handleStmxcsr(I);
5866	break;
5867	case Intrinsic::x86_sse_ldmxcsr:
5868	handleLdmxcsr(I);
5869	break;
5870
5871	// Convert Scalar Double Precision Floating-Point Value
5872	// to Unsigned Doubleword Integer
5873	// etc.
5874	case Intrinsic::x86_avx512_vcvtsd2usi64:
5875	case Intrinsic::x86_avx512_vcvtsd2usi32:
5876	case Intrinsic::x86_avx512_vcvtss2usi64:
5877	case Intrinsic::x86_avx512_vcvtss2usi32:
5878	case Intrinsic::x86_avx512_cvttss2usi64:
5879	case Intrinsic::x86_avx512_cvttss2usi:
5880	case Intrinsic::x86_avx512_cvttsd2usi64:
5881	case Intrinsic::x86_avx512_cvttsd2usi:
5882	case Intrinsic::x86_avx512_cvtusi2ss:
5883	case Intrinsic::x86_avx512_cvtusi642sd:
5884	case Intrinsic::x86_avx512_cvtusi642ss:
5885	handleSSEVectorConvertIntrinsic(I, NumUsedElements: `1`, HasRoundingMode: true);
5886	break;
5887	case Intrinsic::x86_sse2_cvtsd2si64:
5888	case Intrinsic::x86_sse2_cvtsd2si:
5889	case Intrinsic::x86_sse2_cvtsd2ss:
5890	case Intrinsic::x86_sse2_cvttsd2si64:
5891	case Intrinsic::x86_sse2_cvttsd2si:
5892	case Intrinsic::x86_sse_cvtss2si64:
5893	case Intrinsic::x86_sse_cvtss2si:
5894	case Intrinsic::x86_sse_cvttss2si64:
5895	case Intrinsic::x86_sse_cvttss2si:
5896	handleSSEVectorConvertIntrinsic(I, NumUsedElements: `1`);
5897	break;
5898	case Intrinsic::x86_sse_cvtps2pi:
5899	case Intrinsic::x86_sse_cvttps2pi:
5900	handleSSEVectorConvertIntrinsic(I, NumUsedElements: `2`);
5901	break;
5902
5903	// TODO:
5904	// <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>)
5905	// <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64>)
5906	// <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, <1 x i64>)
5907
5908	case Intrinsic::x86_vcvtps2ph_128:
5909	case Intrinsic::x86_vcvtps2ph_256: {
5910	handleSSEVectorConvertIntrinsicByProp(I, /HasRoundingMode=/true);
5911	break;
5912	}
5913
5914	// Convert Packed Single Precision Floating-Point Values
5915	// to Packed Signed Doubleword Integer Values
5916	//
5917	// <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512
5918	// (<16 x float>, <16 x i32>, i16, i32)
5919	case Intrinsic::x86_avx512_mask_cvtps2dq_512:
5920	handleAVX512VectorConvertFPToInt(I, /LastMask=/false);
5921	break;
5922
5923	// Convert Packed Double Precision Floating-Point Values
5924	// to Packed Single Precision Floating-Point Values
5925	case Intrinsic::x86_sse2_cvtpd2ps:
5926	case Intrinsic::x86_sse2_cvtps2dq:
5927	case Intrinsic::x86_sse2_cvtpd2dq:
5928	case Intrinsic::x86_sse2_cvttps2dq:
5929	case Intrinsic::x86_sse2_cvttpd2dq:
5930	case Intrinsic::x86_avx_cvt_pd2_ps_256:
5931	case Intrinsic::x86_avx_cvt_ps2dq_256:
5932	case Intrinsic::x86_avx_cvt_pd2dq_256:
5933	case Intrinsic::x86_avx_cvtt_ps2dq_256:
5934	case Intrinsic::x86_avx_cvtt_pd2dq_256: {
5935	handleSSEVectorConvertIntrinsicByProp(I, /HasRoundingMode=/false);
5936	break;
5937	}
5938
5939	// Convert Single-Precision FP Value to 16-bit FP Value
5940	// <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512
5941	// (<16 x float>, i32, <16 x i16>, i16)
5942	// <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128
5943	// (<4 x float>, i32, <8 x i16>, i8)
5944	// <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256
5945	// (<8 x float>, i32, <8 x i16>, i8)
5946	case Intrinsic::x86_avx512_mask_vcvtps2ph_512:
5947	case Intrinsic::x86_avx512_mask_vcvtps2ph_256:
5948	case Intrinsic::x86_avx512_mask_vcvtps2ph_128:
5949	handleAVX512VectorConvertFPToInt(I, /LastMask=/true);
5950	break;
5951
5952	// Shift Packed Data (Left Logical, Right Arithmetic, Right Logical)
5953	case Intrinsic::x86_avx512_psll_w_512:
5954	case Intrinsic::x86_avx512_psll_d_512:
5955	case Intrinsic::x86_avx512_psll_q_512:
5956	case Intrinsic::x86_avx512_pslli_w_512:
5957	case Intrinsic::x86_avx512_pslli_d_512:
5958	case Intrinsic::x86_avx512_pslli_q_512:
5959	case Intrinsic::x86_avx512_psrl_w_512:
5960	case Intrinsic::x86_avx512_psrl_d_512:
5961	case Intrinsic::x86_avx512_psrl_q_512:
5962	case Intrinsic::x86_avx512_psra_w_512:
5963	case Intrinsic::x86_avx512_psra_d_512:
5964	case Intrinsic::x86_avx512_psra_q_512:
5965	case Intrinsic::x86_avx512_psrli_w_512:
5966	case Intrinsic::x86_avx512_psrli_d_512:
5967	case Intrinsic::x86_avx512_psrli_q_512:
5968	case Intrinsic::x86_avx512_psrai_w_512:
5969	case Intrinsic::x86_avx512_psrai_d_512:
5970	case Intrinsic::x86_avx512_psrai_q_512:
5971	case Intrinsic::x86_avx512_psra_q_256:
5972	case Intrinsic::x86_avx512_psra_q_128:
5973	case Intrinsic::x86_avx512_psrai_q_256:
5974	case Intrinsic::x86_avx512_psrai_q_128:
5975	case Intrinsic::x86_avx2_psll_w:
5976	case Intrinsic::x86_avx2_psll_d:
5977	case Intrinsic::x86_avx2_psll_q:
5978	case Intrinsic::x86_avx2_pslli_w:
5979	case Intrinsic::x86_avx2_pslli_d:
5980	case Intrinsic::x86_avx2_pslli_q:
5981	case Intrinsic::x86_avx2_psrl_w:
5982	case Intrinsic::x86_avx2_psrl_d:
5983	case Intrinsic::x86_avx2_psrl_q:
5984	case Intrinsic::x86_avx2_psra_w:
5985	case Intrinsic::x86_avx2_psra_d:
5986	case Intrinsic::x86_avx2_psrli_w:
5987	case Intrinsic::x86_avx2_psrli_d:
5988	case Intrinsic::x86_avx2_psrli_q:
5989	case Intrinsic::x86_avx2_psrai_w:
5990	case Intrinsic::x86_avx2_psrai_d:
5991	case Intrinsic::x86_sse2_psll_w:
5992	case Intrinsic::x86_sse2_psll_d:
5993	case Intrinsic::x86_sse2_psll_q:
5994	case Intrinsic::x86_sse2_pslli_w:
5995	case Intrinsic::x86_sse2_pslli_d:
5996	case Intrinsic::x86_sse2_pslli_q:
5997	case Intrinsic::x86_sse2_psrl_w:
5998	case Intrinsic::x86_sse2_psrl_d:
5999	case Intrinsic::x86_sse2_psrl_q:
6000	case Intrinsic::x86_sse2_psra_w:
6001	case Intrinsic::x86_sse2_psra_d:
6002	case Intrinsic::x86_sse2_psrli_w:
6003	case Intrinsic::x86_sse2_psrli_d:
6004	case Intrinsic::x86_sse2_psrli_q:
6005	case Intrinsic::x86_sse2_psrai_w:
6006	case Intrinsic::x86_sse2_psrai_d:
6007	case Intrinsic::x86_mmx_psll_w:
6008	case Intrinsic::x86_mmx_psll_d:
6009	case Intrinsic::x86_mmx_psll_q:
6010	case Intrinsic::x86_mmx_pslli_w:
6011	case Intrinsic::x86_mmx_pslli_d:
6012	case Intrinsic::x86_mmx_pslli_q:
6013	case Intrinsic::x86_mmx_psrl_w:
6014	case Intrinsic::x86_mmx_psrl_d:
6015	case Intrinsic::x86_mmx_psrl_q:
6016	case Intrinsic::x86_mmx_psra_w:
6017	case Intrinsic::x86_mmx_psra_d:
6018	case Intrinsic::x86_mmx_psrli_w:
6019	case Intrinsic::x86_mmx_psrli_d:
6020	case Intrinsic::x86_mmx_psrli_q:
6021	case Intrinsic::x86_mmx_psrai_w:
6022	case Intrinsic::x86_mmx_psrai_d:
6023	handleVectorShiftIntrinsic(I, / Variable / false);
6024	break;
6025	case Intrinsic::x86_avx2_psllv_d:
6026	case Intrinsic::x86_avx2_psllv_d_256:
6027	case Intrinsic::x86_avx512_psllv_d_512:
6028	case Intrinsic::x86_avx2_psllv_q:
6029	case Intrinsic::x86_avx2_psllv_q_256:
6030	case Intrinsic::x86_avx512_psllv_q_512:
6031	case Intrinsic::x86_avx2_psrlv_d:
6032	case Intrinsic::x86_avx2_psrlv_d_256:
6033	case Intrinsic::x86_avx512_psrlv_d_512:
6034	case Intrinsic::x86_avx2_psrlv_q:
6035	case Intrinsic::x86_avx2_psrlv_q_256:
6036	case Intrinsic::x86_avx512_psrlv_q_512:
6037	case Intrinsic::x86_avx2_psrav_d:
6038	case Intrinsic::x86_avx2_psrav_d_256:
6039	case Intrinsic::x86_avx512_psrav_d_512:
6040	case Intrinsic::x86_avx512_psrav_q_128:
6041	case Intrinsic::x86_avx512_psrav_q_256:
6042	case Intrinsic::x86_avx512_psrav_q_512:
6043	handleVectorShiftIntrinsic(I, / Variable / true);
6044	break;
6045
6046	// Pack with Signed/Unsigned Saturation
6047	case Intrinsic::x86_sse2_packsswb_128:
6048	case Intrinsic::x86_sse2_packssdw_128:
6049	case Intrinsic::x86_sse2_packuswb_128:
6050	case Intrinsic::x86_sse41_packusdw:
6051	case Intrinsic::x86_avx2_packsswb:
6052	case Intrinsic::x86_avx2_packssdw:
6053	case Intrinsic::x86_avx2_packuswb:
6054	case Intrinsic::x86_avx2_packusdw:
6055	// e.g., <64 x i8> @llvm.x86.avx512.packsswb.512
6056	// (<32 x i16> %a, <32 x i16> %b)
6057	// <32 x i16> @llvm.x86.avx512.packssdw.512
6058	// (<16 x i32> %a, <16 x i32> %b)
6059	// Note: AVX512 masked variants are auto-upgraded by LLVM.
6060	case Intrinsic::x86_avx512_packsswb_512:
6061	case Intrinsic::x86_avx512_packssdw_512:
6062	case Intrinsic::x86_avx512_packuswb_512:
6063	case Intrinsic::x86_avx512_packusdw_512:
6064	handleVectorPackIntrinsic(I);
6065	break;
6066
6067	case Intrinsic::x86_sse41_pblendvb:
6068	case Intrinsic::x86_sse41_blendvpd:
6069	case Intrinsic::x86_sse41_blendvps:
6070	case Intrinsic::x86_avx_blendv_pd_256:
6071	case Intrinsic::x86_avx_blendv_ps_256:
6072	case Intrinsic::x86_avx2_pblendvb:
6073	handleBlendvIntrinsic(I);
6074	break;
6075
6076	case Intrinsic::x86_avx_dp_ps_256:
6077	case Intrinsic::x86_sse41_dppd:
6078	case Intrinsic::x86_sse41_dpps:
6079	handleDppIntrinsic(I);
6080	break;
6081
6082	case Intrinsic::x86_mmx_packsswb:
6083	case Intrinsic::x86_mmx_packuswb:
6084	handleVectorPackIntrinsic(I, MMXEltSizeInBits: `16`);
6085	break;
6086
6087	case Intrinsic::x86_mmx_packssdw:
6088	handleVectorPackIntrinsic(I, MMXEltSizeInBits: `32`);
6089	break;
6090
6091	case Intrinsic::x86_mmx_psad_bw:
6092	handleVectorSadIntrinsic(I, IsMMX: true);
6093	break;
6094	case Intrinsic::x86_sse2_psad_bw:
6095	case Intrinsic::x86_avx2_psad_bw:
6096	handleVectorSadIntrinsic(I);
6097	break;
6098
6099	// Multiply and Add Packed Words
6100	// < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)
6101	// < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>)
6102	// <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
6103	//
6104	// Multiply and Add Packed Signed and Unsigned Bytes
6105	// < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>)
6106	// <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>)
6107	// <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
6108	//
6109	// These intrinsics are auto-upgraded into non-masked forms:
6110	// < 4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128
6111	// (<8 x i16>, <8 x i16>, <4 x i32>, i8)
6112	// < 8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256
6113	// (<16 x i16>, <16 x i16>, <8 x i32>, i8)
6114	// <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512
6115	// (<32 x i16>, <32 x i16>, <16 x i32>, i16)
6116	// < 8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128
6117	// (<16 x i8>, <16 x i8>, <8 x i16>, i8)
6118	// <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256
6119	// (<32 x i8>, <32 x i8>, <16 x i16>, i16)
6120	// <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512
6121	// (<64 x i8>, <64 x i8>, <32 x i16>, i32)
6122	case Intrinsic::x86_sse2_pmadd_wd:
6123	case Intrinsic::x86_avx2_pmadd_wd:
6124	case Intrinsic::x86_avx512_pmaddw_d_512:
6125	case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
6126	case Intrinsic::x86_avx2_pmadd_ub_sw:
6127	case Intrinsic::x86_avx512_pmaddubs_w_512:
6128	handleVectorDotProductIntrinsic(I, /ReductionFactor=/`2`,
6129	/ZeroPurifies=/true,
6130	/EltSizeInBits=/`0`,
6131	/Lanes=/kBothLanes);
6132	break;
6133
6134	// <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>)
6135	case Intrinsic::x86_ssse3_pmadd_ub_sw:
6136	handleVectorDotProductIntrinsic(I, /ReductionFactor=/`2`,
6137	/ZeroPurifies=/true,
6138	/EltSizeInBits=/`8`,
6139	/Lanes=/kBothLanes);
6140	break;
6141
6142	// <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>)
6143	case Intrinsic::x86_mmx_pmadd_wd:
6144	handleVectorDotProductIntrinsic(I, /ReductionFactor=/`2`,
6145	/ZeroPurifies=/true,
6146	/EltSizeInBits=/`16`,
6147	/Lanes=/kBothLanes);
6148	break;
6149
6150	// BFloat16 multiply-add to single-precision
6151	// <4 x float> llvm.aarch64.neon.bfmlalt
6152	// (<4 x float>, <8 x bfloat>, <8 x bfloat>)
6153	case Intrinsic::aarch64_neon_bfmlalt:
6154	handleVectorDotProductIntrinsic(I, /ReductionFactor=/`2`,
6155	/ZeroPurifies=/false,
6156	/EltSizeInBits=/`0`,
6157	/Lanes=/kOddLanes);
6158	break;
6159
6160	// <4 x float> llvm.aarch64.neon.bfmlalb
6161	// (<4 x float>, <8 x bfloat>, <8 x bfloat>)
6162	case Intrinsic::aarch64_neon_bfmlalb:
6163	handleVectorDotProductIntrinsic(I, /ReductionFactor=/`2`,
6164	/ZeroPurifies=/false,
6165	/EltSizeInBits=/`0`,
6166	/Lanes=/kEvenLanes);
6167	break;
6168
6169	// AVX Vector Neural Network Instructions: bytes
6170	//
6171	// Multiply and Add Signed Bytes
6172	// < 4 x i32> @llvm.x86.avx2.vpdpbssd.128
6173	// (< 4 x i32>, <16 x i8>, <16 x i8>)
6174	// < 8 x i32> @llvm.x86.avx2.vpdpbssd.256
6175	// (< 8 x i32>, <32 x i8>, <32 x i8>)
6176	// <16 x i32> @llvm.x86.avx10.vpdpbssd.512
6177	// (<16 x i32>, <64 x i8>, <64 x i8>)
6178	//
6179	// Multiply and Add Signed Bytes With Saturation
6180	// < 4 x i32> @llvm.x86.avx2.vpdpbssds.128
6181	// (< 4 x i32>, <16 x i8>, <16 x i8>)
6182	// < 8 x i32> @llvm.x86.avx2.vpdpbssds.256
6183	// (< 8 x i32>, <32 x i8>, <32 x i8>)
6184	// <16 x i32> @llvm.x86.avx10.vpdpbssds.512
6185	// (<16 x i32>, <64 x i8>, <64 x i8>)
6186	//
6187	// Multiply and Add Signed and Unsigned Bytes
6188	// < 4 x i32> @llvm.x86.avx2.vpdpbsud.128
6189	// (< 4 x i32>, <16 x i8>, <16 x i8>)
6190	// < 8 x i32> @llvm.x86.avx2.vpdpbsud.256
6191	// (< 8 x i32>, <32 x i8>, <32 x i8>)
6192	// <16 x i32> @llvm.x86.avx10.vpdpbsud.512
6193	// (<16 x i32>, <64 x i8>, <64 x i8>)
6194	//
6195	// Multiply and Add Signed and Unsigned Bytes With Saturation
6196	// < 4 x i32> @llvm.x86.avx2.vpdpbsuds.128
6197	// (< 4 x i32>, <16 x i8>, <16 x i8>)
6198	// < 8 x i32> @llvm.x86.avx2.vpdpbsuds.256
6199	// (< 8 x i32>, <32 x i8>, <32 x i8>)
6200	// <16 x i32> @llvm.x86.avx10.vpdpbsuds.512
6201	// (<16 x i32>, <64 x i8>, <64 x i8>)
6202	//
6203	// Multiply and Add Unsigned and Signed Bytes
6204	// < 4 x i32> @llvm.x86.avx512.vpdpbusd.128
6205	// (< 4 x i32>, <16 x i8>, <16 x i8>)
6206	// < 8 x i32> @llvm.x86.avx512.vpdpbusd.256
6207	// (< 8 x i32>, <32 x i8>, <32 x i8>)
6208	// <16 x i32> @llvm.x86.avx512.vpdpbusd.512
6209	// (<16 x i32>, <64 x i8>, <64 x i8>)
6210	//
6211	// Multiply and Add Unsigned and Signed Bytes With Saturation
6212	// < 4 x i32> @llvm.x86.avx512.vpdpbusds.128
6213	// (< 4 x i32>, <16 x i8>, <16 x i8>)
6214	// < 8 x i32> @llvm.x86.avx512.vpdpbusds.256
6215	// (< 8 x i32>, <32 x i8>, <32 x i8>)
6216	// <16 x i32> @llvm.x86.avx512.vpdpbusds.512
6217	// (<16 x i32>, <64 x i8>, <64 x i8>)
6218	//
6219	// Multiply and Add Unsigned Bytes
6220	// < 4 x i32> @llvm.x86.avx2.vpdpbuud.128
6221	// (< 4 x i32>, <16 x i8>, <16 x i8>)
6222	// < 8 x i32> @llvm.x86.avx2.vpdpbuud.256
6223	// (< 8 x i32>, <32 x i8>, <32 x i8>)
6224	// <16 x i32> @llvm.x86.avx10.vpdpbuud.512
6225	// (<16 x i32>, <64 x i8>, <64 x i8>)
6226	//
6227	// Multiply and Add Unsigned Bytes With Saturation
6228	// < 4 x i32> @llvm.x86.avx2.vpdpbuuds.128
6229	// (< 4 x i32>, <16 x i8>, <16 x i8>)
6230	// < 8 x i32> @llvm.x86.avx2.vpdpbuuds.256
6231	// (< 8 x i32>, <32 x i8>, <32 x i8>)
6232	// <16 x i32> @llvm.x86.avx10.vpdpbuuds.512
6233	// (<16 x i32>, <64 x i8>, <64 x i8>)
6234	//
6235	// These intrinsics are auto-upgraded into non-masked forms:
6236	// <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128
6237	// (<4 x i32>, <16 x i8>, <16 x i8>, i8)
6238	// <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128
6239	// (<4 x i32>, <16 x i8>, <16 x i8>, i8)
6240	// <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256
6241	// (<8 x i32>, <32 x i8>, <32 x i8>, i8)
6242	// <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256
6243	// (<8 x i32>, <32 x i8>, <32 x i8>, i8)
6244	// <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512
6245	// (<16 x i32>, <64 x i8>, <64 x i8>, i16)
6246	// <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512
6247	// (<16 x i32>, <64 x i8>, <64 x i8>, i16)
6248	//
6249	// <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128
6250	// (<4 x i32>, <16 x i8>, <16 x i8>, i8)
6251	// <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128
6252	// (<4 x i32>, <16 x i8>, <16 x i8>, i8)
6253	// <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256
6254	// (<8 x i32>, <32 x i8>, <32 x i8>, i8)
6255	// <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256
6256	// (<8 x i32>, <32 x i8>, <32 x i8>, i8)
6257	// <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512
6258	// (<16 x i32>, <64 x i8>, <64 x i8>, i16)
6259	// <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512
6260	// (<16 x i32>, <64 x i8>, <64 x i8>, i16)
6261	case Intrinsic::x86_avx512_vpdpbusd_128:
6262	case Intrinsic::x86_avx512_vpdpbusd_256:
6263	case Intrinsic::x86_avx512_vpdpbusd_512:
6264	case Intrinsic::x86_avx512_vpdpbusds_128:
6265	case Intrinsic::x86_avx512_vpdpbusds_256:
6266	case Intrinsic::x86_avx512_vpdpbusds_512:
6267	case Intrinsic::x86_avx2_vpdpbssd_128:
6268	case Intrinsic::x86_avx2_vpdpbssd_256:
6269	case Intrinsic::x86_avx10_vpdpbssd_512:
6270	case Intrinsic::x86_avx2_vpdpbssds_128:
6271	case Intrinsic::x86_avx2_vpdpbssds_256:
6272	case Intrinsic::x86_avx10_vpdpbssds_512:
6273	case Intrinsic::x86_avx2_vpdpbsud_128:
6274	case Intrinsic::x86_avx2_vpdpbsud_256:
6275	case Intrinsic::x86_avx10_vpdpbsud_512:
6276	case Intrinsic::x86_avx2_vpdpbsuds_128:
6277	case Intrinsic::x86_avx2_vpdpbsuds_256:
6278	case Intrinsic::x86_avx10_vpdpbsuds_512:
6279	case Intrinsic::x86_avx2_vpdpbuud_128:
6280	case Intrinsic::x86_avx2_vpdpbuud_256:
6281	case Intrinsic::x86_avx10_vpdpbuud_512:
6282	case Intrinsic::x86_avx2_vpdpbuuds_128:
6283	case Intrinsic::x86_avx2_vpdpbuuds_256:
6284	case Intrinsic::x86_avx10_vpdpbuuds_512:
6285	handleVectorDotProductIntrinsic(I, /ReductionFactor=/`4`,
6286	/ZeroPurifies=/true,
6287	/EltSizeInBits=/`0`,
6288	/Lanes=/kBothLanes);
6289	break;
6290
6291	// AVX Vector Neural Network Instructions: words
6292	//
6293	// Multiply and Add Signed Word Integers
6294	// < 4 x i32> @llvm.x86.avx512.vpdpwssd.128
6295	// (< 4 x i32>, < 8 x i16>, < 8 x i16>)
6296	// < 8 x i32> @llvm.x86.avx512.vpdpwssd.256
6297	// (< 8 x i32>, <16 x i16>, <16 x i16>)
6298	// <16 x i32> @llvm.x86.avx512.vpdpwssd.512
6299	// (<16 x i32>, <32 x i16>, <32 x i16>)
6300	//
6301	// Multiply and Add Signed Word Integers With Saturation
6302	// < 4 x i32> @llvm.x86.avx512.vpdpwssds.128
6303	// (< 4 x i32>, < 8 x i16>, < 8 x i16>)
6304	// < 8 x i32> @llvm.x86.avx512.vpdpwssds.256
6305	// (< 8 x i32>, <16 x i16>, <16 x i16>)
6306	// <16 x i32> @llvm.x86.avx512.vpdpwssds.512
6307	// (<16 x i32>, <32 x i16>, <32 x i16>)
6308	//
6309	// Multiply and Add Signed and Unsigned Word Integers
6310	// < 4 x i32> @llvm.x86.avx2.vpdpwsud.128
6311	// (< 4 x i32>, < 8 x i16>, < 8 x i16>)
6312	// < 8 x i32> @llvm.x86.avx2.vpdpwsud.256
6313	// (< 8 x i32>, <16 x i16>, <16 x i16>)
6314	// <16 x i32> @llvm.x86.avx10.vpdpwsud.512
6315	// (<16 x i32>, <32 x i16>, <32 x i16>)
6316	//
6317	// Multiply and Add Signed and Unsigned Word Integers With Saturation
6318	// < 4 x i32> @llvm.x86.avx2.vpdpwsuds.128
6319	// (< 4 x i32>, < 8 x i16>, < 8 x i16>)
6320	// < 8 x i32> @llvm.x86.avx2.vpdpwsuds.256
6321	// (< 8 x i32>, <16 x i16>, <16 x i16>)
6322	// <16 x i32> @llvm.x86.avx10.vpdpwsuds.512
6323	// (<16 x i32>, <32 x i16>, <32 x i16>)
6324	//
6325	// Multiply and Add Unsigned and Signed Word Integers
6326	// < 4 x i32> @llvm.x86.avx2.vpdpwusd.128
6327	// (< 4 x i32>, < 8 x i16>, < 8 x i16>)
6328	// < 8 x i32> @llvm.x86.avx2.vpdpwusd.256
6329	// (< 8 x i32>, <16 x i16>, <16 x i16>)
6330	// <16 x i32> @llvm.x86.avx10.vpdpwusd.512
6331	// (<16 x i32>, <32 x i16>, <32 x i16>)
6332	//
6333	// Multiply and Add Unsigned and Signed Word Integers With Saturation
6334	// < 4 x i32> @llvm.x86.avx2.vpdpwusds.128
6335	// (< 4 x i32>, < 8 x i16>, < 8 x i16>)
6336	// < 8 x i32> @llvm.x86.avx2.vpdpwusds.256
6337	// (< 8 x i32>, <16 x i16>, <16 x i16>)
6338	// <16 x i32> @llvm.x86.avx10.vpdpwusds.512
6339	// (<16 x i32>, <32 x i16>, <32 x i16>)
6340	//
6341	// Multiply and Add Unsigned and Unsigned Word Integers
6342	// < 4 x i32> @llvm.x86.avx2.vpdpwuud.128
6343	// (< 4 x i32>, < 8 x i16>, < 8 x i16>)
6344	// < 8 x i32> @llvm.x86.avx2.vpdpwuud.256
6345	// (< 8 x i32>, <16 x i16>, <16 x i16>)
6346	// <16 x i32> @llvm.x86.avx10.vpdpwuud.512
6347	// (<16 x i32>, <32 x i16>, <32 x i16>)
6348	//
6349	// Multiply and Add Unsigned and Unsigned Word Integers With Saturation
6350	// < 4 x i32> @llvm.x86.avx2.vpdpwuuds.128
6351	// (< 4 x i32>, < 8 x i16>, < 8 x i16>)
6352	// < 8 x i32> @llvm.x86.avx2.vpdpwuuds.256
6353	// (< 8 x i32>, <16 x i16>, <16 x i16>)
6354	// <16 x i32> @llvm.x86.avx10.vpdpwuuds.512
6355	// (<16 x i32>, <32 x i16>, <32 x i16>)
6356	//
6357	// These intrinsics are auto-upgraded into non-masked forms:
6358	// <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128
6359	// (<4 x i32>, <8 x i16>, <8 x i16>, i8)
6360	// <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128
6361	// (<4 x i32>, <8 x i16>, <8 x i16>, i8)
6362	// <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256
6363	// (<8 x i32>, <16 x i16>, <16 x i16>, i8)
6364	// <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256
6365	// (<8 x i32>, <16 x i16>, <16 x i16>, i8)
6366	// <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512
6367	// (<16 x i32>, <32 x i16>, <32 x i16>, i16)
6368	// <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512
6369	// (<16 x i32>, <32 x i16>, <32 x i16>, i16)
6370	//
6371	// <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128
6372	// (<4 x i32>, <8 x i16>, <8 x i16>, i8)
6373	// <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128
6374	// (<4 x i32>, <8 x i16>, <8 x i16>, i8)
6375	// <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256
6376	// (<8 x i32>, <16 x i16>, <16 x i16>, i8)
6377	// <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256
6378	// (<8 x i32>, <16 x i16>, <16 x i16>, i8)
6379	// <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512
6380	// (<16 x i32>, <32 x i16>, <32 x i16>, i16)
6381	// <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512
6382	// (<16 x i32>, <32 x i16>, <32 x i16>, i16)
6383	case Intrinsic::x86_avx512_vpdpwssd_128:
6384	case Intrinsic::x86_avx512_vpdpwssd_256:
6385	case Intrinsic::x86_avx512_vpdpwssd_512:
6386	case Intrinsic::x86_avx512_vpdpwssds_128:
6387	case Intrinsic::x86_avx512_vpdpwssds_256:
6388	case Intrinsic::x86_avx512_vpdpwssds_512:
6389	case Intrinsic::x86_avx2_vpdpwsud_128:
6390	case Intrinsic::x86_avx2_vpdpwsud_256:
6391	case Intrinsic::x86_avx10_vpdpwsud_512:
6392	case Intrinsic::x86_avx2_vpdpwsuds_128:
6393	case Intrinsic::x86_avx2_vpdpwsuds_256:
6394	case Intrinsic::x86_avx10_vpdpwsuds_512:
6395	case Intrinsic::x86_avx2_vpdpwusd_128:
6396	case Intrinsic::x86_avx2_vpdpwusd_256:
6397	case Intrinsic::x86_avx10_vpdpwusd_512:
6398	case Intrinsic::x86_avx2_vpdpwusds_128:
6399	case Intrinsic::x86_avx2_vpdpwusds_256:
6400	case Intrinsic::x86_avx10_vpdpwusds_512:
6401	case Intrinsic::x86_avx2_vpdpwuud_128:
6402	case Intrinsic::x86_avx2_vpdpwuud_256:
6403	case Intrinsic::x86_avx10_vpdpwuud_512:
6404	case Intrinsic::x86_avx2_vpdpwuuds_128:
6405	case Intrinsic::x86_avx2_vpdpwuuds_256:
6406	case Intrinsic::x86_avx10_vpdpwuuds_512:
6407	handleVectorDotProductIntrinsic(I, /ReductionFactor=/`2`,
6408	/ZeroPurifies=/true,
6409	/EltSizeInBits=/`0`,
6410	/Lanes=/kBothLanes);
6411	break;
6412
6413	// Dot Product of BF16 Pairs Accumulated Into Packed Single
6414	// Precision
6415	// <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128
6416	// (<4 x float>, <8 x bfloat>, <8 x bfloat>)
6417	// <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256
6418	// (<8 x float>, <16 x bfloat>, <16 x bfloat>)
6419	// <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512
6420	// (<16 x float>, <32 x bfloat>, <32 x bfloat>)
6421	case Intrinsic::x86_avx512bf16_dpbf16ps_128:
6422	case Intrinsic::x86_avx512bf16_dpbf16ps_256:
6423	case Intrinsic::x86_avx512bf16_dpbf16ps_512:
6424	handleVectorDotProductIntrinsic(I, /ReductionFactor=/`2`,
6425	/ZeroPurifies=/false,
6426	/EltSizeInBits=/`0`,
6427	/Lanes=/kBothLanes);
6428	break;
6429
6430	case Intrinsic::x86_sse_cmp_ss:
6431	case Intrinsic::x86_sse2_cmp_sd:
6432	case Intrinsic::x86_sse_comieq_ss:
6433	case Intrinsic::x86_sse_comilt_ss:
6434	case Intrinsic::x86_sse_comile_ss:
6435	case Intrinsic::x86_sse_comigt_ss:
6436	case Intrinsic::x86_sse_comige_ss:
6437	case Intrinsic::x86_sse_comineq_ss:
6438	case Intrinsic::x86_sse_ucomieq_ss:
6439	case Intrinsic::x86_sse_ucomilt_ss:
6440	case Intrinsic::x86_sse_ucomile_ss:
6441	case Intrinsic::x86_sse_ucomigt_ss:
6442	case Intrinsic::x86_sse_ucomige_ss:
6443	case Intrinsic::x86_sse_ucomineq_ss:
6444	case Intrinsic::x86_sse2_comieq_sd:
6445	case Intrinsic::x86_sse2_comilt_sd:
6446	case Intrinsic::x86_sse2_comile_sd:
6447	case Intrinsic::x86_sse2_comigt_sd:
6448	case Intrinsic::x86_sse2_comige_sd:
6449	case Intrinsic::x86_sse2_comineq_sd:
6450	case Intrinsic::x86_sse2_ucomieq_sd:
6451	case Intrinsic::x86_sse2_ucomilt_sd:
6452	case Intrinsic::x86_sse2_ucomile_sd:
6453	case Intrinsic::x86_sse2_ucomigt_sd:
6454	case Intrinsic::x86_sse2_ucomige_sd:
6455	case Intrinsic::x86_sse2_ucomineq_sd:
6456	handleVectorCompareScalarIntrinsic(I);
6457	break;
6458
6459	case Intrinsic::x86_avx_cmp_pd_256:
6460	case Intrinsic::x86_avx_cmp_ps_256:
6461	case Intrinsic::x86_sse2_cmp_pd:
6462	case Intrinsic::x86_sse_cmp_ps:
6463	handleVectorComparePackedIntrinsic(I, /PredicateAsOperand=/true);
6464	break;
6465
6466	case Intrinsic::x86_bmi_bextr_32:
6467	case Intrinsic::x86_bmi_bextr_64:
6468	case Intrinsic::x86_bmi_bzhi_32:
6469	case Intrinsic::x86_bmi_bzhi_64:
6470	case Intrinsic::x86_bmi_pdep_32:
6471	case Intrinsic::x86_bmi_pdep_64:
6472	case Intrinsic::x86_bmi_pext_32:
6473	case Intrinsic::x86_bmi_pext_64:
6474	handleBmiIntrinsic(I);
6475	break;
6476
6477	case Intrinsic::x86_pclmulqdq:
6478	case Intrinsic::x86_pclmulqdq_256:
6479	case Intrinsic::x86_pclmulqdq_512:
6480	handlePclmulIntrinsic(I);
6481	break;
6482
6483	case Intrinsic::x86_avx_round_pd_256:
6484	case Intrinsic::x86_avx_round_ps_256:
6485	case Intrinsic::x86_sse41_round_pd:
6486	case Intrinsic::x86_sse41_round_ps:
6487	handleRoundPdPsIntrinsic(I);
6488	break;
6489
6490	case Intrinsic::x86_sse41_round_sd:
6491	case Intrinsic::x86_sse41_round_ss:
6492	handleUnarySdSsIntrinsic(I);
6493	break;
6494
6495	case Intrinsic::x86_sse2_max_sd:
6496	case Intrinsic::x86_sse_max_ss:
6497	case Intrinsic::x86_sse2_min_sd:
6498	case Intrinsic::x86_sse_min_ss:
6499	handleBinarySdSsIntrinsic(I);
6500	break;
6501
6502	case Intrinsic::x86_avx_vtestc_pd:
6503	case Intrinsic::x86_avx_vtestc_pd_256:
6504	case Intrinsic::x86_avx_vtestc_ps:
6505	case Intrinsic::x86_avx_vtestc_ps_256:
6506	case Intrinsic::x86_avx_vtestnzc_pd:
6507	case Intrinsic::x86_avx_vtestnzc_pd_256:
6508	case Intrinsic::x86_avx_vtestnzc_ps:
6509	case Intrinsic::x86_avx_vtestnzc_ps_256:
6510	case Intrinsic::x86_avx_vtestz_pd:
6511	case Intrinsic::x86_avx_vtestz_pd_256:
6512	case Intrinsic::x86_avx_vtestz_ps:
6513	case Intrinsic::x86_avx_vtestz_ps_256:
6514	case Intrinsic::x86_avx_ptestc_256:
6515	case Intrinsic::x86_avx_ptestnzc_256:
6516	case Intrinsic::x86_avx_ptestz_256:
6517	case Intrinsic::x86_sse41_ptestc:
6518	case Intrinsic::x86_sse41_ptestnzc:
6519	case Intrinsic::x86_sse41_ptestz:
6520	handleVtestIntrinsic(I);
6521	break;
6522
6523	// Packed Horizontal Add/Subtract
6524	case Intrinsic::x86_ssse3_phadd_w:
6525	case Intrinsic::x86_ssse3_phadd_w_128:
6526	case Intrinsic::x86_ssse3_phsub_w:
6527	case Intrinsic::x86_ssse3_phsub_w_128:
6528	handlePairwiseShadowOrIntrinsic(I, /Shards=/`1`,
6529	/ReinterpretElemWidth=/`16`);
6530	break;
6531
6532	case Intrinsic::x86_avx2_phadd_w:
6533	case Intrinsic::x86_avx2_phsub_w:
6534	handlePairwiseShadowOrIntrinsic(I, /Shards=/`2`,
6535	/ReinterpretElemWidth=/`16`);
6536	break;
6537
6538	// Packed Horizontal Add/Subtract
6539	case Intrinsic::x86_ssse3_phadd_d:
6540	case Intrinsic::x86_ssse3_phadd_d_128:
6541	case Intrinsic::x86_ssse3_phsub_d:
6542	case Intrinsic::x86_ssse3_phsub_d_128:
6543	handlePairwiseShadowOrIntrinsic(I, /Shards=/`1`,
6544	/ReinterpretElemWidth=/`32`);
6545	break;
6546
6547	case Intrinsic::x86_avx2_phadd_d:
6548	case Intrinsic::x86_avx2_phsub_d:
6549	handlePairwiseShadowOrIntrinsic(I, /Shards=/`2`,
6550	/ReinterpretElemWidth=/`32`);
6551	break;
6552
6553	// Packed Horizontal Add/Subtract and Saturate
6554	case Intrinsic::x86_ssse3_phadd_sw:
6555	case Intrinsic::x86_ssse3_phadd_sw_128:
6556	case Intrinsic::x86_ssse3_phsub_sw:
6557	case Intrinsic::x86_ssse3_phsub_sw_128:
6558	handlePairwiseShadowOrIntrinsic(I, /Shards=/`1`,
6559	/ReinterpretElemWidth=/`16`);
6560	break;
6561
6562	case Intrinsic::x86_avx2_phadd_sw:
6563	case Intrinsic::x86_avx2_phsub_sw:
6564	handlePairwiseShadowOrIntrinsic(I, /Shards=/`2`,
6565	/ReinterpretElemWidth=/`16`);
6566	break;
6567
6568	// Packed Single/Double Precision Floating-Point Horizontal Add
6569	case Intrinsic::x86_sse3_hadd_ps:
6570	case Intrinsic::x86_sse3_hadd_pd:
6571	case Intrinsic::x86_sse3_hsub_ps:
6572	case Intrinsic::x86_sse3_hsub_pd:
6573	handlePairwiseShadowOrIntrinsic(I, /Shards=/`1`);
6574	break;
6575
6576	case Intrinsic::x86_avx_hadd_pd_256:
6577	case Intrinsic::x86_avx_hadd_ps_256:
6578	case Intrinsic::x86_avx_hsub_pd_256:
6579	case Intrinsic::x86_avx_hsub_ps_256:
6580	handlePairwiseShadowOrIntrinsic(I, /Shards=/`2`);
6581	break;
6582
6583	case Intrinsic::x86_avx_maskstore_ps:
6584	case Intrinsic::x86_avx_maskstore_pd:
6585	case Intrinsic::x86_avx_maskstore_ps_256:
6586	case Intrinsic::x86_avx_maskstore_pd_256:
6587	case Intrinsic::x86_avx2_maskstore_d:
6588	case Intrinsic::x86_avx2_maskstore_q:
6589	case Intrinsic::x86_avx2_maskstore_d_256:
6590	case Intrinsic::x86_avx2_maskstore_q_256: {
6591	handleAVXMaskedStore(I);
6592	break;
6593	}
6594
6595	case Intrinsic::x86_avx_maskload_ps:
6596	case Intrinsic::x86_avx_maskload_pd:
6597	case Intrinsic::x86_avx_maskload_ps_256:
6598	case Intrinsic::x86_avx_maskload_pd_256:
6599	case Intrinsic::x86_avx2_maskload_d:
6600	case Intrinsic::x86_avx2_maskload_q:
6601	case Intrinsic::x86_avx2_maskload_d_256:
6602	case Intrinsic::x86_avx2_maskload_q_256: {
6603	handleAVXMaskedLoad(I);
6604	break;
6605	}
6606
6607	// Packed FP arithmetic and min/max (512-bit, trailing rounding-mode operand)
6608	case Intrinsic::x86_avx512fp16_add_ph_512:
6609	case Intrinsic::x86_avx512fp16_sub_ph_512:
6610	case Intrinsic::x86_avx512fp16_mul_ph_512:
6611	case Intrinsic::x86_avx512fp16_div_ph_512:
6612	case Intrinsic::x86_avx512fp16_max_ph_512:
6613	case Intrinsic::x86_avx512fp16_min_ph_512:
6614	case Intrinsic::x86_avx512_min_ps_512:
6615	case Intrinsic::x86_avx512_min_pd_512:
6616	case Intrinsic::x86_avx512_max_ps_512:
6617	case Intrinsic::x86_avx512_max_pd_512: {
6618	// These AVX512 variants contain the rounding mode as a trailing flag.
6619	// Earlier variants do not have a trailing flag and are already handled
6620	// by maybeHandleSimpleNomemIntrinsic(I, 0) via
6621	// maybeHandleUnknownIntrinsic.
6622	[[maybe_unused]] bool Success =
6623	maybeHandleSimpleNomemIntrinsic(I, /trailingFlags=/`1`);
6624	assert(Success);
6625	break;
6626	}
6627
6628	case Intrinsic::x86_avx_vpermilvar_pd:
6629	case Intrinsic::x86_avx_vpermilvar_pd_256:
6630	case Intrinsic::x86_avx512_vpermilvar_pd_512:
6631	case Intrinsic::x86_avx_vpermilvar_ps:
6632	case Intrinsic::x86_avx_vpermilvar_ps_256:
6633	case Intrinsic::x86_avx512_vpermilvar_ps_512: {
6634	handleAVXVpermilvar(I);
6635	break;
6636	}
6637
6638	case Intrinsic::x86_avx512_vpermi2var_d_128:
6639	case Intrinsic::x86_avx512_vpermi2var_d_256:
6640	case Intrinsic::x86_avx512_vpermi2var_d_512:
6641	case Intrinsic::x86_avx512_vpermi2var_hi_128:
6642	case Intrinsic::x86_avx512_vpermi2var_hi_256:
6643	case Intrinsic::x86_avx512_vpermi2var_hi_512:
6644	case Intrinsic::x86_avx512_vpermi2var_pd_128:
6645	case Intrinsic::x86_avx512_vpermi2var_pd_256:
6646	case Intrinsic::x86_avx512_vpermi2var_pd_512:
6647	case Intrinsic::x86_avx512_vpermi2var_ps_128:
6648	case Intrinsic::x86_avx512_vpermi2var_ps_256:
6649	case Intrinsic::x86_avx512_vpermi2var_ps_512:
6650	case Intrinsic::x86_avx512_vpermi2var_q_128:
6651	case Intrinsic::x86_avx512_vpermi2var_q_256:
6652	case Intrinsic::x86_avx512_vpermi2var_q_512:
6653	case Intrinsic::x86_avx512_vpermi2var_qi_128:
6654	case Intrinsic::x86_avx512_vpermi2var_qi_256:
6655	case Intrinsic::x86_avx512_vpermi2var_qi_512:
6656	handleAVXVpermi2var(I);
6657	break;
6658
6659	// Packed Shuffle
6660	// llvm.x86.sse.pshuf.w(<1 x i64>, i8)
6661	// llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>)
6662	// llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
6663	// llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
6664	// llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
6665	//
6666	// The following intrinsics are auto-upgraded:
6667	// llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
6668	// llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
6669	// llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
6670	case Intrinsic::x86_avx2_pshuf_b:
6671	case Intrinsic::x86_sse_pshuf_w:
6672	case Intrinsic::x86_ssse3_pshuf_b_128:
6673	case Intrinsic::x86_ssse3_pshuf_b:
6674	case Intrinsic::x86_avx512_pshuf_b_512:
6675	handleIntrinsicByApplyingToShadow(I, shadowIntrinsicID: I.getIntrinsicID(),
6676	/trailingVerbatimArgs=/`1`);
6677	break;
6678
6679	// AVX512 PMOV: Packed MOV, with truncation
6680	// Precisely handled by applying the same intrinsic to the shadow
6681	case Intrinsic::x86_avx512_mask_pmov_dw_512:
6682	case Intrinsic::x86_avx512_mask_pmov_db_512:
6683	case Intrinsic::x86_avx512_mask_pmov_qb_512:
6684	case Intrinsic::x86_avx512_mask_pmov_qw_512: {
6685	// Intrinsic::x86_avx512_mask_pmov_{qd,wb}_512 were removed in
6686	// f608dc1f5775ee880e8ea30e2d06ab5a4a935c22
6687	handleIntrinsicByApplyingToShadow(I, shadowIntrinsicID: I.getIntrinsicID(),
6688	/trailingVerbatimArgs=/`1`);
6689	break;
6690	}
6691
6692	// AVX512 PMOV{S,US}: Packed MOV, with signed/unsigned saturation
6693	// Approximately handled using the corresponding truncation intrinsic
6694	// TODO: improve handleAVX512VectorDownConvert to precisely model saturation
6695	case Intrinsic::x86_avx512_mask_pmovs_dw_512:
6696	case Intrinsic::x86_avx512_mask_pmovus_dw_512: {
6697	handleIntrinsicByApplyingToShadow(I,
6698	shadowIntrinsicID: Intrinsic::x86_avx512_mask_pmov_dw_512,
6699	/ trailingVerbatimArgs=/`1`);
6700	break;
6701	}
6702
6703	case Intrinsic::x86_avx512_mask_pmovs_db_512:
6704	case Intrinsic::x86_avx512_mask_pmovus_db_512: {
6705	handleIntrinsicByApplyingToShadow(I,
6706	shadowIntrinsicID: Intrinsic::x86_avx512_mask_pmov_db_512,
6707	/ trailingVerbatimArgs=/`1`);
6708	break;
6709	}
6710
6711	case Intrinsic::x86_avx512_mask_pmovs_qb_512:
6712	case Intrinsic::x86_avx512_mask_pmovus_qb_512: {
6713	handleIntrinsicByApplyingToShadow(I,
6714	shadowIntrinsicID: Intrinsic::x86_avx512_mask_pmov_qb_512,
6715	/ trailingVerbatimArgs=/`1`);
6716	break;
6717	}
6718
6719	case Intrinsic::x86_avx512_mask_pmovs_qw_512:
6720	case Intrinsic::x86_avx512_mask_pmovus_qw_512: {
6721	handleIntrinsicByApplyingToShadow(I,
6722	shadowIntrinsicID: Intrinsic::x86_avx512_mask_pmov_qw_512,
6723	/ trailingVerbatimArgs=/`1`);
6724	break;
6725	}
6726
6727	case Intrinsic::x86_avx512_mask_pmovs_qd_512:
6728	case Intrinsic::x86_avx512_mask_pmovus_qd_512:
6729	case Intrinsic::x86_avx512_mask_pmovs_wb_512:
6730	case Intrinsic::x86_avx512_mask_pmovus_wb_512: {
6731	// Since Intrinsic::x86_avx512_mask_pmov_{qd,wb}_512 do not exist, we
6732	// cannot use handleIntrinsicByApplyingToShadow. Instead, we call the
6733	// slow-path handler.
6734	handleAVX512VectorDownConvert(I);
6735	break;
6736	}
6737
6738	// AVX512/AVX10 Reciprocal Square Root
6739	// <16 x float> @llvm.x86.avx512.rsqrt14.ps.512
6740	// (<16 x float>, <16 x float>, i16)
6741	// <8 x float> @llvm.x86.avx512.rsqrt14.ps.256
6742	// (<8 x float>, <8 x float>, i8)
6743	// <4 x float> @llvm.x86.avx512.rsqrt14.ps.128
6744	// (<4 x float>, <4 x float>, i8)
6745	//
6746	// <8 x double> @llvm.x86.avx512.rsqrt14.pd.512
6747	// (<8 x double>, <8 x double>, i8)
6748	// <4 x double> @llvm.x86.avx512.rsqrt14.pd.256
6749	// (<4 x double>, <4 x double>, i8)
6750	// <2 x double> @llvm.x86.avx512.rsqrt14.pd.128
6751	// (<2 x double>, <2 x double>, i8)
6752	//
6753	// <32 x bfloat> @llvm.x86.avx10.mask.rsqrt.bf16.512
6754	// (<32 x bfloat>, <32 x bfloat>, i32)
6755	// <16 x bfloat> @llvm.x86.avx10.mask.rsqrt.bf16.256
6756	// (<16 x bfloat>, <16 x bfloat>, i16)
6757	// <8 x bfloat> @llvm.x86.avx10.mask.rsqrt.bf16.128
6758	// (<8 x bfloat>, <8 x bfloat>, i8)
6759	//
6760	// <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512
6761	// (<32 x half>, <32 x half>, i32)
6762	// <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256
6763	// (<16 x half>, <16 x half>, i16)
6764	// <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128
6765	// (<8 x half>, <8 x half>, i8)
6766	//
6767	// TODO: 3-operand variants are not handled:
6768	// <2 x double> @llvm.x86.avx512.rsqrt14.sd
6769	// (<2 x double>, <2 x double>, <2 x double>, i8)
6770	// <4 x float> @llvm.x86.avx512.rsqrt14.ss
6771	// (<4 x float>, <4 x float>, <4 x float>, i8)
6772	// <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh
6773	// (<8 x half>, <8 x half>, <8 x half>, i8)
6774	case Intrinsic::x86_avx512_rsqrt14_ps_512:
6775	case Intrinsic::x86_avx512_rsqrt14_ps_256:
6776	case Intrinsic::x86_avx512_rsqrt14_ps_128:
6777	case Intrinsic::x86_avx512_rsqrt14_pd_512:
6778	case Intrinsic::x86_avx512_rsqrt14_pd_256:
6779	case Intrinsic::x86_avx512_rsqrt14_pd_128:
6780	case Intrinsic::x86_avx10_mask_rsqrt_bf16_512:
6781	case Intrinsic::x86_avx10_mask_rsqrt_bf16_256:
6782	case Intrinsic::x86_avx10_mask_rsqrt_bf16_128:
6783	case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_512:
6784	case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_256:
6785	case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_128:
6786	handleAVX512VectorGenericMaskedFP(I, /DataIndices=/{`0`},
6787	/WriteThruIndex=/`1`,
6788	/MaskIndex=/`2`);
6789	break;
6790
6791	// AVX512/AVX10 Reciprocal
6792	// <16 x float> @llvm.x86.avx512.rcp14.ps.512
6793	// (<16 x float>, <16 x float>, i16)
6794	// <8 x float> @llvm.x86.avx512.rcp14.ps.256
6795	// (<8 x float>, <8 x float>, i8)
6796	// <4 x float> @llvm.x86.avx512.rcp14.ps.128
6797	// (<4 x float>, <4 x float>, i8)
6798	//
6799	// <8 x double> @llvm.x86.avx512.rcp14.pd.512
6800	// (<8 x double>, <8 x double>, i8)
6801	// <4 x double> @llvm.x86.avx512.rcp14.pd.256
6802	// (<4 x double>, <4 x double>, i8)
6803	// <2 x double> @llvm.x86.avx512.rcp14.pd.128
6804	// (<2 x double>, <2 x double>, i8)
6805	//
6806	// <32 x bfloat> @llvm.x86.avx10.mask.rcp.bf16.512
6807	// (<32 x bfloat>, <32 x bfloat>, i32)
6808	// <16 x bfloat> @llvm.x86.avx10.mask.rcp.bf16.256
6809	// (<16 x bfloat>, <16 x bfloat>, i16)
6810	// <8 x bfloat> @llvm.x86.avx10.mask.rcp.bf16.128
6811	// (<8 x bfloat>, <8 x bfloat>, i8)
6812	//
6813	// <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512
6814	// (<32 x half>, <32 x half>, i32)
6815	// <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256
6816	// (<16 x half>, <16 x half>, i16)
6817	// <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128
6818	// (<8 x half>, <8 x half>, i8)
6819	//
6820	// TODO: 3-operand variants are not handled:
6821	// <2 x double> @llvm.x86.avx512.rcp14.sd
6822	// (<2 x double>, <2 x double>, <2 x double>, i8)
6823	// <4 x float> @llvm.x86.avx512.rcp14.ss
6824	// (<4 x float>, <4 x float>, <4 x float>, i8)
6825	// <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh
6826	// (<8 x half>, <8 x half>, <8 x half>, i8)
6827	case Intrinsic::x86_avx512_rcp14_ps_512:
6828	case Intrinsic::x86_avx512_rcp14_ps_256:
6829	case Intrinsic::x86_avx512_rcp14_ps_128:
6830	case Intrinsic::x86_avx512_rcp14_pd_512:
6831	case Intrinsic::x86_avx512_rcp14_pd_256:
6832	case Intrinsic::x86_avx512_rcp14_pd_128:
6833	case Intrinsic::x86_avx10_mask_rcp_bf16_512:
6834	case Intrinsic::x86_avx10_mask_rcp_bf16_256:
6835	case Intrinsic::x86_avx10_mask_rcp_bf16_128:
6836	case Intrinsic::x86_avx512fp16_mask_rcp_ph_512:
6837	case Intrinsic::x86_avx512fp16_mask_rcp_ph_256:
6838	case Intrinsic::x86_avx512fp16_mask_rcp_ph_128:
6839	handleAVX512VectorGenericMaskedFP(I, /DataIndices=/{`0`},
6840	/WriteThruIndex=/`1`,
6841	/MaskIndex=/`2`);
6842	break;
6843
6844	// <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512
6845	// (<32 x half>, i32, <32 x half>, i32, i32)
6846	// <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256
6847	// (<16 x half>, i32, <16 x half>, i32, i16)
6848	// <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128
6849	// (<8 x half>, i32, <8 x half>, i32, i8)
6850	//
6851	// <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512
6852	// (<16 x float>, i32, <16 x float>, i16, i32)
6853	// <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256
6854	// (<8 x float>, i32, <8 x float>, i8)
6855	// <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128
6856	// (<4 x float>, i32, <4 x float>, i8)
6857	//
6858	// <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512
6859	// (<8 x double>, i32, <8 x double>, i8, i32)
6860	// A Imm WriteThru Mask Rounding
6861	// <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256
6862	// (<4 x double>, i32, <4 x double>, i8)
6863	// <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128
6864	// (<2 x double>, i32, <2 x double>, i8)
6865	// A Imm WriteThru Mask
6866	//
6867	// <32 x bfloat> @llvm.x86.avx10.mask.rndscale.bf16.512
6868	// (<32 x bfloat>, i32, <32 x bfloat>, i32)
6869	// <16 x bfloat> @llvm.x86.avx10.mask.rndscale.bf16.256
6870	// (<16 x bfloat>, i32, <16 x bfloat>, i16)
6871	// <8 x bfloat> @llvm.x86.avx10.mask.rndscale.bf16.128
6872	// (<8 x bfloat>, i32, <8 x bfloat>, i8)
6873	//
6874	// Not supported: three vectors
6875	// - <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh
6876	// (<8 x half>, <8 x half>,<8 x half>, i8, i32, i32)
6877	// - <4 x float> @llvm.x86.avx512.mask.rndscale.ss
6878	// (<4 x float>, <4 x float>, <4 x float>, i8, i32, i32)
6879	// - <2 x double> @llvm.x86.avx512.mask.rndscale.sd
6880	// (<2 x double>, <2 x double>, <2 x double>, i8, i32,
6881	// i32)
6882	// A B WriteThru Mask Imm
6883	// Rounding
6884	case Intrinsic::x86_avx512fp16_mask_rndscale_ph_512:
6885	case Intrinsic::x86_avx512fp16_mask_rndscale_ph_256:
6886	case Intrinsic::x86_avx512fp16_mask_rndscale_ph_128:
6887	case Intrinsic::x86_avx512_mask_rndscale_ps_512:
6888	case Intrinsic::x86_avx512_mask_rndscale_ps_256:
6889	case Intrinsic::x86_avx512_mask_rndscale_ps_128:
6890	case Intrinsic::x86_avx512_mask_rndscale_pd_512:
6891	case Intrinsic::x86_avx512_mask_rndscale_pd_256:
6892	case Intrinsic::x86_avx512_mask_rndscale_pd_128:
6893	case Intrinsic::x86_avx10_mask_rndscale_bf16_512:
6894	case Intrinsic::x86_avx10_mask_rndscale_bf16_256:
6895	case Intrinsic::x86_avx10_mask_rndscale_bf16_128:
6896	handleAVX512VectorGenericMaskedFP(I, /DataIndices=/{`0`},
6897	/WriteThruIndex=/`2`,
6898	/MaskIndex=/`3`);
6899	break;
6900
6901	// AVX512 Vector Scale Float Packed*
6902	//
6903	// < 8 x double> @llvm.x86.avx512.mask.scalef.pd.512
6904	// (<8 x double>, <8 x double>, <8 x double>, i8, i32)
6905	// A B WriteThru Msk Round
6906	// < 4 x double> @llvm.x86.avx512.mask.scalef.pd.256
6907	// (<4 x double>, <4 x double>, <4 x double>, i8)
6908	// < 2 x double> @llvm.x86.avx512.mask.scalef.pd.128
6909	// (<2 x double>, <2 x double>, <2 x double>, i8)
6910	//
6911	// <16 x float> @llvm.x86.avx512.mask.scalef.ps.512
6912	// (<16 x float>, <16 x float>, <16 x float>, i16, i32)
6913	// < 8 x float> @llvm.x86.avx512.mask.scalef.ps.256
6914	// (<8 x float>, <8 x float>, <8 x float>, i8)
6915	// < 4 x float> @llvm.x86.avx512.mask.scalef.ps.128
6916	// (<4 x float>, <4 x float>, <4 x float>, i8)
6917	//
6918	// <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512
6919	// (<32 x half>, <32 x half>, <32 x half>, i32, i32)
6920	// <16 x half> @llvm.x86.avx512fp16.mask.scalef.ph.256
6921	// (<16 x half>, <16 x half>, <16 x half>, i16)
6922	// < 8 x half> @llvm.x86.avx512fp16.mask.scalef.ph.128
6923	// (<8 x half>, <8 x half>, <8 x half>, i8)
6924	//
6925	// TODO: AVX10
6926	// <32 x bfloat> @llvm.x86.avx10.mask.scalef.bf16.512
6927	// (<32 x bfloat>, <32 x bfloat>, <32 x bfloat>, i32)
6928	// <16 x bfloat> @llvm.x86.avx10.mask.scalef.bf16.256
6929	// (<16 x bfloat>, <16 x bfloat>, <16 x bfloat>, i16)
6930	// < 8 x bfloat> @llvm.x86.avx10.mask.scalef.bf16.128
6931	// (<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8)
6932	case Intrinsic::x86_avx512_mask_scalef_pd_512:
6933	case Intrinsic::x86_avx512_mask_scalef_pd_256:
6934	case Intrinsic::x86_avx512_mask_scalef_pd_128:
6935	case Intrinsic::x86_avx512_mask_scalef_ps_512:
6936	case Intrinsic::x86_avx512_mask_scalef_ps_256:
6937	case Intrinsic::x86_avx512_mask_scalef_ps_128:
6938	case Intrinsic::x86_avx512fp16_mask_scalef_ph_512:
6939	case Intrinsic::x86_avx512fp16_mask_scalef_ph_256:
6940	case Intrinsic::x86_avx512fp16_mask_scalef_ph_128:
6941	// The AVX512 512-bit operand variants have an extra operand (the
6942	// Rounding mode). The extra operand, if present, will be
6943	// automatically checked by the handler.
6944	handleAVX512VectorGenericMaskedFP(I, /DataIndices=/{`0`, `1`},
6945	/WriteThruIndex=/`2`,
6946	/MaskIndex=/`3`);
6947	break;
6948
6949	// TODO: AVX512 Vector Scale Float Scalar*
6950	//
6951	// This is different from the Packed variant, because some bits are copied,
6952	// and some bits are zeroed.
6953	//
6954	// < 4 x float> @llvm.x86.avx512.mask.scalef.ss
6955	// (<4 x float>, <4 x float>, <4 x float>, i8, i32)
6956	//
6957	// < 2 x double> @llvm.x86.avx512.mask.scalef.sd
6958	// (<2 x double>, <2 x double>, <2 x double>, i8, i32)
6959	//
6960	// < 8 x half> @llvm.x86.avx512fp16.mask.scalef.sh
6961	// (<8 x half>, <8 x half>, <8 x half>, i8, i32)
6962
6963	// AVX512 FP16 Arithmetic
6964	case Intrinsic::x86_avx512fp16_mask_add_sh_round:
6965	case Intrinsic::x86_avx512fp16_mask_sub_sh_round:
6966	case Intrinsic::x86_avx512fp16_mask_mul_sh_round:
6967	case Intrinsic::x86_avx512fp16_mask_div_sh_round:
6968	case Intrinsic::x86_avx512fp16_mask_max_sh_round:
6969	case Intrinsic::x86_avx512fp16_mask_min_sh_round: {
6970	visitGenericScalarHalfwordInst(I);
6971	break;
6972	}
6973
6974	// AVX Galois Field New Instructions
6975	case Intrinsic::x86_vgf2p8affineqb_128:
6976	case Intrinsic::x86_vgf2p8affineqb_256:
6977	case Intrinsic::x86_vgf2p8affineqb_512:
6978	handleAVXGF2P8Affine(I);
6979	break;
6980
6981	default:
6982	return false;
6983	}
6984
6985	return true;
6986	}
6987
6988	bool maybeHandleArmSIMDIntrinsic(IntrinsicInst &I) {
6989	switch (I.getIntrinsicID()) {
6990	// Two operands e.g.,
6991	// - <8 x i8> @llvm.aarch64.neon.rshrn.v8i8 (<8 x i16>, i32)
6992	// - <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16>, <4 x i16>)
6993	case Intrinsic::aarch64_neon_rshrn:
6994	case Intrinsic::aarch64_neon_sqrshl:
6995	case Intrinsic::aarch64_neon_sqrshrn:
6996	case Intrinsic::aarch64_neon_sqrshrun:
6997	case Intrinsic::aarch64_neon_sqshl:
6998	case Intrinsic::aarch64_neon_sqshlu:
6999	case Intrinsic::aarch64_neon_sqshrn:
7000	case Intrinsic::aarch64_neon_sqshrun:
7001	case Intrinsic::aarch64_neon_srshl:
7002	case Intrinsic::aarch64_neon_sshl:
7003	case Intrinsic::aarch64_neon_uqrshl:
7004	case Intrinsic::aarch64_neon_uqrshrn:
7005	case Intrinsic::aarch64_neon_uqshl:
7006	case Intrinsic::aarch64_neon_uqshrn:
7007	case Intrinsic::aarch64_neon_urshl:
7008	case Intrinsic::aarch64_neon_ushl:
7009	handleVectorShiftIntrinsic(I, / Variable / false);
7010	break;
7011
7012	// Vector Shift Left/Right and Insert
7013	//
7014	// Three operands e.g.,
7015	// - <4 x i16> @llvm.aarch64.neon.vsli.v4i16
7016	// (<4 x i16> %a, <4 x i16> %b, i32 %n)
7017	// - <16 x i8> @llvm.aarch64.neon.vsri.v16i8
7018	// (<16 x i8> %a, <16 x i8> %b, i32 %n)
7019	//
7020	// %b is shifted by %n bits, and the "missing" bits are filled in with %a
7021	// (instead of zero-extending/sign-extending).
7022	case Intrinsic::aarch64_neon_vsli:
7023	case Intrinsic::aarch64_neon_vsri:
7024	handleIntrinsicByApplyingToShadow(I, shadowIntrinsicID: I.getIntrinsicID(),
7025	/trailingVerbatimArgs=/`1`);
7026	break;
7027
7028	// TODO: handling max/min similarly to AND/OR may be more precise
7029	// Floating-Point Maximum/Minimum Pairwise
7030	case Intrinsic::aarch64_neon_fmaxp:
7031	case Intrinsic::aarch64_neon_fminp:
7032	// Floating-Point Maximum/Minimum Number Pairwise
7033	case Intrinsic::aarch64_neon_fmaxnmp:
7034	case Intrinsic::aarch64_neon_fminnmp:
7035	// Signed/Unsigned Maximum/Minimum Pairwise
7036	case Intrinsic::aarch64_neon_smaxp:
7037	case Intrinsic::aarch64_neon_sminp:
7038	case Intrinsic::aarch64_neon_umaxp:
7039	case Intrinsic::aarch64_neon_uminp:
7040	// Add Pairwise
7041	case Intrinsic::aarch64_neon_addp:
7042	// Floating-point Add Pairwise
7043	case Intrinsic::aarch64_neon_faddp:
7044	// Add Long Pairwise
7045	case Intrinsic::aarch64_neon_saddlp:
7046	case Intrinsic::aarch64_neon_uaddlp: {
7047	handlePairwiseShadowOrIntrinsic(I, /Shards=/`1`);
7048	break;
7049	}
7050
7051	// Floating-point Convert to integer, rounding to nearest with ties to Away
7052	case Intrinsic::aarch64_neon_fcvtas:
7053	case Intrinsic::aarch64_neon_fcvtau:
7054	// Floating-point convert to integer, rounding toward minus infinity
7055	case Intrinsic::aarch64_neon_fcvtms:
7056	case Intrinsic::aarch64_neon_fcvtmu:
7057	// Floating-point convert to integer, rounding to nearest with ties to even
7058	case Intrinsic::aarch64_neon_fcvtns:
7059	case Intrinsic::aarch64_neon_fcvtnu:
7060	// Floating-point convert to integer, rounding toward plus infinity
7061	case Intrinsic::aarch64_neon_fcvtps:
7062	case Intrinsic::aarch64_neon_fcvtpu:
7063	// Floating-point Convert to integer, rounding toward Zero
7064	case Intrinsic::aarch64_neon_fcvtzs:
7065	case Intrinsic::aarch64_neon_fcvtzu:
7066	// Floating-point convert to lower precision narrow, rounding to odd
7067	case Intrinsic::aarch64_neon_fcvtxn:
7068	// Vector Conversions Between Half-Precision and Single-Precision
7069	case Intrinsic::aarch64_neon_vcvthf2fp:
7070	case Intrinsic::aarch64_neon_vcvtfp2hf:
7071	handleNEONVectorConvertIntrinsic(I, /FixedPoint=/false);
7072	break;
7073
7074	// Vector Conversions Between Fixed-Point and Floating-Point
7075	case Intrinsic::aarch64_neon_vcvtfxs2fp:
7076	case Intrinsic::aarch64_neon_vcvtfp2fxs:
7077	case Intrinsic::aarch64_neon_vcvtfxu2fp:
7078	case Intrinsic::aarch64_neon_vcvtfp2fxu:
7079	handleNEONVectorConvertIntrinsic(I, /FixedPoint=/true);
7080	break;
7081
7082	// TODO: bfloat conversions
7083	// - bfloat @llvm.aarch64.neon.bfcvt(float)
7084	// - <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>)
7085	// - <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>)
7086
7087	// Add reduction to scalar
7088	case Intrinsic::aarch64_neon_faddv:
7089	case Intrinsic::aarch64_neon_saddv:
7090	case Intrinsic::aarch64_neon_uaddv:
7091	// Signed/Unsigned min/max (Vector)
7092	// TODO: handling similarly to AND/OR may be more precise.
7093	case Intrinsic::aarch64_neon_smaxv:
7094	case Intrinsic::aarch64_neon_sminv:
7095	case Intrinsic::aarch64_neon_umaxv:
7096	case Intrinsic::aarch64_neon_uminv:
7097	// Floating-point min/max (vector)
7098	// The f{min,max}"nm"v variants handle NaN differently than f{min,max}v,
7099	// but our shadow propagation is the same.
7100	case Intrinsic::aarch64_neon_fmaxv:
7101	case Intrinsic::aarch64_neon_fminv:
7102	case Intrinsic::aarch64_neon_fmaxnmv:
7103	case Intrinsic::aarch64_neon_fminnmv:
7104	// Sum long across vector
7105	case Intrinsic::aarch64_neon_saddlv:
7106	case Intrinsic::aarch64_neon_uaddlv:
7107	handleVectorReduceIntrinsic(I, /AllowShadowCast=/true);
7108	break;
7109
7110	case Intrinsic::aarch64_neon_ld1x2:
7111	case Intrinsic::aarch64_neon_ld1x3:
7112	case Intrinsic::aarch64_neon_ld1x4:
7113	case Intrinsic::aarch64_neon_ld2:
7114	case Intrinsic::aarch64_neon_ld3:
7115	case Intrinsic::aarch64_neon_ld4:
7116	case Intrinsic::aarch64_neon_ld2r:
7117	case Intrinsic::aarch64_neon_ld3r:
7118	case Intrinsic::aarch64_neon_ld4r: {
7119	handleNEONVectorLoad(I, /WithLane=/false);
7120	break;
7121	}
7122
7123	case Intrinsic::aarch64_neon_ld2lane:
7124	case Intrinsic::aarch64_neon_ld3lane:
7125	case Intrinsic::aarch64_neon_ld4lane: {
7126	handleNEONVectorLoad(I, /WithLane=/true);
7127	break;
7128	}
7129
7130	// Saturating extract narrow
7131	case Intrinsic::aarch64_neon_sqxtn:
7132	case Intrinsic::aarch64_neon_sqxtun:
7133	case Intrinsic::aarch64_neon_uqxtn:
7134	// These only have one argument, but we (ab)use handleShadowOr because it
7135	// does work on single argument intrinsics and will typecast the shadow
7136	// (and update the origin).
7137	handleShadowOr(I);
7138	break;
7139
7140	case Intrinsic::aarch64_neon_st1x2:
7141	case Intrinsic::aarch64_neon_st1x3:
7142	case Intrinsic::aarch64_neon_st1x4:
7143	case Intrinsic::aarch64_neon_st2:
7144	case Intrinsic::aarch64_neon_st3:
7145	case Intrinsic::aarch64_neon_st4: {
7146	handleNEONVectorStoreIntrinsic(I, useLane: false);
7147	break;
7148	}
7149
7150	case Intrinsic::aarch64_neon_st2lane:
7151	case Intrinsic::aarch64_neon_st3lane:
7152	case Intrinsic::aarch64_neon_st4lane: {
7153	handleNEONVectorStoreIntrinsic(I, useLane: true);
7154	break;
7155	}
7156
7157	// Arm NEON vector table intrinsics have the source/table register(s) as
7158	// arguments, followed by the index register. They return the output.
7159	//
7160	// 'TBL writes a zero if an index is out-of-range, while TBX leaves the
7161	// original value unchanged in the destination register.'
7162	// Conveniently, zero denotes a clean shadow, which means out-of-range
7163	// indices for TBL will initialize the user data with zero and also clean
7164	// the shadow. (For TBX, neither the user data nor the shadow will be
7165	// updated, which is also correct.)
7166	case Intrinsic::aarch64_neon_tbl1:
7167	case Intrinsic::aarch64_neon_tbl2:
7168	case Intrinsic::aarch64_neon_tbl3:
7169	case Intrinsic::aarch64_neon_tbl4:
7170	case Intrinsic::aarch64_neon_tbx1:
7171	case Intrinsic::aarch64_neon_tbx2:
7172	case Intrinsic::aarch64_neon_tbx3:
7173	case Intrinsic::aarch64_neon_tbx4: {
7174	// The last trailing argument (index register) should be handled verbatim
7175	handleIntrinsicByApplyingToShadow(
7176	I, /shadowIntrinsicID=/I.getIntrinsicID(),
7177	/trailingVerbatimArgs/ `1`);
7178	break;
7179	}
7180
7181	case Intrinsic::aarch64_neon_fmulx:
7182	case Intrinsic::aarch64_neon_pmul:
7183	case Intrinsic::aarch64_neon_pmull:
7184	case Intrinsic::aarch64_neon_smull:
7185	case Intrinsic::aarch64_neon_pmull64:
7186	case Intrinsic::aarch64_neon_umull: {
7187	handleNEONVectorMultiplyIntrinsic(I);
7188	break;
7189	}
7190
7191	case Intrinsic::aarch64_neon_smmla:
7192	case Intrinsic::aarch64_neon_ummla:
7193	case Intrinsic::aarch64_neon_usmmla:
7194	case Intrinsic::aarch64_neon_bfmmla:
7195	handleNEONMatrixMultiply(I);
7196	break;
7197
7198	// <2 x i32> @llvm.aarch64.neon.{u,s,us}dot.v2i32.v8i8
7199	// (<2 x i32> %acc, <8 x i8> %a, <8 x i8> %b)
7200	// <4 x i32> @llvm.aarch64.neon.{u,s,us}dot.v4i32.v16i8
7201	// (<4 x i32> %acc, <16 x i8> %a, <16 x i8> %b)
7202	case Intrinsic::aarch64_neon_sdot:
7203	case Intrinsic::aarch64_neon_udot:
7204	case Intrinsic::aarch64_neon_usdot:
7205	handleVectorDotProductIntrinsic(I, /ReductionFactor=/`4`,
7206	/ZeroPurifies=/true,
7207	/EltSizeInBits=/`0`,
7208	/Lanes=/kBothLanes);
7209	break;
7210
7211	// <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16
7212	// (<2 x float> %acc, <4 x bfloat> %a, <4 x bfloat> %b)
7213	// <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16
7214	// (<4 x float> %acc, <8 x bfloat> %a, <8 x bfloat> %b)
7215	case Intrinsic::aarch64_neon_bfdot:
7216	handleVectorDotProductIntrinsic(I, /ReductionFactor=/`2`,
7217	/ZeroPurifies=/false,
7218	/EltSizeInBits=/`0`,
7219	/Lanes=/kBothLanes);
7220	break;
7221
7222	// Floating-Point Absolute Compare Greater Than/Equal
7223	case Intrinsic::aarch64_neon_facge:
7224	case Intrinsic::aarch64_neon_facgt:
7225	handleVectorComparePackedIntrinsic(I, /PredicateAsOperand=/false);
7226	break;
7227
7228	default:
7229	return false;
7230	}
7231
7232	return true;
7233	}
7234
7235	void visitIntrinsicInst(IntrinsicInst &I) {
7236	if (maybeHandleCrossPlatformIntrinsic(I))
7237	return;
7238
7239	if (maybeHandleX86SIMDIntrinsic(I))
7240	return;
7241
7242	if (maybeHandleArmSIMDIntrinsic(I))
7243	return;
7244
7245	if (maybeHandleUnknownIntrinsic(I))
7246	return;
7247
7248	visitInstruction(I);
7249	}
7250
7251	void visitLibAtomicLoad(CallBase &CB) {
7252	// Since we use getNextNode here, we can't have CB terminate the BB.
7253	assert(isa<CallInst>(CB));
7254
7255	IRBuilder<> IRB(&CB);
7256	Value *Size = CB.getArgOperand(i: `0`);
7257	Value *SrcPtr = CB.getArgOperand(i: `1`);
7258	Value *DstPtr = CB.getArgOperand(i: `2`);
7259	Value *Ordering = CB.getArgOperand(i: `3`);
7260	// Convert the call to have at least Acquire ordering to make sure
7261	// the shadow operations aren't reordered before it.
7262	Value *NewOrdering =
7263	IRB.CreateExtractElement(Vec: makeAddAcquireOrderingTable(IRB), Idx: Ordering);
7264	CB.setArgOperand(i: `3`, v: NewOrdering);
7265
7266	NextNodeIRBuilder NextIRB(&CB);
7267	Value SrcShadowPtr, SrcOriginPtr;
7268	std::tie(args&: SrcShadowPtr, args&: SrcOriginPtr) =
7269	getShadowOriginPtr(Addr: SrcPtr, IRB&: NextIRB, ShadowTy: NextIRB.getInt8Ty(), Alignment: Align (`1`),
7270	/isStore/ false);
7271	Value *DstShadowPtr =
7272	getShadowOriginPtr(Addr: DstPtr, IRB&: NextIRB, ShadowTy: NextIRB.getInt8Ty(), Alignment: Align (`1`),
7273	/isStore/ true)
7274	.first;
7275
7276	NextIRB.CreateMemCpy(Dst: DstShadowPtr, DstAlign: Align (`1`), Src: SrcShadowPtr, SrcAlign: Align (`1`), Size);
7277	if (MS.TrackOrigins) {
7278	Value *SrcOrigin = NextIRB.CreateAlignedLoad(Ty: MS.OriginTy, Ptr: SrcOriginPtr,
7279	Align: kMinOriginAlignment);
7280	Value *NewOrigin = updateOrigin(V: SrcOrigin, IRB&: NextIRB);
7281	NextIRB.CreateCall(Callee: MS.MsanSetOriginFn, Args: {DstPtr, Size, NewOrigin});
7282	}
7283	}
7284
7285	void visitLibAtomicStore(CallBase &CB) {
7286	IRBuilder<> IRB(&CB);
7287	Value *Size = CB.getArgOperand(i: `0`);
7288	Value *DstPtr = CB.getArgOperand(i: `2`);
7289	Value *Ordering = CB.getArgOperand(i: `3`);
7290	// Convert the call to have at least Release ordering to make sure
7291	// the shadow operations aren't reordered after it.
7292	Value *NewOrdering =
7293	IRB.CreateExtractElement(Vec: makeAddReleaseOrderingTable(IRB), Idx: Ordering);
7294	CB.setArgOperand(i: `3`, v: NewOrdering);
7295
7296	Value *DstShadowPtr =
7297	getShadowOriginPtr(Addr: DstPtr, IRB, ShadowTy: IRB.getInt8Ty(), Alignment: Align (`1`),
7298	/isStore/ true)
7299	.first;
7300
7301	// Atomic store always paints clean shadow/origin. See file header.
7302	IRB.CreateMemSet(Ptr: DstShadowPtr, Val: getCleanShadow(OrigTy: IRB.getInt8Ty()), Size,
7303	Align: Align (`1`));
7304	}
7305
/// Instrument a call-like instruction (the code below handles both CallInst
/// and InvokeInst, and mentions callbr for inline asm).
///
/// Visible flow:
///  - Inline asm is forwarded to visitAsmInstruction() or visitInstruction(),
///    depending on ClHandleAsmConservative, and nothing else is done.
///  - Calls that TLI recognises as LibFunc_atomic_load / LibFunc_atomic_store
///    are forwarded to visitLibAtomicLoad() / visitLibAtomicStore().
///  - For a CallInst, the Memory and Speculatable function attributes are
///    removed from the call site and, when the callee is known, from the
///    callee itself.
///  - Each sized, non-scalable argument is either checked eagerly or has its
///    shadow (and, when tracked, origin) written to the slot returned by
///    getShadowPtrForArgument() / getOriginPtrForArgument() at a running
///    ArgOffset.
///  - Variadic calls are additionally handed to VAHelper.
///  - Unless the return type is unsized, the call is a musttail call, or the
///    return value is treated as noundef under eager checks, a clean shadow
///    is stored to the retval slot before the call and the retval shadow
///    (and origin) is loaded back after it.
7306	void visitCallBase(CallBase &CB) {
7307	assert(!CB.getMetadata(LLVMContext::MD_nosanitize));
7308	if (CB.isInlineAsm()) {
7309	// For inline asm (either a call to asm function, or callbr instruction),
7310	// do the usual thing: check argument shadow and mark all outputs as
7311	// clean. Note that any side effects of the inline asm that are not
7312	// immediately visible in its constraints are not handled.
7313	if (ClHandleAsmConservative)
7314	visitAsmInstruction(I&: CB);
7315	else
7316	visitInstruction(I&: CB);
7317	return;
7318	}
7319	LibFunc LF;
7320	if (TLI->getLibFunc(CB, F&: LF)) {
7321	// libatomic.a functions need to have special handling because there isn't
7322	// a good way to intercept them or compile the library with
7323	// instrumentation.
7324	switch (LF) {
7325	case LibFunc_atomic_load:
7326	if (!isa<CallInst>(Val: CB)) {
// NOTE(review): the two adjacent string literals below are concatenated
// without a separating space, so the message reads "...load.Ignoring!".
7327	llvm::errs() << "MSAN -- cannot instrument invoke of libatomic load."
7328	"Ignoring!\n";
7329	break;
7330	}
7331	visitLibAtomicLoad(CB);
7332	return;
7333	case LibFunc_atomic_store:
7334	visitLibAtomicStore(CB);
7335	return;
7336	default:
7337	break;
7338	}
7339	}
7340
7341	if (auto *Call = dyn_cast<CallInst>(Val: &CB)) {
7342	assert(!isa<IntrinsicInst>(Call) && "intrinsics are handled elsewhere");
7343
7344	// We are going to insert code that relies on the fact that the callee
7345	// will become a non-readonly function after it is instrumented by us. To
7346	// prevent this code from being optimized out, mark that function
7347	// non-readonly in advance.
7348	// TODO: We can likely do better than dropping memory() completely here.
7349	AttributeMask B;
7350	B.addAttribute(Val: Attribute::Memory).addAttribute(Val: Attribute::Speculatable);
7351
7352	Call->removeFnAttrs(AttrsToRemove: B);
7353	if (Function *Func = Call->getCalledFunction()) {
7354	Func->removeFnAttrs(Attrs: B);
7355	}
7356
7357	maybeMarkSanitizerLibraryCallNoBuiltin(CI: Call, TLI);
7358	}
7359	IRBuilder<> IRB(&CB);
7360	bool MayCheckCall = MS.EagerChecks;
7361	if (Function *Func = CB.getCalledFunction()) {
7362	// __sanitizer_unaligned_{load,store} functions may be called by users
7363	// and always expects shadows in the TLS. So don't check them.
7364	MayCheckCall &= !Func->getName().starts_with(Prefix: "__sanitizer_unaligned_");
7365	}
7366
// ArgOffset is the running offset used for the argument shadow/origin slots;
// it advances by the kShadowTLSAlignment-rounded size of each handled argument.
7367	unsigned ArgOffset = `0`;
7368	LLVM_DEBUG(dbgs() << " CallSite: " << CB << "\n");
7369	for (const auto &[i, A] : llvm::enumerate(First: CB.args())) {
7370	if (!A ->getType()->isSized()) {
7371	LLVM_DEBUG(dbgs() << "Arg " << i << " is not sized: " << CB << "\n");
7372	continue;
7373	}
7374
7375	if (A ->getType()->isScalableTy()) {
7376	LLVM_DEBUG(dbgs() << "Arg " << i << " is vscale: " << CB << "\n");
7377	// Handle as noundef, but don't reserve tls slots.
7378	insertCheckShadowOf(Val: A, OrigIns: &CB);
7379	continue;
7380	}
7381
7382	unsigned Size = `0`;
7383	const DataLayout &DL = F.getDataLayout();
7384
7385	bool ByVal = CB.paramHasAttr(ArgNo: i, Kind: Attribute::ByVal);
7386	bool NoUndef = CB.paramHasAttr(ArgNo: i, Kind: Attribute::NoUndef);
// Eager checking applies only to noundef, non-byval arguments, and only when
// MayCheckCall is still set.
7387	bool EagerCheck = MayCheckCall && !ByVal && NoUndef;
7388
7389	if (EagerCheck) {
7390	insertCheckShadowOf(Val: A, OrigIns: &CB);
7391	Size = DL.getTypeAllocSize(Ty: A ->getType());
7392	} else {
7393	[[maybe_unused]] Value Store = nullptr*;
7394	// Compute the Shadow for arg even if it is ByVal, because
7395	// in that case getShadow() will copy the actual arg shadow to
7396	// __msan_param_tls.
7397	Value *ArgShadow = getShadow(V: A);
7398	Value *ArgShadowBase = getShadowPtrForArgument(IRB, ArgOffset);
7399	LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A
7400	<< " Shadow: " << *ArgShadow << "\n");
7401	if (ByVal) {
7402	// ByVal requires some special handling as it's too big for a single
7403	// load
7404	assert(A->getType()->isPointerTy() &&
7405	"ByVal argument is not a pointer!");
7406	Size = DL.getTypeAllocSize(Ty: CB.getParamByValType(ArgNo: i));
// Once an argument would not fit below kParamTLSSize, the loop is left and
// no shadow is passed for this or any later argument.
7407	if (ArgOffset + Size > kParamTLSSize)
7408	break;
7409	const MaybeAlign ParamAlignment(CB.getParamAlign(ArgNo: i));
7410	MaybeAlign Alignment = std::nullopt;
7411	if (ParamAlignment)
7412	Alignment = std::min(a: *ParamAlignment, b: kShadowTLSAlignment);
7413	Value AShadowPtr, AOriginPtr;
7414	std::tie(args&: AShadowPtr, args&: AOriginPtr) =
7415	getShadowOriginPtr(Addr: A, IRB, ShadowTy: IRB.getInt8Ty(), Alignment,
7416	/isStore/ false);
7417	if (!PropagateShadow) {
7418	Store = IRB.CreateMemSet(Ptr: ArgShadowBase,
7419	Val: Constant::getNullValue(Ty: IRB.getInt8Ty()),
7420	Size, Align: Alignment);
7421	} else {
7422	Store = IRB.CreateMemCpy(Dst: ArgShadowBase, DstAlign: Alignment, Src: AShadowPtr,
7423	SrcAlign: Alignment, Size);
7424	if (MS.TrackOrigins) {
7425	Value *ArgOriginBase = getOriginPtrForArgument(IRB, ArgOffset);
7426	// FIXME: OriginSize should be:
7427	// alignTo(A % kMinOriginAlignment + Size, kMinOriginAlignment)
7428	unsigned OriginSize = alignTo(Size, A: kMinOriginAlignment);
7429	IRB.CreateMemCpy(
7430	Dst: ArgOriginBase,
7431	/ by origin_tls[ArgOffset] / DstAlign: kMinOriginAlignment,
7432	Src: AOriginPtr,
7433	/ by getShadowOriginPtr / SrcAlign: kMinOriginAlignment, Size: OriginSize);
7434	}
7435	}
7436	} else {
7437	// Any other parameters mean we need bit-grained tracking of uninit
7438	// data
7439	Size = DL.getTypeAllocSize(Ty: A ->getType());
7440	if (ArgOffset + Size > kParamTLSSize)
7441	break;
7442	Store = IRB.CreateAlignedStore(Val: ArgShadow, Ptr: ArgShadowBase,
7443	Align: kShadowTLSAlignment);
// The origin store is skipped when the shadow is a constant null value.
7444	Constant *Cst = dyn_cast<Constant>(Val: ArgShadow);
7445	if (MS.TrackOrigins && !(Cst && Cst->isNullValue())) {
7446	IRB.CreateStore(Val: getOrigin(V: A),
7447	Ptr: getOriginPtrForArgument(IRB, ArgOffset));
7448	}
7449	}
7450	assert(Store != nullptr);
7451	LLVM_DEBUG(dbgs() << " Param:" << *Store << "\n");
7452	}
7453	assert(Size != `0`);
7454	ArgOffset += alignTo(Size, A: kShadowTLSAlignment);
7455	}
7456	LLVM_DEBUG(dbgs() << " done with call args\n");
7457
7458	FunctionType *FT = CB.getFunctionType();
7459	if (FT->isVarArg()) {
7460	VAHelper ->visitCallBase(CB, IRB);
7461	}
7462
7463	// Now, get the shadow for the RetVal.
7464	if (!CB.getType()->isSized())
7465	return;
7466	// Don't emit the epilogue for musttail call returns.
7467	if (isa<CallInst>(Val: CB) && cast<CallInst>(Val&: CB).isMustTailCall())
7468	return;
7469
// With eager checks and a noundef return attribute, the result is simply
// marked clean and no retval shadow traffic is emitted.
7470	if (MayCheckCall && CB.hasRetAttr(Kind: Attribute::NoUndef)) {
7471	setShadow(V: &CB, SV: getCleanShadow(V: &CB));
7472	setOrigin(V: &CB, Origin: getCleanOrigin());
7473	return;
7474	}
7475
7476	IRBuilder<> IRBBefore(&CB);
7477	// Until we have full dynamic coverage, make sure the retval shadow is 0.
7478	Value *Base = getShadowPtrForRetval(IRB&: IRBBefore);
7479	IRBBefore.CreateAlignedStore(Val: getCleanShadow(V: &CB), Ptr: Base,
7480	Align: kShadowTLSAlignment);
// Pick the point where the retval shadow is loaded back: the instruction
// after a CallInst, or the first insertion point of an invoke's normal
// destination.
7481	BasicBlock::iterator NextInsn;
7482	if (isa<CallInst>(Val: CB)) {
7483	NextInsn = ++CB.getIterator();
7484	assert(NextInsn != CB.getParent()->end());
7485	} else {
7486	BasicBlock *NormalDest = cast<InvokeInst>(Val&: CB).getNormalDest();
7487	if (!NormalDest->getSinglePredecessor()) {
7488	// FIXME: this case is tricky, so we are just conservative here.
7489	// Perhaps we need to split the edge between this BB and NormalDest,
7490	// but a naive attempt to use SplitEdge leads to a crash.
7491	setShadow(V: &CB, SV: getCleanShadow(V: &CB));
7492	setOrigin(V: &CB, Origin: getCleanOrigin());
7493	return;
7494	}
7495	// FIXME: NextInsn is likely in a basic block that has not been visited
7496	// yet. Anything inserted there will be instrumented by MSan later!
7497	NextInsn = NormalDest->getFirstInsertionPt();
7498	assert(NextInsn != NormalDest->end() &&
7499	"Could not find insertion point for retval shadow load");
7500	}
7501	IRBuilder<> IRBAfter(&*NextInsn);
7502	Value *RetvalShadow = IRBAfter.CreateAlignedLoad(
7503	Ty: getShadowTy(V: &CB), Ptr: getShadowPtrForRetval(IRB&: IRBAfter), Align: kShadowTLSAlignment,
7504	Name: "_msret");
7505	setShadow(V: &CB, SV: RetvalShadow);
7506	if (MS.TrackOrigins)
7507	setOrigin(V: &CB, Origin: IRBAfter.CreateLoad(Ty: MS.OriginTy, Ptr: getOriginPtrForRetval()));
7508	}
7509
7510	bool isAMustTailRetVal(Value *RetVal) {
7511	if (auto *I = dyn_cast<BitCastInst>(Val: RetVal)) {
7512	RetVal = I->getOperand(i_nocapture: `0`);
7513	}
7514	if (auto *I = dyn_cast<CallInst>(Val: RetVal)) {
7515	return I->isMustTailCall();
7516	}
7517	return false;
7518	}
7519
7520	void visitReturnInst(ReturnInst &I) {
7521	IRBuilder<> IRB(&I);
7522	Value *RetVal = I.getReturnValue();
7523	if (!RetVal)
7524	return;
7525	// Don't emit the epilogue for musttail call returns.
7526	if (isAMustTailRetVal(RetVal))
7527	return;
7528	Value *ShadowPtr = getShadowPtrForRetval(IRB);
7529	bool HasNoUndef = F.hasRetAttribute(Kind: Attribute::NoUndef);
7530	bool StoreShadow = !(MS.EagerChecks && HasNoUndef);
7531	// FIXME: Consider using SpecialCaseList to specify a list of functions that
7532	// must always return fully initialized values. For now, we hardcode "main".
7533	bool EagerCheck = (MS.EagerChecks && HasNoUndef) \|\| (F.getName() == "main");
7534
7535	Value *Shadow = getShadow(V: RetVal);
7536	bool StoreOrigin = true;
7537	if (EagerCheck) {
7538	insertCheckShadowOf(Val: RetVal, OrigIns: &I);
7539	Shadow = getCleanShadow(V: RetVal);
7540	StoreOrigin = false;
7541	}
7542
7543	// The caller may still expect information passed over TLS if we pass our
7544	// check
7545	if (StoreShadow) {
7546	IRB.CreateAlignedStore(Val: Shadow, Ptr: ShadowPtr, Align: kShadowTLSAlignment);
7547	if (MS.TrackOrigins && StoreOrigin)
7548	IRB.CreateStore(Val: getOrigin(V: RetVal), Ptr: getOriginPtrForRetval());
7549	}
7550	}
7551
7552	void visitPHINode(PHINode &I) {
7553	IRBuilder<> IRB(&I);
7554	if (!PropagateShadow) {
7555	setShadow(V: &I, SV: getCleanShadow(V: &I));
7556	setOrigin(V: &I, Origin: getCleanOrigin());
7557	return;
7558	}
7559
7560	ShadowPHINodes.push_back(Elt: &I);
7561	setShadow(V: &I, SV: IRB.CreatePHI(Ty: getShadowTy(V: &I), NumReservedValues: I.getNumIncomingValues(),
7562	Name: "_msphi_s"));
7563	if (MS.TrackOrigins)
7564	setOrigin(
7565	V: &I, Origin: IRB.CreatePHI(Ty: MS.OriginTy, NumReservedValues: I.getNumIncomingValues(), Name: "_msphi_o"));
7566	}
7567
7568	Value *getLocalVarIdptr(AllocaInst &I) {
7569	ConstantInt *IntConst =
7570	ConstantInt::get(Ty: Type::getInt32Ty(C&: (*F.getParent()).getContext()), V: `0`);
7571	return new GlobalVariable (*F.getParent(), IntConst->getType(),
7572	/isConstant=/false, GlobalValue::PrivateLinkage,
7573	IntConst);
7574	}
7575
7576	Value *getLocalVarDescription(AllocaInst &I) {
7577	return createPrivateConstGlobalForString(M&: *F.getParent(), Str: I.getName());
7578	}
7579
7580	void poisonAllocaUserspace(AllocaInst &I, IRBuilder<> &IRB, Value *Len) {
7581	if (PoisonStack && ClPoisonStackWithCall) {
7582	IRB.CreateCall(Callee: MS.MsanPoisonStackFn, Args: {&I, Len});
7583	} else {
7584	Value ShadowBase, OriginBase;
7585	std::tie(args&: ShadowBase, args&: OriginBase) = getShadowOriginPtr(
7586	Addr: &I, IRB, ShadowTy: IRB.getInt8Ty(), Alignment: Align (`1`), /isStore/ true);
7587
7588	Value *PoisonValue = IRB.getInt8(C: PoisonStack ? ClPoisonStackPattern : `0`);
7589	IRB.CreateMemSet(Ptr: ShadowBase, Val: PoisonValue, Size: Len, Align: I.getAlign());
7590	}
7591
7592	if (PoisonStack && MS.TrackOrigins) {
7593	Value *Idptr = getLocalVarIdptr(I);
7594	if (ClPrintStackNames) {
7595	Value *Descr = getLocalVarDescription(I);
7596	IRB.CreateCall(Callee: MS.MsanSetAllocaOriginWithDescriptionFn,
7597	Args: {&I, Len, Idptr, Descr});
7598	} else {
7599	IRB.CreateCall(Callee: MS.MsanSetAllocaOriginNoDescriptionFn, Args: {&I, Len, Idptr});
7600	}
7601	}
7602	}
7603
7604	void poisonAllocaKmsan(AllocaInst &I, IRBuilder<> &IRB, Value *Len) {
7605	Value *Descr = getLocalVarDescription(I);
7606	if (PoisonStack) {
7607	IRB.CreateCall(Callee: MS.MsanPoisonAllocaFn, Args: {&I, Len, Descr});
7608	} else {
7609	IRB.CreateCall(Callee: MS.MsanUnpoisonAllocaFn, Args: {&I, Len});
7610	}
7611	}
7612
7613	void instrumentAlloca(AllocaInst &I, Instruction InsPoint = nullptr*) {
7614	if (!InsPoint)
7615	InsPoint = &I;
7616	NextNodeIRBuilder IRB(InsPoint);
7617	Value *Len = IRB.CreateAllocationSize(DestTy: MS.IntptrTy, AI: &I);
7618
7619	if (MS.CompileKernel)
7620	poisonAllocaKmsan(I, IRB, Len);
7621	else
7622	poisonAllocaUserspace(I, IRB, Len);
7623	}
7624
7625	void visitAllocaInst(AllocaInst &I) {
7626	setShadow(V: &I, SV: getCleanShadow(V: &I));
7627	setOrigin(V: &I, Origin: getCleanOrigin());
7628	// We'll get to this alloca later unless it's poisoned at the corresponding
7629	// llvm.lifetime.start.
7630	AllocaSet.insert(X: &I);
7631	}
7632
7633	void visitSelectInst(SelectInst &I) {
7634	// a = select b, c, d
7635	Value *B = I.getCondition();
7636	Value *C = I.getTrueValue();
7637	Value *D = I.getFalseValue();
7638
7639	handleSelectLikeInst(I, B, C, D);
7640	}
7641
7642	void handleSelectLikeInst(Instruction &I, Value B, Value C, Value *D) {
7643	IRBuilder<> IRB(&I);
7644
7645	Value *Sb = getShadow(V: B);
7646	Value *Sc = getShadow(V: C);
7647	Value *Sd = getShadow(V: D);
7648
7649	Value Ob = MS.TrackOrigins ? getOrigin(V: B) : nullptr*;
7650	Value Oc = MS.TrackOrigins ? getOrigin(V: C) : nullptr*;
7651	Value Od = MS.TrackOrigins ? getOrigin(V: D) : nullptr*;
7652
7653	// Result shadow if condition shadow is 0.
7654	Value *Sa0 = IRB.CreateSelect(C: B, True: Sc, False: Sd);
7655	Value *Sa1;
7656	if (I.getType()->isAggregateType()) {
7657	// To avoid "sign extending" i1 to an arbitrary aggregate type, we just do
7658	// an extra "select". This results in much more compact IR.
7659	// Sa = select Sb, poisoned, (select b, Sc, Sd)
7660	Sa1 = getPoisonedShadow(ShadowTy: getShadowTy(OrigTy: I.getType()));
7661	} else if (isScalableNonVectorType(Ty: I.getType())) {
7662	// This is intended to handle target("aarch64.svcount"), which can't be
7663	// handled in the else branch because of incompatibility with CreateXor
7664	// ("The supported LLVM operations on this type are limited to load,
7665	// store, phi, select and alloca instructions").
7666
7667	// TODO: this currently underapproximates. Use Arm SVE EOR in the else
7668	// branch as needed instead.
7669	Sa1 = getCleanShadow(OrigTy: getShadowTy(OrigTy: I.getType()));
7670	} else {
7671	// Sa = select Sb, [ (c^d) \| Sc \| Sd ], [ b ? Sc : Sd ]
7672	// If Sb (condition is poisoned), look for bits in c and d that are equal
7673	// and both unpoisoned.
7674	// If !Sb (condition is unpoisoned), simply pick one of Sc and Sd.
7675
7676	// Cast arguments to shadow-compatible type.
7677	C = CreateAppToShadowCast(IRB, V: C);
7678	D = CreateAppToShadowCast(IRB, V: D);
7679
7680	// Result shadow if condition shadow is 1.
7681	Sa1 = IRB.CreateOr(Ops: {IRB.CreateXor(LHS: C, RHS: D), Sc, Sd});
7682	}
7683	Value *Sa = IRB.CreateSelect(C: Sb, True: Sa1, False: Sa0, Name: "_msprop_select");
7684	setShadow(V: &I, SV: Sa);
7685	if (MS.TrackOrigins) {
7686	// Origins are always i32, so any vector conditions must be flattened.
7687	// FIXME: consider tracking vector origins for app vectors?
7688	if (B->getType()->isVectorTy()) {
7689	B = convertToBool(V: B, IRB);
7690	Sb = convertToBool(V: Sb, IRB);
7691	}
7692	// a = select b, c, d
7693	// Oa = Sb ? Ob : (b ? Oc : Od)
7694	setOrigin(V: &I, Origin: IRB.CreateSelect(C: Sb, True: Ob, False: IRB.CreateSelect(C: B, True: Oc, False: Od)));
7695	}
7696	}
7697
7698	void visitLandingPadInst(LandingPadInst &I) {
7699	// Do nothing.
7700	// See https://github.com/google/sanitizers/issues/504
7701	setShadow(V: &I, SV: getCleanShadow(V: &I));
7702	setOrigin(V: &I, Origin: getCleanOrigin());
7703	}
7704
7705	void visitCatchSwitchInst(CatchSwitchInst &I) {
7706	setShadow(V: &I, SV: getCleanShadow(V: &I));
7707	setOrigin(V: &I, Origin: getCleanOrigin());
7708	}
7709
7710	void visitFuncletPadInst(FuncletPadInst &I) {
7711	setShadow(V: &I, SV: getCleanShadow(V: &I));
7712	setOrigin(V: &I, Origin: getCleanOrigin());
7713	}
7714
7715	void visitGetElementPtrInst(GetElementPtrInst &I) { handleShadowOr(I); }
7716
7717	void visitExtractValueInst(ExtractValueInst &I) {
7718	IRBuilder<> IRB(&I);
7719	Value *Agg = I.getAggregateOperand();
7720	LLVM_DEBUG(dbgs() << "ExtractValue: " << I << "\n");
7721	Value *AggShadow = getShadow(V: Agg);
7722	LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
7723	Value *ResShadow = IRB.CreateExtractValue(Agg: AggShadow, Idxs: I.getIndices());
7724	LLVM_DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n");
7725	setShadow(V: &I, SV: ResShadow);
7726	setOriginForNaryOp(I);
7727	}
7728
7729	void visitInsertValueInst(InsertValueInst &I) {
7730	IRBuilder<> IRB(&I);
7731	LLVM_DEBUG(dbgs() << "InsertValue: " << I << "\n");
7732	Value *AggShadow = getShadow(V: I.getAggregateOperand());
7733	Value *InsShadow = getShadow(V: I.getInsertedValueOperand());
7734	LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
7735	LLVM_DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n");
7736	Value *Res = IRB.CreateInsertValue(Agg: AggShadow, Val: InsShadow, Idxs: I.getIndices());
7737	LLVM_DEBUG(dbgs() << " Res: " << *Res << "\n");
7738	setShadow(V: &I, SV: Res);
7739	setOriginForNaryOp(I);
7740	}
7741
7742	void dumpInst(Instruction &I) {
7743	// Instruction name only
7744	// For intrinsics, the full/overloaded name is used
7745	//
7746	// e.g., "call llvm.aarch64.neon.uqsub.v16i8"
7747	if (CallInst *CI = dyn_cast<CallInst>(Val: &I)) {
7748	errs() << "ZZZ call " << CI->getCalledFunction()->getName() << "\n";
7749	} else {
7750	errs() << "ZZZ " << I.getOpcodeName() << "\n";
7751	}
7752
7753	// Instruction prototype (including return type and parameter types)
7754	// For intrinsics, we use the base/non-overloaded name
7755	//
7756	// e.g., "call <16 x i8> @llvm.aarch64.neon.uqsub(<16 x i8>, <16 x i8>)"
7757	unsigned NumOperands = I.getNumOperands();
7758	if (CallInst *CI = dyn_cast<CallInst>(Val: &I)) {
7759	errs() << "YYY call " << *I.getType() << " @";
7760
7761	if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI))
7762	errs() << Intrinsic::getBaseName(id: II->getIntrinsicID());
7763	else
7764	errs() << CI->getCalledFunction()->getName();
7765
7766	errs() << "(";
7767
7768	// The last operand of a CallInst is the function itself.
7769	NumOperands--;
7770	} else
7771	errs() << "YYY " << *I.getType() << " " << I.getOpcodeName() << "(";
7772
7773	for (size_t i = `0`; i < NumOperands; i++) {
7774	if (i > `0`)
7775	errs() << ", ";
7776
7777	errs() << *(I.getOperand(i)->getType());
7778	}
7779
7780	errs() << ")\n";
7781
7782	// Full instruction, including types and operand values
7783	// For intrinsics, the full/overloaded name is used
7784	//
7785	// e.g., "%vqsubq_v.i15 = call noundef <16 x i8>
7786	// @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %vext21.i,
7787	// <16 x i8> splat (i8 1)), !dbg !66"
7788	errs() << "QQQ " << I << "\n";
7789	}
7790
7791	void visitResumeInst(ResumeInst &I) {
7792	LLVM_DEBUG(dbgs() << "Resume: " << I << "\n");
7793	// Nothing to do here.
7794	}
7795
7796	void visitCleanupReturnInst(CleanupReturnInst &CRI) {
7797	LLVM_DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n");
7798	// Nothing to do here.
7799	}
7800
7801	void visitCatchReturnInst(CatchReturnInst &CRI) {
7802	LLVM_DEBUG(dbgs() << "CatchReturn: " << CRI << "\n");
7803	// Nothing to do here.
7804	}
7805
7806	void instrumentAsmArgument(Value Operand, Type ElemTy, Instruction &I,
7807	IRBuilder<> &IRB, const DataLayout &DL,
7808	bool isOutput) {
7809	// For each assembly argument, we check its value for being initialized.
7810	// If the argument is a pointer, we assume it points to a single element
7811	// of the corresponding type (or to a 8-byte word, if the type is unsized).
7812	// Each such pointer is instrumented with a call to the runtime library.
7813	Type *OpType = Operand->getType();
7814	// Check the operand value itself.
7815	insertCheckShadowOf(Val: Operand, OrigIns: &I);
7816	if (!OpType->isPointerTy() \|\| !isOutput) {
7817	assert(!isOutput);
7818	return;
7819	}
7820	if (!ElemTy->isSized())
7821	return;
7822	auto Size = DL.getTypeStoreSize(Ty: ElemTy);
7823	Value *SizeVal = IRB.CreateTypeSize(Ty: MS.IntptrTy, Size);
7824	if (MS.CompileKernel) {
7825	IRB.CreateCall(Callee: MS.MsanInstrumentAsmStoreFn, Args: {Operand, SizeVal});
7826	} else {
7827	// ElemTy, derived from elementtype(), does not encode the alignment of
7828	// the pointer. Conservatively assume that the shadow memory is unaligned.
7829	// When Size is large, avoid StoreInst as it would expand to many
7830	// instructions.
7831	auto [ShadowPtr, _] =
7832	getShadowOriginPtrUserspace(Addr: Operand, IRB, ShadowTy: IRB.getInt8Ty(), Alignment: Align (`1`));
7833	if (Size <= `32`)
7834	IRB.CreateAlignedStore(Val: getCleanShadow(OrigTy: ElemTy), Ptr: ShadowPtr, Align: Align (`1`));
7835	else
7836	IRB.CreateMemSet(Ptr: ShadowPtr, Val: ConstantInt::getNullValue(Ty: IRB.getInt8Ty()),
7837	Size: SizeVal, Align: Align (`1`));
7838	}
7839	}
7840
7841	/// Get the number of output arguments returned by pointers.
7842	int getNumOutputArgs(InlineAsm IA, CallBase CB) {
7843	int NumRetOutputs = `0`;
7844	int NumOutputs = `0`;
7845	Type *RetTy = cast<Value>(Val: CB)->getType();
7846	if (!RetTy->isVoidTy()) {
7847	// Register outputs are returned via the CallInst return value.
7848	auto *ST = dyn_cast<StructType>(Val: RetTy);
7849	if (ST)
7850	NumRetOutputs = ST->getNumElements();
7851	else
7852	NumRetOutputs = `1`;
7853	}
7854	InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
7855	for (const InlineAsm::ConstraintInfo &Info : Constraints) {
7856	switch (Info.Type) {
7857	case InlineAsm::isOutput:
7858	NumOutputs++;
7859	break;
7860	default:
7861	break;
7862	}
7863	}
7864	return NumOutputs - NumRetOutputs;
7865	}
7866
7867	void visitAsmInstruction(Instruction &I) {
7868	// Conservative inline assembly handling: check for poisoned shadow of
7869	// asm() arguments, then unpoison the result and all the memory locations
7870	// pointed to by those arguments.
7871	// An inline asm() statement in C++ contains lists of input and output
7872	// arguments used by the assembly code. These are mapped to operands of the
7873	// CallInst as follows:
7874	// - nR register outputs ("=r) are returned by value in a single structure
7875	// (SSA value of the CallInst);
7876	// - nO other outputs ("=m" and others) are returned by pointer as first
7877	// nO operands of the CallInst;
7878	// - nI inputs ("r", "m" and others) are passed to CallInst as the
7879	// remaining nI operands.
7880	// The total number of asm() arguments in the source is nR+nO+nI, and the
7881	// corresponding CallInst has nO+nI+1 operands (the last operand is the
7882	// function to be called).
7883	const DataLayout &DL = F.getDataLayout();
7884	CallBase *CB = cast<CallBase>(Val: &I);
7885	IRBuilder<> IRB(&I);
7886	InlineAsm *IA = cast<InlineAsm>(Val: CB->getCalledOperand());
7887	int OutputArgs = getNumOutputArgs(IA, CB);
7888	// The last operand of a CallInst is the function itself.
7889	int NumOperands = CB->getNumOperands() - `1`;
7890
7891	// Check input arguments. Doing so before unpoisoning output arguments, so
7892	// that we won't overwrite uninit values before checking them.
7893	for (int i = OutputArgs; i < NumOperands; i++) {
7894	Value *Operand = CB->getOperand(i_nocapture: i);
7895	instrumentAsmArgument(Operand, ElemTy: CB->getParamElementType(ArgNo: i), I, IRB, DL,
7896	/isOutput/ false);
7897	}
7898	// Unpoison output arguments. This must happen before the actual InlineAsm
7899	// call, so that the shadow for memory published in the asm() statement
7900	// remains valid.
7901	for (int i = `0`; i < OutputArgs; i++) {
7902	Value *Operand = CB->getOperand(i_nocapture: i);
7903	instrumentAsmArgument(Operand, ElemTy: CB->getParamElementType(ArgNo: i), I, IRB, DL,
7904	/isOutput/ true);
7905	}
7906
7907	setShadow(V: &I, SV: getCleanShadow(V: &I));
7908	setOrigin(V: &I, Origin: getCleanOrigin());
7909	}
7910
7911	void visitFreezeInst(FreezeInst &I) {
7912	// Freeze always returns a fully defined value.
7913	setShadow(V: &I, SV: getCleanShadow(V: &I));
7914	setOrigin(V: &I, Origin: getCleanOrigin());
7915	}
7916
7917	void visitInstruction(Instruction &I) {
7918	// Everything else: stop propagating and check for poisoned shadow.
7919	if (ClDumpStrictInstructions)
7920	dumpInst(I);
7921	LLVM_DEBUG(dbgs() << "DEFAULT: " << I << "\n");
7922	for (size_t i = `0`, n = I.getNumOperands(); i < n; i++) {
7923	Value *Operand = I.getOperand(i);
7924	if (Operand->getType()->isSized())
7925	insertCheckShadowOf(Val: Operand, OrigIns: &I);
7926	}
7927	setShadow(V: &I, SV: getCleanShadow(V: &I));
7928	setOrigin(V: &I, Origin: getCleanOrigin());
7929	}
7930	};
7931
7932	struct VarArgHelperBase : public VarArgHelper {
7933	Function &F;
7934	MemorySanitizer &MS;
7935	MemorySanitizerVisitor &MSV;
7936	SmallVector<CallInst *, `16`> VAStartInstrumentationList;
7937	const unsigned VAListTagSize;
7938
7939	VarArgHelperBase(Function &F, MemorySanitizer &MS,
7940	MemorySanitizerVisitor &MSV, unsigned VAListTagSize)
7941	: F(F), MS(MS), MSV(MSV), VAListTagSize(VAListTagSize) {}
7942
7943	Value getShadowAddrForVAArgument(IRBuilder<> &IRB, unsigned* ArgOffset) {
7944	Value *Base = IRB.CreatePointerCast(V: MS.VAArgTLS, DestTy: MS.IntptrTy);
7945	return IRB.CreateAdd(LHS: Base, RHS: ConstantInt::get(Ty: MS.IntptrTy, V: ArgOffset));
7946	}
7947
7948	/// Compute the shadow address for a given va_arg.
7949	Value getShadowPtrForVAArgument(IRBuilder<> &IRB, unsigned* ArgOffset) {
7950	return IRB.CreatePtrAdd(
7951	Ptr: MS.VAArgTLS, Offset: ConstantInt::get(Ty: MS.IntptrTy, V: ArgOffset), Name: "_msarg_va_s");
7952	}
7953
7954	/// Compute the shadow address for a given va_arg.
7955	Value getShadowPtrForVAArgument(IRBuilder<> &IRB, unsigned* ArgOffset,
7956	unsigned ArgSize) {
7957	// Make sure we don't overflow __msan_va_arg_tls.
7958	if (ArgOffset + ArgSize > kParamTLSSize)
7959	return nullptr;
7960	return getShadowPtrForVAArgument(IRB, ArgOffset);
7961	}
7962
7963	/// Compute the origin address for a given va_arg.
7964	Value getOriginPtrForVAArgument(IRBuilder<> &IRB, int* ArgOffset) {
7965	// getOriginPtrForVAArgument() is always called after
7966	// getShadowPtrForVAArgument(), so __msan_va_arg_origin_tls can never
7967	// overflow.
7968	return IRB.CreatePtrAdd(Ptr: MS.VAArgOriginTLS,
7969	Offset: ConstantInt::get(Ty: MS.IntptrTy, V: ArgOffset),
7970	Name: "_msarg_va_o");
7971	}
7972
7973	void CleanUnusedTLS(IRBuilder<> &IRB, Value *ShadowBase,
7974	unsigned BaseOffset) {
7975	// The tails of __msan_va_arg_tls is not large enough to fit full
7976	// value shadow, but it will be copied to backup anyway. Make it
7977	// clean.
7978	if (BaseOffset >= kParamTLSSize)
7979	return;
7980	Value *TailSize =
7981	ConstantInt::getSigned(Ty: IRB.getInt32Ty(), V: kParamTLSSize - BaseOffset);
7982	IRB.CreateMemSet(Ptr: ShadowBase, Val: ConstantInt::getNullValue(Ty: IRB.getInt8Ty()),
7983	Size: TailSize, Align: Align (`8`));
7984	}
7985
7986	void unpoisonVAListTagForInst(IntrinsicInst &I) {
7987	IRBuilder<> IRB(&I);
7988	Value *VAListTag = I.getArgOperand(i: `0`);
7989	const Align Alignment = Align (`8`);
7990	auto [ShadowPtr, OriginPtr] = MSV.getShadowOriginPtr(
7991	Addr: VAListTag, IRB, ShadowTy: IRB.getInt8Ty(), Alignment, /isStore/ true);
7992	// Unpoison the whole __va_list_tag.
7993	IRB.CreateMemSet(Ptr: ShadowPtr, Val: Constant::getNullValue(Ty: IRB.getInt8Ty()),
7994	Size: VAListTagSize, Align: Alignment, isVolatile: false);
7995	}
7996
7997	void visitVAStartInst(VAStartInst &I) override {
7998	if (F.getCallingConv() == CallingConv::Win64)
7999	return;
8000	VAStartInstrumentationList.push_back(Elt: &I);
8001	unpoisonVAListTagForInst(I);
8002	}
8003
8004	void visitVACopyInst(VACopyInst &I) override {
8005	if (F.getCallingConv() == CallingConv::Win64)
8006	return;
8007	unpoisonVAListTagForInst(I);
8008	}
8009	};
8010
8011	/// AMD64-specific implementation of VarArgHelper.
8012	struct VarArgAMD64Helper : public VarArgHelperBase {
8013	// An unfortunate workaround for asymmetric lowering of va_arg stuff.
8014	// See a comment in visitCallBase for more details.
8015	static const unsigned AMD64GpEndOffset = `48`; // AMD64 ABI Draft 0.99.6 p3.5.7
8016	static const unsigned AMD64FpEndOffsetSSE = `176`;
8017	// If SSE is disabled, fp_offset in va_list is zero.
8018	static const unsigned AMD64FpEndOffsetNoSSE = AMD64GpEndOffset;
8019
8020	unsigned AMD64FpEndOffset;
8021	AllocaInst VAArgTLSCopy = nullptr*;
8022	AllocaInst VAArgTLSOriginCopy = nullptr*;
8023	Value VAArgOverflowSize = nullptr*;
8024
8025	enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
8026
8027	VarArgAMD64Helper(Function &F, MemorySanitizer &MS,
8028	MemorySanitizerVisitor &MSV)
8029	: VarArgHelperBase (F, MS, MSV, /VAListTagSize=/`24`) {
8030	AMD64FpEndOffset = AMD64FpEndOffsetSSE;
8031	for (const auto &Attr : F.getAttributes().getFnAttrs()) {
8032	if (Attr.isStringAttribute() &&
8033	(Attr.getKindAsString() == "target-features")) {
8034	if (Attr.getValueAsString().contains(Other: "-sse"))
8035	AMD64FpEndOffset = AMD64FpEndOffsetNoSSE;
8036	break;
8037	}
8038	}
8039	}
8040
8041	ArgKind classifyArgument(Value *arg) {
8042	// A very rough approximation of X86_64 argument classification rules.
8043	Type *T = arg->getType();
8044	if (T->isX86_FP80Ty())
8045	return AK_Memory;
8046	if (T->isFPOrFPVectorTy())
8047	return AK_FloatingPoint;
8048	if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= `64`)
8049	return AK_GeneralPurpose;
8050	if (T->isPointerTy())
8051	return AK_GeneralPurpose;
8052	return AK_Memory;
8053	}
8054
8055	// For VarArg functions, store the argument shadow in an ABI-specific format
8056	// that corresponds to va_list layout.
8057	// We do this because Clang lowers va_arg in the frontend, and this pass
8058	// only sees the low level code that deals with va_list internals.
8059	// A much easier alternative (provided that Clang emits va_arg instructions)
8060	// would have been to associate each live instance of va_list with a copy of
8061	// MSanParamTLS, and extract shadow on va_arg() call in the argument list
8062	// order.
8063	void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
8064	unsigned GpOffset = `0`;
8065	unsigned FpOffset = AMD64GpEndOffset;
8066	unsigned OverflowOffset = AMD64FpEndOffset;
8067	const DataLayout &DL = F.getDataLayout();
8068
8069	for (const auto &[ArgNo, A] : llvm::enumerate(First: CB.args())) {
8070	bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
8071	bool IsByVal = CB.paramHasAttr(ArgNo, Kind: Attribute::ByVal);
8072	if (IsByVal) {
8073	// ByVal arguments always go to the overflow area.
8074	// Fixed arguments passed through the overflow area will be stepped
8075	// over by va_start, so don't count them towards the offset.
8076	if (IsFixed)
8077	continue;
8078	assert(A->getType()->isPointerTy());
8079	Type *RealTy = CB.getParamByValType(ArgNo);
8080	uint64_t ArgSize = DL.getTypeAllocSize(Ty: RealTy);
8081	uint64_t AlignedSize = alignTo(Value: ArgSize, Align: `8`);
8082	unsigned BaseOffset = OverflowOffset;
8083	Value *ShadowBase = getShadowPtrForVAArgument(IRB, ArgOffset: OverflowOffset);
8084	Value OriginBase = nullptr*;
8085	if (MS.TrackOrigins)
8086	OriginBase = getOriginPtrForVAArgument(IRB, ArgOffset: OverflowOffset);
8087	OverflowOffset += AlignedSize;
8088
8089	if (OverflowOffset > kParamTLSSize) {
8090	CleanUnusedTLS(IRB, ShadowBase, BaseOffset);
8091	continue; // We have no space to copy shadow there.
8092	}
8093
8094	Value ShadowPtr, OriginPtr;
8095	std::tie(args&: ShadowPtr, args&: OriginPtr) =
8096	MSV.getShadowOriginPtr(Addr: A, IRB, ShadowTy: IRB.getInt8Ty(), Alignment: kShadowTLSAlignment,
8097	/isStore/ false);
8098	IRB.CreateMemCpy(Dst: ShadowBase, DstAlign: kShadowTLSAlignment, Src: ShadowPtr,
8099	SrcAlign: kShadowTLSAlignment, Size: ArgSize);
8100	if (MS.TrackOrigins)
8101	IRB.CreateMemCpy(Dst: OriginBase, DstAlign: kShadowTLSAlignment, Src: OriginPtr,
8102	SrcAlign: kShadowTLSAlignment, Size: ArgSize);
8103	} else {
8104	ArgKind AK = classifyArgument(arg: A);
8105	if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset)
8106	AK = AK_Memory;
8107	if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset)
8108	AK = AK_Memory;
8109	Value ShadowBase, OriginBase = nullptr;
8110	switch (AK) {
8111	case AK_GeneralPurpose:
8112	ShadowBase = getShadowPtrForVAArgument(IRB, ArgOffset: GpOffset);
8113	if (MS.TrackOrigins)
8114	OriginBase = getOriginPtrForVAArgument(IRB, ArgOffset: GpOffset);
8115	GpOffset += `8`;
8116	assert(GpOffset <= kParamTLSSize);
8117	break;
8118	case AK_FloatingPoint:
8119	ShadowBase = getShadowPtrForVAArgument(IRB, ArgOffset: FpOffset);
8120	if (MS.TrackOrigins)
8121	OriginBase = getOriginPtrForVAArgument(IRB, ArgOffset: FpOffset);
8122	FpOffset += `16`;
8123	assert(FpOffset <= kParamTLSSize);
8124	break;
8125	case AK_Memory:
8126	if (IsFixed)
8127	continue;
8128	uint64_t ArgSize = DL.getTypeAllocSize(Ty: A ->getType());
8129	uint64_t AlignedSize = alignTo(Value: ArgSize, Align: `8`);
8130	unsigned BaseOffset = OverflowOffset;
8131	ShadowBase = getShadowPtrForVAArgument(IRB, ArgOffset: OverflowOffset);
8132	if (MS.TrackOrigins) {
8133	OriginBase = getOriginPtrForVAArgument(IRB, ArgOffset: OverflowOffset);
8134	}
8135	OverflowOffset += AlignedSize;
8136	if (OverflowOffset > kParamTLSSize) {
8137	// We have no space to copy shadow there.
8138	CleanUnusedTLS(IRB, ShadowBase, BaseOffset);
8139	continue;
8140	}
8141	}
8142	// Take fixed arguments into account for GpOffset and FpOffset,
8143	// but don't actually store shadows for them.
8144	// TODO(glider): don't call getPtrForVAArgument() for them.*
8145	if (IsFixed)
8146	continue;
8147	Value *Shadow = MSV.getShadow(V: A);
8148	IRB.CreateAlignedStore(Val: Shadow, Ptr: ShadowBase, Align: kShadowTLSAlignment);
8149	if (MS.TrackOrigins) {
8150	Value *Origin = MSV.getOrigin(V: A);
8151	TypeSize StoreSize = DL.getTypeStoreSize(Ty: Shadow->getType());
8152	MSV.paintOrigin(IRB, Origin, OriginPtr: OriginBase, TS: StoreSize,
8153	Alignment: std::max(a: kShadowTLSAlignment, b: kMinOriginAlignment));
8154	}
8155	}
8156	}
8157	Constant *OverflowSize =
8158	ConstantInt::get(Ty: IRB.getInt64Ty(), V: OverflowOffset - AMD64FpEndOffset);
8159	IRB.CreateStore(Val: OverflowSize, Ptr: MS.VAArgOverflowSizeTLS);
8160	}
8161
8162	void finalizeInstrumentation() override {
8163	assert(!VAArgOverflowSize && !VAArgTLSCopy &&
8164	"finalizeInstrumentation called twice");
8165	if (!VAStartInstrumentationList.empty()) {
8166	// If there is a va_start in this function, make a backup copy of
8167	// va_arg_tls somewhere in the function entry block.
8168	IRBuilder<> IRB(MSV.FnPrologueEnd);
8169	VAArgOverflowSize =
8170	IRB.CreateLoad(Ty: IRB.getInt64Ty(), Ptr: MS.VAArgOverflowSizeTLS);
8171	Value *CopySize = IRB.CreateAdd(
8172	LHS: ConstantInt::get(Ty: MS.IntptrTy, V: AMD64FpEndOffset), RHS: VAArgOverflowSize);
8173	VAArgTLSCopy = IRB.CreateAlloca(Ty: Type::getInt8Ty(C&: *MS.C), ArraySize: CopySize);
8174	VAArgTLSCopy->setAlignment(kShadowTLSAlignment);
8175	IRB.CreateMemSet(Ptr: VAArgTLSCopy, Val: Constant::getNullValue(Ty: IRB.getInt8Ty()),
8176	Size: CopySize, Align: kShadowTLSAlignment, isVolatile: false);
8177
8178	Value *SrcSize = IRB.CreateBinaryIntrinsic(
8179	ID: Intrinsic::umin, LHS: CopySize,
8180	RHS: ConstantInt::get(Ty: MS.IntptrTy, V: kParamTLSSize));
8181	IRB.CreateMemCpy(Dst: VAArgTLSCopy, DstAlign: kShadowTLSAlignment, Src: MS.VAArgTLS,
8182	SrcAlign: kShadowTLSAlignment, Size: SrcSize);
8183	if (MS.TrackOrigins) {
8184	VAArgTLSOriginCopy = IRB.CreateAlloca(Ty: Type::getInt8Ty(C&: *MS.C), ArraySize: CopySize);
8185	VAArgTLSOriginCopy->setAlignment(kShadowTLSAlignment);
8186	IRB.CreateMemCpy(Dst: VAArgTLSOriginCopy, DstAlign: kShadowTLSAlignment,
8187	Src: MS.VAArgOriginTLS, SrcAlign: kShadowTLSAlignment, Size: SrcSize);
8188	}
8189	}
8190
8191	// Instrument va_start.
8192	// Copy va_list shadow from the backup copy of the TLS contents.
8193	for (CallInst *OrigInst : VAStartInstrumentationList) {
8194	NextNodeIRBuilder IRB(OrigInst);
8195	Value *VAListTag = OrigInst->getArgOperand(i: `0`);
8196
8197	Value *RegSaveAreaPtrPtr =
8198	IRB.CreatePtrAdd(Ptr: VAListTag, Offset: ConstantInt::get(Ty: MS.IntptrTy, V: `16`));
8199	Value *RegSaveAreaPtr = IRB.CreateLoad(Ty: MS.PtrTy, Ptr: RegSaveAreaPtrPtr);
8200	Value RegSaveAreaShadowPtr, RegSaveAreaOriginPtr;
8201	const Align Alignment = Align (`16`);
8202	std::tie(args&: RegSaveAreaShadowPtr, args&: RegSaveAreaOriginPtr) =
8203	MSV.getShadowOriginPtr(Addr: RegSaveAreaPtr, IRB, ShadowTy: IRB.getInt8Ty(),
8204	Alignment, /isStore/ true);
8205	IRB.CreateMemCpy(Dst: RegSaveAreaShadowPtr, DstAlign: Alignment, Src: VAArgTLSCopy, SrcAlign: Alignment,
8206	Size: AMD64FpEndOffset);
8207	if (MS.TrackOrigins)
8208	IRB.CreateMemCpy(Dst: RegSaveAreaOriginPtr, DstAlign: Alignment, Src: VAArgTLSOriginCopy,
8209	SrcAlign: Alignment, Size: AMD64FpEndOffset);
8210	Value *OverflowArgAreaPtrPtr =
8211	IRB.CreatePtrAdd(Ptr: VAListTag, Offset: ConstantInt::get(Ty: MS.IntptrTy, V: `8`));
8212	Value *OverflowArgAreaPtr =
8213	IRB.CreateLoad(Ty: MS.PtrTy, Ptr: OverflowArgAreaPtrPtr);
8214	Value OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr;
8215	std::tie(args&: OverflowArgAreaShadowPtr, args&: OverflowArgAreaOriginPtr) =
8216	MSV.getShadowOriginPtr(Addr: OverflowArgAreaPtr, IRB, ShadowTy: IRB.getInt8Ty(),
8217	Alignment, /isStore/ true);
8218	Value *SrcPtr = IRB.CreateConstGEP1_32(Ty: IRB.getInt8Ty(), Ptr: VAArgTLSCopy,
8219	Idx0: AMD64FpEndOffset);
8220	IRB.CreateMemCpy(Dst: OverflowArgAreaShadowPtr, DstAlign: Alignment, Src: SrcPtr, SrcAlign: Alignment,
8221	Size: VAArgOverflowSize);
8222	if (MS.TrackOrigins) {
8223	SrcPtr = IRB.CreateConstGEP1_32(Ty: IRB.getInt8Ty(), Ptr: VAArgTLSOriginCopy,
8224	Idx0: AMD64FpEndOffset);
8225	IRB.CreateMemCpy(Dst: OverflowArgAreaOriginPtr, DstAlign: Alignment, Src: SrcPtr, SrcAlign: Alignment,
8226	Size: VAArgOverflowSize);
8227	}
8228	}
8229	}
8230	};
8231
8232	/// AArch64-specific implementation of VarArgHelper.
8233	struct VarArgAArch64Helper : public VarArgHelperBase {
8234	static const unsigned kAArch64GrArgSize = `64`;
8235	static const unsigned kAArch64VrArgSize = `128`;
8236
8237	static const unsigned AArch64GrBegOffset = `0`;
8238	static const unsigned AArch64GrEndOffset = kAArch64GrArgSize;
8239	// Make VR space aligned to 16 bytes.
8240	static const unsigned AArch64VrBegOffset = AArch64GrEndOffset;
8241	static const unsigned AArch64VrEndOffset =
8242	AArch64VrBegOffset + kAArch64VrArgSize;
8243	static const unsigned AArch64VAEndOffset = AArch64VrEndOffset;
8244
8245	AllocaInst VAArgTLSCopy = nullptr*;
8246	Value VAArgOverflowSize = nullptr*;
8247
8248	enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
8249
8250	VarArgAArch64Helper(Function &F, MemorySanitizer &MS,
8251	MemorySanitizerVisitor &MSV)
8252	: VarArgHelperBase (F, MS, MSV, /VAListTagSize=/`32`) {}
8253
8254	// A very rough approximation of aarch64 argument classification rules.
8255	std::pair<ArgKind, uint64_t> classifyArgument(Type *T) {
8256	if (T->isIntOrPtrTy() && T->getPrimitiveSizeInBits() <= `64`)
8257	return {AK_GeneralPurpose, `1`};
8258	if (T->isFloatingPointTy() && T->getPrimitiveSizeInBits() <= `128`)
8259	return {AK_FloatingPoint, `1`};
8260
8261	if (T->isArrayTy()) {
8262	auto R = classifyArgument(T: T->getArrayElementType());
8263	R.second *= T->getScalarType()->getArrayNumElements();
8264	return R;
8265	}
8266
8267	if (const FixedVectorType *FV = dyn_cast<FixedVectorType>(Val: T)) {
8268	auto R = classifyArgument(T: FV->getScalarType());
8269	R.second *= FV->getNumElements();
8270	return R;
8271	}
8272
8273	LLVM_DEBUG(errs() << "Unknown vararg type: " << *T << "\n");
8274	return {AK_Memory, `0`};
8275	}
8276
8277	// The instrumentation stores the argument shadow in a non ABI-specific
8278	// format because it does not know which argument is named (since Clang,
8279	// like x86_64 case, lowers the va_args in the frontend and this pass only
8280	// sees the low level code that deals with va_list internals).
8281	// The first seven GR registers are saved in the first 56 bytes of the
8282	// va_arg tls arra, followed by the first 8 FP/SIMD registers, and then
8283	// the remaining arguments.
8284	// Using constant offset within the va_arg TLS array allows fast copy
8285	// in the finalize instrumentation.
8286	void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
8287	unsigned GrOffset = AArch64GrBegOffset;
8288	unsigned VrOffset = AArch64VrBegOffset;
8289	unsigned OverflowOffset = AArch64VAEndOffset;
8290
8291	const DataLayout &DL = F.getDataLayout();
8292	for (const auto &[ArgNo, A] : llvm::enumerate(First: CB.args())) {
8293	bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
8294	auto [AK, RegNum] = classifyArgument(T: A ->getType());
8295	if (AK == AK_GeneralPurpose &&
8296	(GrOffset + RegNum * `8`) > AArch64GrEndOffset)
8297	AK = AK_Memory;
8298	if (AK == AK_FloatingPoint &&
8299	(VrOffset + RegNum * `16`) > AArch64VrEndOffset)
8300	AK = AK_Memory;
8301	Value *Base;
8302	switch (AK) {
8303	case AK_GeneralPurpose:
8304	Base = getShadowPtrForVAArgument(IRB, ArgOffset: GrOffset);
8305	GrOffset += `8` * RegNum;
8306	break;
8307	case AK_FloatingPoint:
8308	Base = getShadowPtrForVAArgument(IRB, ArgOffset: VrOffset);
8309	VrOffset += `16` * RegNum;
8310	break;
8311	case AK_Memory:
8312	// Don't count fixed arguments in the overflow area - va_start will
8313	// skip right over them.
8314	if (IsFixed)
8315	continue;
8316	uint64_t ArgSize = DL.getTypeAllocSize(Ty: A ->getType());
8317	uint64_t AlignedSize = alignTo(Value: ArgSize, Align: `8`);
8318	unsigned BaseOffset = OverflowOffset;
8319	Base = getShadowPtrForVAArgument(IRB, ArgOffset: BaseOffset);
8320	OverflowOffset += AlignedSize;
8321	if (OverflowOffset > kParamTLSSize) {
8322	// We have no space to copy shadow there.
8323	CleanUnusedTLS(IRB, ShadowBase: Base, BaseOffset);
8324	continue;
8325	}
8326	break;
8327	}
8328	// Count Gp/Vr fixed arguments to their respective offsets, but don't
8329	// bother to actually store a shadow.
8330	if (IsFixed)
8331	continue;
8332	IRB.CreateAlignedStore(Val: MSV.getShadow(V: A), Ptr: Base, Align: kShadowTLSAlignment);
8333	}
8334	Constant *OverflowSize =
8335	ConstantInt::get(Ty: IRB.getInt64Ty(), V: OverflowOffset - AArch64VAEndOffset);
8336	IRB.CreateStore(Val: OverflowSize, Ptr: MS.VAArgOverflowSizeTLS);
8337	}
8338
8339	// Retrieve a va_list field of 'void' size.*
8340	Value getVAField64(IRBuilder<> &IRB, Value VAListTag, int offset) {
8341	Value *SaveAreaPtrPtr =
8342	IRB.CreatePtrAdd(Ptr: VAListTag, Offset: ConstantInt::get(Ty: MS.IntptrTy, V: offset));
8343	return IRB.CreateLoad(Ty: Type::getInt64Ty(C&: *MS.C), Ptr: SaveAreaPtrPtr);
8344	}
8345
8346	// Retrieve a va_list field of 'int' size.
8347	Value getVAField32(IRBuilder<> &IRB, Value VAListTag, int offset) {
8348	Value *SaveAreaPtr =
8349	IRB.CreatePtrAdd(Ptr: VAListTag, Offset: ConstantInt::get(Ty: MS.IntptrTy, V: offset));
8350	Value *SaveArea32 = IRB.CreateLoad(Ty: IRB.getInt32Ty(), Ptr: SaveAreaPtr);
8351	return IRB.CreateSExt(V: SaveArea32, DestTy: MS.IntptrTy);
8352	}
8353
8354	void finalizeInstrumentation() override {
8355	assert(!VAArgOverflowSize && !VAArgTLSCopy &&
8356	"finalizeInstrumentation called twice");
8357	if (!VAStartInstrumentationList.empty()) {
8358	// If there is a va_start in this function, make a backup copy of
8359	// va_arg_tls somewhere in the function entry block.
8360	IRBuilder<> IRB(MSV.FnPrologueEnd);
8361	VAArgOverflowSize =
8362	IRB.CreateLoad(Ty: IRB.getInt64Ty(), Ptr: MS.VAArgOverflowSizeTLS);
8363	Value *CopySize = IRB.CreateAdd(
8364	LHS: ConstantInt::get(Ty: MS.IntptrTy, V: AArch64VAEndOffset), RHS: VAArgOverflowSize);
8365	VAArgTLSCopy = IRB.CreateAlloca(Ty: Type::getInt8Ty(C&: *MS.C), ArraySize: CopySize);
8366	VAArgTLSCopy->setAlignment(kShadowTLSAlignment);
8367	IRB.CreateMemSet(Ptr: VAArgTLSCopy, Val: Constant::getNullValue(Ty: IRB.getInt8Ty()),
8368	Size: CopySize, Align: kShadowTLSAlignment, isVolatile: false);
8369
8370	Value *SrcSize = IRB.CreateBinaryIntrinsic(
8371	ID: Intrinsic::umin, LHS: CopySize,
8372	RHS: ConstantInt::get(Ty: MS.IntptrTy, V: kParamTLSSize));
8373	IRB.CreateMemCpy(Dst: VAArgTLSCopy, DstAlign: kShadowTLSAlignment, Src: MS.VAArgTLS,
8374	SrcAlign: kShadowTLSAlignment, Size: SrcSize);
8375	}
8376
8377	Value *GrArgSize = ConstantInt::get(Ty: MS.IntptrTy, V: kAArch64GrArgSize);
8378	Value *VrArgSize = ConstantInt::get(Ty: MS.IntptrTy, V: kAArch64VrArgSize);
8379
8380	// Instrument va_start, copy va_list shadow from the backup copy of
8381	// the TLS contents.
8382	for (CallInst *OrigInst : VAStartInstrumentationList) {
8383	NextNodeIRBuilder IRB(OrigInst);
8384
8385	Value *VAListTag = OrigInst->getArgOperand(i: `0`);
8386
8387	// The variadic ABI for AArch64 creates two areas to save the incoming
8388	// argument registers (one for 64-bit general register xn-x7 and another
8389	// for 128-bit FP/SIMD vn-v7).
8390	// We need then to propagate the shadow arguments on both regions
8391	// 'va::__gr_top + va::__gr_offs' and 'va::__vr_top + va::__vr_offs'.
8392	// The remaining arguments are saved on shadow for 'va::stack'.
8393	// One caveat is it requires only to propagate the non-named arguments,
8394	// however on the call site instrumentation 'all' the arguments are
8395	// saved. So to copy the shadow values from the va_arg TLS array
8396	// we need to adjust the offset for both GR and VR fields based on
8397	// the __{gr,vr}_offs value (since they are stores based on incoming
8398	// named arguments).
8399	Type *RegSaveAreaPtrTy = IRB.getPtrTy();
8400
8401	// Read the stack pointer from the va_list.
8402	Value *StackSaveAreaPtr =
8403	IRB.CreateIntToPtr(V: getVAField64(IRB, VAListTag, offset: `0`), DestTy: RegSaveAreaPtrTy);
8404
8405	// Read both the __gr_top and __gr_off and add them up.
8406	Value *GrTopSaveAreaPtr = getVAField64(IRB, VAListTag, offset: `8`);
8407	Value *GrOffSaveArea = getVAField32(IRB, VAListTag, offset: `24`);
8408
8409	Value *GrRegSaveAreaPtr = IRB.CreateIntToPtr(
8410	V: IRB.CreateAdd(LHS: GrTopSaveAreaPtr, RHS: GrOffSaveArea), DestTy: RegSaveAreaPtrTy);
8411
8412	// Read both the __vr_top and __vr_off and add them up.
8413	Value *VrTopSaveAreaPtr = getVAField64(IRB, VAListTag, offset: `16`);
8414	Value *VrOffSaveArea = getVAField32(IRB, VAListTag, offset: `28`);
8415
8416	Value *VrRegSaveAreaPtr = IRB.CreateIntToPtr(
8417	V: IRB.CreateAdd(LHS: VrTopSaveAreaPtr, RHS: VrOffSaveArea), DestTy: RegSaveAreaPtrTy);
8418
8419	// It does not know how many named arguments is being used and, on the
8420	// callsite all the arguments were saved. Since __gr_off is defined as
8421	// '0 - ((8 - named_gr) 8)', the idea is to just propagate the variadic*
8422	// argument by ignoring the bytes of shadow from named arguments.
8423	Value *GrRegSaveAreaShadowPtrOff =
8424	IRB.CreateAdd(LHS: GrArgSize, RHS: GrOffSaveArea);
8425
8426	Value *GrRegSaveAreaShadowPtr =
8427	MSV.getShadowOriginPtr(Addr: GrRegSaveAreaPtr, IRB, ShadowTy: IRB.getInt8Ty(),
8428	Alignment: Align (`8`), /isStore/ true)
8429	.first;
8430
8431	Value *GrSrcPtr =
8432	IRB.CreateInBoundsPtrAdd(Ptr: VAArgTLSCopy, Offset: GrRegSaveAreaShadowPtrOff);
8433	Value *GrCopySize = IRB.CreateSub(LHS: GrArgSize, RHS: GrRegSaveAreaShadowPtrOff);
8434
8435	IRB.CreateMemCpy(Dst: GrRegSaveAreaShadowPtr, DstAlign: Align (`8`), Src: GrSrcPtr, SrcAlign: Align (`8`),
8436	Size: GrCopySize);
8437
8438	// Again, but for FP/SIMD values.
8439	Value *VrRegSaveAreaShadowPtrOff =
8440	IRB.CreateAdd(LHS: VrArgSize, RHS: VrOffSaveArea);
8441
8442	Value *VrRegSaveAreaShadowPtr =
8443	MSV.getShadowOriginPtr(Addr: VrRegSaveAreaPtr, IRB, ShadowTy: IRB.getInt8Ty(),
8444	Alignment: Align (`8`), /isStore/ true)
8445	.first;
8446
8447	Value *VrSrcPtr = IRB.CreateInBoundsPtrAdd(
8448	Ptr: IRB.CreateInBoundsPtrAdd(Ptr: VAArgTLSCopy,
8449	Offset: IRB.getInt32(C: AArch64VrBegOffset)),
8450	Offset: VrRegSaveAreaShadowPtrOff);
8451	Value *VrCopySize = IRB.CreateSub(LHS: VrArgSize, RHS: VrRegSaveAreaShadowPtrOff);
8452
8453	IRB.CreateMemCpy(Dst: VrRegSaveAreaShadowPtr, DstAlign: Align (`8`), Src: VrSrcPtr, SrcAlign: Align (`8`),
8454	Size: VrCopySize);
8455
8456	// And finally for remaining arguments.
8457	Value *StackSaveAreaShadowPtr =
8458	MSV.getShadowOriginPtr(Addr: StackSaveAreaPtr, IRB, ShadowTy: IRB.getInt8Ty(),
8459	Alignment: Align (`16`), /isStore/ true)
8460	.first;
8461
8462	Value *StackSrcPtr = IRB.CreateInBoundsPtrAdd(
8463	Ptr: VAArgTLSCopy, Offset: IRB.getInt32(C: AArch64VAEndOffset));
8464
8465	IRB.CreateMemCpy(Dst: StackSaveAreaShadowPtr, DstAlign: Align (`16`), Src: StackSrcPtr,
8466	SrcAlign: Align (`16`), Size: VAArgOverflowSize);
8467	}
8468	}
8469	};
8470
8471	/// PowerPC64-specific implementation of VarArgHelper.
8472	struct VarArgPowerPC64Helper : public VarArgHelperBase {
8473	AllocaInst VAArgTLSCopy = nullptr*;
8474	Value VAArgSize = nullptr*;
8475
8476	VarArgPowerPC64Helper(Function &F, MemorySanitizer &MS,
8477	MemorySanitizerVisitor &MSV)
8478	: VarArgHelperBase (F, MS, MSV, /VAListTagSize=/`8`) {}
8479
8480	void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
8481	// For PowerPC, we need to deal with alignment of stack arguments -
8482	// they are mostly aligned to 8 bytes, but vectors and i128 arrays
8483	// are aligned to 16 bytes, byvals can be aligned to 8 or 16 bytes,
8484	// For that reason, we compute current offset from stack pointer (which is
8485	// always properly aligned), and offset for the first vararg, then subtract
8486	// them.
8487	unsigned VAArgBase;
8488	Triple TargetTriple(F.getParent()->getTargetTriple());
8489	// Parameter save area starts at 48 bytes from frame pointer for ABIv1,
8490	// and 32 bytes for ABIv2. This is usually determined by target
8491	// endianness, but in theory could be overridden by function attribute.
8492	if (TargetTriple.isPPC64ELFv2ABI())
8493	VAArgBase = `32`;
8494	else
8495	VAArgBase = `48`;
8496	unsigned VAArgOffset = VAArgBase;
8497	const DataLayout &DL = F.getDataLayout();
8498	for (const auto &[ArgNo, A] : llvm::enumerate(First: CB.args())) {
8499	bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
8500	bool IsByVal = CB.paramHasAttr(ArgNo, Kind: Attribute::ByVal);
8501	if (IsByVal) {
8502	assert(A->getType()->isPointerTy());
8503	Type *RealTy = CB.getParamByValType(ArgNo);
8504	uint64_t ArgSize = DL.getTypeAllocSize(Ty: RealTy);
8505	Align ArgAlign = CB.getParamAlign(ArgNo).value_or(u: Align (`8`));
8506	if (ArgAlign < `8`)
8507	ArgAlign = Align (`8`);
8508	VAArgOffset = alignTo(Size: VAArgOffset, A: ArgAlign);
8509	if (!IsFixed) {
8510	Value *Base =
8511	getShadowPtrForVAArgument(IRB, ArgOffset: VAArgOffset - VAArgBase, ArgSize);
8512	if (Base) {
8513	Value AShadowPtr, AOriginPtr;
8514	std::tie(args&: AShadowPtr, args&: AOriginPtr) =
8515	MSV.getShadowOriginPtr(Addr: A, IRB, ShadowTy: IRB.getInt8Ty(),
8516	Alignment: kShadowTLSAlignment, /isStore/ false);
8517
8518	IRB.CreateMemCpy(Dst: Base, DstAlign: kShadowTLSAlignment, Src: AShadowPtr,
8519	SrcAlign: kShadowTLSAlignment, Size: ArgSize);
8520	}
8521	}
8522	VAArgOffset += alignTo(Size: ArgSize, A: Align (`8`));
8523	} else {
8524	Value *Base;
8525	uint64_t ArgSize = DL.getTypeAllocSize(Ty: A ->getType());
8526	Align ArgAlign = Align (`8`);
8527	if (A ->getType()->isArrayTy()) {
8528	// Arrays are aligned to element size, except for long double
8529	// arrays, which are aligned to 8 bytes.
8530	Type *ElementTy = A ->getType()->getArrayElementType();
8531	if (!ElementTy->isPPC_FP128Ty())
8532	ArgAlign = Align (DL.getTypeAllocSize(Ty: ElementTy));
8533	} else if (A ->getType()->isVectorTy()) {
8534	// Vectors are naturally aligned.
8535	ArgAlign = Align (ArgSize);
8536	}
8537	if (ArgAlign < `8`)
8538	ArgAlign = Align (`8`);
8539	VAArgOffset = alignTo(Size: VAArgOffset, A: ArgAlign);
8540	if (DL.isBigEndian()) {
8541	// Adjusting the shadow for argument with size < 8 to match the
8542	// placement of bits in big endian system
8543	if (ArgSize < `8`)
8544	VAArgOffset += (`8` - ArgSize);
8545	}
8546	if (!IsFixed) {
8547	Base =
8548	getShadowPtrForVAArgument(IRB, ArgOffset: VAArgOffset - VAArgBase, ArgSize);
8549	if (Base)
8550	IRB.CreateAlignedStore(Val: MSV.getShadow(V: A), Ptr: Base, Align: kShadowTLSAlignment);
8551	}
8552	VAArgOffset += ArgSize;
8553	VAArgOffset = alignTo(Size: VAArgOffset, A: Align (`8`));
8554	}
8555	if (IsFixed)
8556	VAArgBase = VAArgOffset;
8557	}
8558
8559	Constant *TotalVAArgSize =
8560	ConstantInt::get(Ty: MS.IntptrTy, V: VAArgOffset - VAArgBase);
8561	// Here using VAArgOverflowSizeTLS as VAArgSizeTLS to avoid creation of
8562	// a new class member i.e. it is the total size of all VarArgs.
8563	IRB.CreateStore(Val: TotalVAArgSize, Ptr: MS.VAArgOverflowSizeTLS);
8564	}
8565
8566	void finalizeInstrumentation() override {
8567	assert(!VAArgSize && !VAArgTLSCopy &&
8568	"finalizeInstrumentation called twice");
8569	IRBuilder<> IRB(MSV.FnPrologueEnd);
8570	VAArgSize = IRB.CreateLoad(Ty: IRB.getInt64Ty(), Ptr: MS.VAArgOverflowSizeTLS);
8571	Value *CopySize = VAArgSize;
8572
8573	if (!VAStartInstrumentationList.empty()) {
8574	// If there is a va_start in this function, make a backup copy of
8575	// va_arg_tls somewhere in the function entry block.
8576
8577	VAArgTLSCopy = IRB.CreateAlloca(Ty: Type::getInt8Ty(C&: *MS.C), ArraySize: CopySize);
8578	VAArgTLSCopy->setAlignment(kShadowTLSAlignment);
8579	IRB.CreateMemSet(Ptr: VAArgTLSCopy, Val: Constant::getNullValue(Ty: IRB.getInt8Ty()),
8580	Size: CopySize, Align: kShadowTLSAlignment, isVolatile: false);
8581
8582	Value *SrcSize = IRB.CreateBinaryIntrinsic(
8583	ID: Intrinsic::umin, LHS: CopySize,
8584	RHS: ConstantInt::get(Ty: IRB.getInt64Ty(), V: kParamTLSSize));
8585	IRB.CreateMemCpy(Dst: VAArgTLSCopy, DstAlign: kShadowTLSAlignment, Src: MS.VAArgTLS,
8586	SrcAlign: kShadowTLSAlignment, Size: SrcSize);
8587	}
8588
8589	// Instrument va_start.
8590	// Copy va_list shadow from the backup copy of the TLS contents.
8591	for (CallInst *OrigInst : VAStartInstrumentationList) {
8592	NextNodeIRBuilder IRB(OrigInst);
8593	Value *VAListTag = OrigInst->getArgOperand(i: `0`);
8594	Value *RegSaveAreaPtrPtr = IRB.CreatePtrToInt(V: VAListTag, DestTy: MS.IntptrTy);
8595
8596	RegSaveAreaPtrPtr = IRB.CreateIntToPtr(V: RegSaveAreaPtrPtr, DestTy: MS.PtrTy);
8597
8598	Value *RegSaveAreaPtr = IRB.CreateLoad(Ty: MS.PtrTy, Ptr: RegSaveAreaPtrPtr);
8599	Value RegSaveAreaShadowPtr, RegSaveAreaOriginPtr;
8600	const DataLayout &DL = F.getDataLayout();
8601	unsigned IntptrSize = DL.getTypeStoreSize(Ty: MS.IntptrTy);
8602	const Align Alignment = Align (IntptrSize);
8603	std::tie(args&: RegSaveAreaShadowPtr, args&: RegSaveAreaOriginPtr) =
8604	MSV.getShadowOriginPtr(Addr: RegSaveAreaPtr, IRB, ShadowTy: IRB.getInt8Ty(),
8605	Alignment, /isStore/ true);
8606	IRB.CreateMemCpy(Dst: RegSaveAreaShadowPtr, DstAlign: Alignment, Src: VAArgTLSCopy, SrcAlign: Alignment,
8607	Size: CopySize);
8608	}
8609	}
8610	};
8611
8612	/// PowerPC32-specific implementation of VarArgHelper.
8613	struct VarArgPowerPC32Helper : public VarArgHelperBase {
8614	AllocaInst VAArgTLSCopy = nullptr*;
8615	Value VAArgSize = nullptr*;
8616
8617	VarArgPowerPC32Helper(Function &F, MemorySanitizer &MS,
8618	MemorySanitizerVisitor &MSV)
8619	: VarArgHelperBase (F, MS, MSV, /VAListTagSize=/`12`) {}
8620
8621	void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
8622	unsigned VAArgBase;
8623	// Parameter save area is 8 bytes from frame pointer in PPC32
8624	VAArgBase = `8`;
8625	unsigned VAArgOffset = VAArgBase;
8626	const DataLayout &DL = F.getDataLayout();
8627	unsigned IntptrSize = DL.getTypeStoreSize(Ty: MS.IntptrTy);
8628	for (const auto &[ArgNo, A] : llvm::enumerate(First: CB.args())) {
8629	bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
8630	bool IsByVal = CB.paramHasAttr(ArgNo, Kind: Attribute::ByVal);
8631	if (IsByVal) {
8632	assert(A->getType()->isPointerTy());
8633	Type *RealTy = CB.getParamByValType(ArgNo);
8634	uint64_t ArgSize = DL.getTypeAllocSize(Ty: RealTy);
8635	Align ArgAlign = CB.getParamAlign(ArgNo).value_or(u: Align (IntptrSize));
8636	if (ArgAlign < IntptrSize)
8637	ArgAlign = Align (IntptrSize);
8638	VAArgOffset = alignTo(Size: VAArgOffset, A: ArgAlign);
8639	if (!IsFixed) {
8640	Value *Base =
8641	getShadowPtrForVAArgument(IRB, ArgOffset: VAArgOffset - VAArgBase, ArgSize);
8642	if (Base) {
8643	Value AShadowPtr, AOriginPtr;
8644	std::tie(args&: AShadowPtr, args&: AOriginPtr) =
8645	MSV.getShadowOriginPtr(Addr: A, IRB, ShadowTy: IRB.getInt8Ty(),
8646	Alignment: kShadowTLSAlignment, /isStore/ false);
8647
8648	IRB.CreateMemCpy(Dst: Base, DstAlign: kShadowTLSAlignment, Src: AShadowPtr,
8649	SrcAlign: kShadowTLSAlignment, Size: ArgSize);
8650	}
8651	}
8652	VAArgOffset += alignTo(Size: ArgSize, A: Align (IntptrSize));
8653	} else {
8654	Value *Base;
8655	Type *ArgTy = A ->getType();
8656
8657	// On PPC 32 floating point variable arguments are stored in separate
8658	// area: fp_save_area = reg_save_area + 48. We do not copy shaodow for*
8659	// them as they will be found when checking call arguments.
8660	if (!ArgTy->isFloatingPointTy()) {
8661	uint64_t ArgSize = DL.getTypeAllocSize(Ty: ArgTy);
8662	Align ArgAlign = Align (IntptrSize);
8663	if (ArgTy->isArrayTy()) {
8664	// Arrays are aligned to element size, except for long double
8665	// arrays, which are aligned to 8 bytes.
8666	Type *ElementTy = ArgTy->getArrayElementType();
8667	if (!ElementTy->isPPC_FP128Ty())
8668	ArgAlign = Align (DL.getTypeAllocSize(Ty: ElementTy));
8669	} else if (ArgTy->isVectorTy()) {
8670	// Vectors are naturally aligned.
8671	ArgAlign = Align (ArgSize);
8672	}
8673	if (ArgAlign < IntptrSize)
8674	ArgAlign = Align (IntptrSize);
8675	VAArgOffset = alignTo(Size: VAArgOffset, A: ArgAlign);
8676	if (DL.isBigEndian()) {
8677	// Adjusting the shadow for argument with size < IntptrSize to match
8678	// the placement of bits in big endian system
8679	if (ArgSize < IntptrSize)
8680	VAArgOffset += (IntptrSize - ArgSize);
8681	}
8682	if (!IsFixed) {
8683	Base = getShadowPtrForVAArgument(IRB, ArgOffset: VAArgOffset - VAArgBase,
8684	ArgSize);
8685	if (Base)
8686	IRB.CreateAlignedStore(Val: MSV.getShadow(V: A), Ptr: Base,
8687	Align: kShadowTLSAlignment);
8688	}
8689	VAArgOffset += ArgSize;
8690	VAArgOffset = alignTo(Size: VAArgOffset, A: Align (IntptrSize));
8691	}
8692	}
8693	}
8694
8695	Constant *TotalVAArgSize =
8696	ConstantInt::get(Ty: MS.IntptrTy, V: VAArgOffset - VAArgBase);
8697	// Here using VAArgOverflowSizeTLS as VAArgSizeTLS to avoid creation of
8698	// a new class member i.e. it is the total size of all VarArgs.
8699	IRB.CreateStore(Val: TotalVAArgSize, Ptr: MS.VAArgOverflowSizeTLS);
8700	}
8701
8702	void finalizeInstrumentation() override {
8703	assert(!VAArgSize && !VAArgTLSCopy &&
8704	"finalizeInstrumentation called twice");
8705	IRBuilder<> IRB(MSV.FnPrologueEnd);
8706	VAArgSize = IRB.CreateLoad(Ty: MS.IntptrTy, Ptr: MS.VAArgOverflowSizeTLS);
8707	Value *CopySize = VAArgSize;
8708
8709	if (!VAStartInstrumentationList.empty()) {
8710	// If there is a va_start in this function, make a backup copy of
8711	// va_arg_tls somewhere in the function entry block.
8712
8713	VAArgTLSCopy = IRB.CreateAlloca(Ty: Type::getInt8Ty(C&: *MS.C), ArraySize: CopySize);
8714	VAArgTLSCopy->setAlignment(kShadowTLSAlignment);
8715	IRB.CreateMemSet(Ptr: VAArgTLSCopy, Val: Constant::getNullValue(Ty: IRB.getInt8Ty()),
8716	Size: CopySize, Align: kShadowTLSAlignment, isVolatile: false);
8717
8718	Value *SrcSize = IRB.CreateBinaryIntrinsic(
8719	ID: Intrinsic::umin, LHS: CopySize,
8720	RHS: ConstantInt::get(Ty: MS.IntptrTy, V: kParamTLSSize));
8721	IRB.CreateMemCpy(Dst: VAArgTLSCopy, DstAlign: kShadowTLSAlignment, Src: MS.VAArgTLS,
8722	SrcAlign: kShadowTLSAlignment, Size: SrcSize);
8723	}
8724
8725	// Instrument va_start.
8726	// Copy va_list shadow from the backup copy of the TLS contents.
8727	for (CallInst *OrigInst : VAStartInstrumentationList) {
8728	NextNodeIRBuilder IRB(OrigInst);
8729	Value *VAListTag = OrigInst->getArgOperand(i: `0`);
8730	Value *RegSaveAreaPtrPtr = IRB.CreatePtrToInt(V: VAListTag, DestTy: MS.IntptrTy);
8731	Value *RegSaveAreaSize = CopySize;
8732
8733	// In PPC32 va_list_tag is a struct
8734	RegSaveAreaPtrPtr =
8735	IRB.CreateAdd(LHS: RegSaveAreaPtrPtr, RHS: ConstantInt::get(Ty: MS.IntptrTy, V: `8`));
8736
8737	// On PPC 32 reg_save_area can only hold 32 bytes of data
8738	RegSaveAreaSize = IRB.CreateBinaryIntrinsic(
8739	ID: Intrinsic::umin, LHS: CopySize, RHS: ConstantInt::get(Ty: MS.IntptrTy, V: `32`));
8740
8741	RegSaveAreaPtrPtr = IRB.CreateIntToPtr(V: RegSaveAreaPtrPtr, DestTy: MS.PtrTy);
8742	Value *RegSaveAreaPtr = IRB.CreateLoad(Ty: MS.PtrTy, Ptr: RegSaveAreaPtrPtr);
8743
8744	const DataLayout &DL = F.getDataLayout();
8745	unsigned IntptrSize = DL.getTypeStoreSize(Ty: MS.IntptrTy);
8746	const Align Alignment = Align (IntptrSize);
8747
8748	{ // Copy reg save area
8749	Value RegSaveAreaShadowPtr, RegSaveAreaOriginPtr;
8750	std::tie(args&: RegSaveAreaShadowPtr, args&: RegSaveAreaOriginPtr) =
8751	MSV.getShadowOriginPtr(Addr: RegSaveAreaPtr, IRB, ShadowTy: IRB.getInt8Ty(),
8752	Alignment, /isStore/ true);
8753	IRB.CreateMemCpy(Dst: RegSaveAreaShadowPtr, DstAlign: Alignment, Src: VAArgTLSCopy,
8754	SrcAlign: Alignment, Size: RegSaveAreaSize);
8755
8756	RegSaveAreaShadowPtr =
8757	IRB.CreatePtrToInt(V: RegSaveAreaShadowPtr, DestTy: MS.IntptrTy);
8758	Value *FPSaveArea = IRB.CreateAdd(LHS: RegSaveAreaShadowPtr,
8759	RHS: ConstantInt::get(Ty: MS.IntptrTy, V: `32`));
8760	FPSaveArea = IRB.CreateIntToPtr(V: FPSaveArea, DestTy: MS.PtrTy);
8761	// We fill fp shadow with zeroes as uninitialized fp args should have
8762	// been found during call base check
8763	IRB.CreateMemSet(Ptr: FPSaveArea, Val: ConstantInt::getNullValue(Ty: IRB.getInt8Ty()),
8764	Size: ConstantInt::get(Ty: MS.IntptrTy, V: `32`), Align: Alignment);
8765	}
8766
8767	{ // Copy overflow area
8768	// RegSaveAreaSize is min(CopySize, 32) -> no overflow can occur
8769	Value *OverflowAreaSize = IRB.CreateSub(LHS: CopySize, RHS: RegSaveAreaSize);
8770
8771	Value *OverflowAreaPtrPtr = IRB.CreatePtrToInt(V: VAListTag, DestTy: MS.IntptrTy);
8772	OverflowAreaPtrPtr =
8773	IRB.CreateAdd(LHS: OverflowAreaPtrPtr, RHS: ConstantInt::get(Ty: MS.IntptrTy, V: `4`));
8774	OverflowAreaPtrPtr = IRB.CreateIntToPtr(V: OverflowAreaPtrPtr, DestTy: MS.PtrTy);
8775
8776	Value *OverflowAreaPtr = IRB.CreateLoad(Ty: MS.PtrTy, Ptr: OverflowAreaPtrPtr);
8777
8778	Value OverflowAreaShadowPtr, OverflowAreaOriginPtr;
8779	std::tie(args&: OverflowAreaShadowPtr, args&: OverflowAreaOriginPtr) =
8780	MSV.getShadowOriginPtr(Addr: OverflowAreaPtr, IRB, ShadowTy: IRB.getInt8Ty(),
8781	Alignment, /isStore/ true);
8782
8783	Value *OverflowVAArgTLSCopyPtr =
8784	IRB.CreatePtrToInt(V: VAArgTLSCopy, DestTy: MS.IntptrTy);
8785	OverflowVAArgTLSCopyPtr =
8786	IRB.CreateAdd(LHS: OverflowVAArgTLSCopyPtr, RHS: RegSaveAreaSize);
8787
8788	OverflowVAArgTLSCopyPtr =
8789	IRB.CreateIntToPtr(V: OverflowVAArgTLSCopyPtr, DestTy: MS.PtrTy);
8790	IRB.CreateMemCpy(Dst: OverflowAreaShadowPtr, DstAlign: Alignment,
8791	Src: OverflowVAArgTLSCopyPtr, SrcAlign: Alignment, Size: OverflowAreaSize);
8792	}
8793	}
8794	}
8795	};
8796
8797	/// SystemZ-specific implementation of VarArgHelper.
8798	struct VarArgSystemZHelper : public VarArgHelperBase {
8799	static const unsigned SystemZGpOffset = `16`;
8800	static const unsigned SystemZGpEndOffset = `56`;
8801	static const unsigned SystemZFpOffset = `128`;
8802	static const unsigned SystemZFpEndOffset = `160`;
8803	static const unsigned SystemZMaxVrArgs = `8`;
8804	static const unsigned SystemZRegSaveAreaSize = `160`;
8805	static const unsigned SystemZOverflowOffset = `160`;
8806	static const unsigned SystemZVAListTagSize = `32`;
8807	static const unsigned SystemZOverflowArgAreaPtrOffset = `16`;
8808	static const unsigned SystemZRegSaveAreaPtrOffset = `24`;
8809
8810	bool IsSoftFloatABI;
8811	AllocaInst VAArgTLSCopy = nullptr*;
8812	AllocaInst VAArgTLSOriginCopy = nullptr*;
8813	Value VAArgOverflowSize = nullptr*;
8814
8815	enum class ArgKind {
8816	GeneralPurpose,
8817	FloatingPoint,
8818	Vector,
8819	Memory,
8820	Indirect,
8821	};
8822
8823	enum class ShadowExtension { None, Zero, Sign };
8824
8825	VarArgSystemZHelper(Function &F, MemorySanitizer &MS,
8826	MemorySanitizerVisitor &MSV)
8827	: VarArgHelperBase (F, MS, MSV, SystemZVAListTagSize),
8828	IsSoftFloatABI(F.getFnAttribute(Kind: "use-soft-float").getValueAsBool()) {}
8829
8830	ArgKind classifyArgument(Type *T) {
8831	// T is a SystemZABIInfo::classifyArgumentType() output, and there are
8832	// only a few possibilities of what it can be. In particular, enums, single
8833	// element structs and large types have already been taken care of.
8834
8835	// Some i128 and fp128 arguments are converted to pointers only in the
8836	// back end.
8837	if (T->isIntegerTy(Bitwidth: `128`) \|\| T->isFP128Ty())
8838	return ArgKind::Indirect;
8839	if (T->isFloatingPointTy())
8840	return IsSoftFloatABI ? ArgKind::GeneralPurpose : ArgKind::FloatingPoint;
8841	if (T->isIntegerTy() \|\| T->isPointerTy())
8842	return ArgKind::GeneralPurpose;
8843	if (T->isVectorTy())
8844	return ArgKind::Vector;
8845	return ArgKind::Memory;
8846	}
8847
8848	ShadowExtension getShadowExtension(const CallBase &CB, unsigned ArgNo) {
8849	// ABI says: "One of the simple integer types no more than 64 bits wide.
8850	// ... If such an argument is shorter than 64 bits, replace it by a full
8851	// 64-bit integer representing the same number, using sign or zero
8852	// extension". Shadow for an integer argument has the same type as the
8853	// argument itself, so it can be sign or zero extended as well.
8854	bool ZExt = CB.paramHasAttr(ArgNo, Kind: Attribute::ZExt);
8855	bool SExt = CB.paramHasAttr(ArgNo, Kind: Attribute::SExt);
8856	if (ZExt) {
8857	assert(!SExt);
8858	return ShadowExtension::Zero;
8859	}
8860	if (SExt) {
8861	assert(!ZExt);
8862	return ShadowExtension::Sign;
8863	}
8864	return ShadowExtension::None;
8865	}
8866
8867	void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
8868	unsigned GpOffset = SystemZGpOffset;
8869	unsigned FpOffset = SystemZFpOffset;
8870	unsigned VrIndex = `0`;
8871	unsigned OverflowOffset = SystemZOverflowOffset;
8872	const DataLayout &DL = F.getDataLayout();
8873	for (const auto &[ArgNo, A] : llvm::enumerate(First: CB.args())) {
8874	bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
8875	// SystemZABIInfo does not produce ByVal parameters.
8876	assert(!CB.paramHasAttr(ArgNo, Attribute::ByVal));
8877	Type *T = A ->getType();
8878	ArgKind AK = classifyArgument(T);
8879	if (AK == ArgKind::Indirect) {
8880	T = MS.PtrTy;
8881	AK = ArgKind::GeneralPurpose;
8882	}
8883	if (AK == ArgKind::GeneralPurpose && GpOffset >= SystemZGpEndOffset)
8884	AK = ArgKind::Memory;
8885	if (AK == ArgKind::FloatingPoint && FpOffset >= SystemZFpEndOffset)
8886	AK = ArgKind::Memory;
8887	if (AK == ArgKind::Vector && (VrIndex >= SystemZMaxVrArgs \|\| !IsFixed))
8888	AK = ArgKind::Memory;
8889	Value ShadowBase = nullptr*;
8890	Value OriginBase = nullptr*;
8891	ShadowExtension SE = ShadowExtension::None;
8892	switch (AK) {
8893	case ArgKind::GeneralPurpose: {
8894	// Always keep track of GpOffset, but store shadow only for varargs.
8895	uint64_t ArgSize = `8`;
8896	if (GpOffset + ArgSize <= kParamTLSSize) {
8897	if (!IsFixed) {
8898	SE = getShadowExtension(CB, ArgNo);
8899	uint64_t GapSize = `0`;
8900	if (SE == ShadowExtension::None) {
8901	uint64_t ArgAllocSize = DL.getTypeAllocSize(Ty: T);
8902	assert(ArgAllocSize <= ArgSize);
8903	GapSize = ArgSize - ArgAllocSize;
8904	}
8905	ShadowBase = getShadowAddrForVAArgument(IRB, ArgOffset: GpOffset + GapSize);
8906	if (MS.TrackOrigins)
8907	OriginBase = getOriginPtrForVAArgument(IRB, ArgOffset: GpOffset + GapSize);
8908	}
8909	GpOffset += ArgSize;
8910	} else {
8911	GpOffset = kParamTLSSize;
8912	}
8913	break;
8914	}
8915	case ArgKind::FloatingPoint: {
8916	// Always keep track of FpOffset, but store shadow only for varargs.
8917	uint64_t ArgSize = `8`;
8918	if (FpOffset + ArgSize <= kParamTLSSize) {
8919	if (!IsFixed) {
8920	// PoP says: "A short floating-point datum requires only the
8921	// left-most 32 bit positions of a floating-point register".
8922	// Therefore, in contrast to AK_GeneralPurpose and AK_Memory,
8923	// don't extend shadow and don't mind the gap.
8924	ShadowBase = getShadowAddrForVAArgument(IRB, ArgOffset: FpOffset);
8925	if (MS.TrackOrigins)
8926	OriginBase = getOriginPtrForVAArgument(IRB, ArgOffset: FpOffset);
8927	}
8928	FpOffset += ArgSize;
8929	} else {
8930	FpOffset = kParamTLSSize;
8931	}
8932	break;
8933	}
8934	case ArgKind::Vector: {
8935	// Keep track of VrIndex. No need to store shadow, since vector varargs
8936	// go through AK_Memory.
8937	assert(IsFixed);
8938	VrIndex++;
8939	break;
8940	}
8941	case ArgKind::Memory: {
8942	// Keep track of OverflowOffset and store shadow only for varargs.
8943	// Ignore fixed args, since we need to copy only the vararg portion of
8944	// the overflow area shadow.
8945	if (!IsFixed) {
8946	uint64_t ArgAllocSize = DL.getTypeAllocSize(Ty: T);
8947	uint64_t ArgSize = alignTo(Value: ArgAllocSize, Align: `8`);
8948	if (OverflowOffset + ArgSize <= kParamTLSSize) {
8949	SE = getShadowExtension(CB, ArgNo);
8950	uint64_t GapSize =
8951	SE == ShadowExtension::None ? ArgSize - ArgAllocSize : `0`;
8952	ShadowBase =
8953	getShadowAddrForVAArgument(IRB, ArgOffset: OverflowOffset + GapSize);
8954	if (MS.TrackOrigins)
8955	OriginBase =
8956	getOriginPtrForVAArgument(IRB, ArgOffset: OverflowOffset + GapSize);
8957	OverflowOffset += ArgSize;
8958	} else {
8959	OverflowOffset = kParamTLSSize;
8960	}
8961	}
8962	break;
8963	}
8964	case ArgKind::Indirect:
8965	llvm_unreachable("Indirect must be converted to GeneralPurpose");
8966	}
8967	if (ShadowBase == nullptr)
8968	continue;
8969	Value *Shadow = MSV.getShadow(V: A);
8970	if (SE != ShadowExtension::None)
8971	Shadow = MSV.CreateShadowCast(IRB, V: Shadow, dstTy: IRB.getInt64Ty(),
8972	/Signed/ SE == ShadowExtension::Sign);
8973	ShadowBase = IRB.CreateIntToPtr(V: ShadowBase, DestTy: MS.PtrTy, Name: "_msarg_va_s");
8974	IRB.CreateStore(Val: Shadow, Ptr: ShadowBase);
8975	if (MS.TrackOrigins) {
8976	Value *Origin = MSV.getOrigin(V: A);
8977	TypeSize StoreSize = DL.getTypeStoreSize(Ty: Shadow->getType());
8978	MSV.paintOrigin(IRB, Origin, OriginPtr: OriginBase, TS: StoreSize,
8979	Alignment: kMinOriginAlignment);
8980	}
8981	}
8982	Constant *OverflowSize = ConstantInt::get(
8983	Ty: IRB.getInt64Ty(), V: OverflowOffset - SystemZOverflowOffset);
8984	IRB.CreateStore(Val: OverflowSize, Ptr: MS.VAArgOverflowSizeTLS);
8985	}
8986
8987	void copyRegSaveArea(IRBuilder<> &IRB, Value *VAListTag) {
8988	Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
8989	V: IRB.CreateAdd(
8990	LHS: IRB.CreatePtrToInt(V: VAListTag, DestTy: MS.IntptrTy),
8991	RHS: ConstantInt::get(Ty: MS.IntptrTy, V: SystemZRegSaveAreaPtrOffset)),
8992	DestTy: MS.PtrTy);
8993	Value *RegSaveAreaPtr = IRB.CreateLoad(Ty: MS.PtrTy, Ptr: RegSaveAreaPtrPtr);
8994	Value RegSaveAreaShadowPtr, RegSaveAreaOriginPtr;
8995	const Align Alignment = Align (`8`);
8996	std::tie(args&: RegSaveAreaShadowPtr, args&: RegSaveAreaOriginPtr) =
8997	MSV.getShadowOriginPtr(Addr: RegSaveAreaPtr, IRB, ShadowTy: IRB.getInt8Ty(), Alignment,
8998	/isStore/ true);
8999	// TODO(iii): copy only fragments filled by visitCallBase()
9000	// TODO(iii): support packed-stack && !use-soft-float
9001	// For use-soft-float functions, it is enough to copy just the GPRs.
9002	unsigned RegSaveAreaSize =
9003	IsSoftFloatABI ? SystemZGpEndOffset : SystemZRegSaveAreaSize;
9004	IRB.CreateMemCpy(Dst: RegSaveAreaShadowPtr, DstAlign: Alignment, Src: VAArgTLSCopy, SrcAlign: Alignment,
9005	Size: RegSaveAreaSize);
9006	if (MS.TrackOrigins)
9007	IRB.CreateMemCpy(Dst: RegSaveAreaOriginPtr, DstAlign: Alignment, Src: VAArgTLSOriginCopy,
9008	SrcAlign: Alignment, Size: RegSaveAreaSize);
9009	}
9010
9011	// FIXME: This implementation limits OverflowOffset to kParamTLSSize, so we
9012	// don't know real overflow size and can't clear shadow beyond kParamTLSSize.
9013	void copyOverflowArea(IRBuilder<> &IRB, Value *VAListTag) {
9014	Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
9015	V: IRB.CreateAdd(
9016	LHS: IRB.CreatePtrToInt(V: VAListTag, DestTy: MS.IntptrTy),
9017	RHS: ConstantInt::get(Ty: MS.IntptrTy, V: SystemZOverflowArgAreaPtrOffset)),
9018	DestTy: MS.PtrTy);
9019	Value *OverflowArgAreaPtr = IRB.CreateLoad(Ty: MS.PtrTy, Ptr: OverflowArgAreaPtrPtr);
9020	Value OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr;
9021	const Align Alignment = Align (`8`);
9022	std::tie(args&: OverflowArgAreaShadowPtr, args&: OverflowArgAreaOriginPtr) =
9023	MSV.getShadowOriginPtr(Addr: OverflowArgAreaPtr, IRB, ShadowTy: IRB.getInt8Ty(),
9024	Alignment, /isStore/ true);
9025	Value *SrcPtr = IRB.CreateConstGEP1_32(Ty: IRB.getInt8Ty(), Ptr: VAArgTLSCopy,
9026	Idx0: SystemZOverflowOffset);
9027	IRB.CreateMemCpy(Dst: OverflowArgAreaShadowPtr, DstAlign: Alignment, Src: SrcPtr, SrcAlign: Alignment,
9028	Size: VAArgOverflowSize);
9029	if (MS.TrackOrigins) {
9030	SrcPtr = IRB.CreateConstGEP1_32(Ty: IRB.getInt8Ty(), Ptr: VAArgTLSOriginCopy,
9031	Idx0: SystemZOverflowOffset);
9032	IRB.CreateMemCpy(Dst: OverflowArgAreaOriginPtr, DstAlign: Alignment, Src: SrcPtr, SrcAlign: Alignment,
9033	Size: VAArgOverflowSize);
9034	}
9035	}
9036
9037	void finalizeInstrumentation() override {
9038	assert(!VAArgOverflowSize && !VAArgTLSCopy &&
9039	"finalizeInstrumentation called twice");
9040	if (!VAStartInstrumentationList.empty()) {
9041	// If there is a va_start in this function, make a backup copy of
9042	// va_arg_tls somewhere in the function entry block.
9043	IRBuilder<> IRB(MSV.FnPrologueEnd);
9044	VAArgOverflowSize =
9045	IRB.CreateLoad(Ty: IRB.getInt64Ty(), Ptr: MS.VAArgOverflowSizeTLS);
9046	Value *CopySize =
9047	IRB.CreateAdd(LHS: ConstantInt::get(Ty: MS.IntptrTy, V: SystemZOverflowOffset),
9048	RHS: VAArgOverflowSize);
9049	VAArgTLSCopy = IRB.CreateAlloca(Ty: Type::getInt8Ty(C&: *MS.C), ArraySize: CopySize);
9050	VAArgTLSCopy->setAlignment(kShadowTLSAlignment);
9051	IRB.CreateMemSet(Ptr: VAArgTLSCopy, Val: Constant::getNullValue(Ty: IRB.getInt8Ty()),
9052	Size: CopySize, Align: kShadowTLSAlignment, isVolatile: false);
9053
9054	Value *SrcSize = IRB.CreateBinaryIntrinsic(
9055	ID: Intrinsic::umin, LHS: CopySize,
9056	RHS: ConstantInt::get(Ty: MS.IntptrTy, V: kParamTLSSize));
9057	IRB.CreateMemCpy(Dst: VAArgTLSCopy, DstAlign: kShadowTLSAlignment, Src: MS.VAArgTLS,
9058	SrcAlign: kShadowTLSAlignment, Size: SrcSize);
9059	if (MS.TrackOrigins) {
9060	VAArgTLSOriginCopy = IRB.CreateAlloca(Ty: Type::getInt8Ty(C&: *MS.C), ArraySize: CopySize);
9061	VAArgTLSOriginCopy->setAlignment(kShadowTLSAlignment);
9062	IRB.CreateMemCpy(Dst: VAArgTLSOriginCopy, DstAlign: kShadowTLSAlignment,
9063	Src: MS.VAArgOriginTLS, SrcAlign: kShadowTLSAlignment, Size: SrcSize);
9064	}
9065	}
9066
9067	// Instrument va_start.
9068	// Copy va_list shadow from the backup copy of the TLS contents.
9069	for (CallInst *OrigInst : VAStartInstrumentationList) {
9070	NextNodeIRBuilder IRB(OrigInst);
9071	Value *VAListTag = OrigInst->getArgOperand(i: `0`);
9072	copyRegSaveArea(IRB, VAListTag);
9073	copyOverflowArea(IRB, VAListTag);
9074	}
9075	}
9076	};
9077
9078	/// i386-specific implementation of VarArgHelper.
9079	struct VarArgI386Helper : public VarArgHelperBase {
9080	AllocaInst VAArgTLSCopy = nullptr*;
9081	Value VAArgSize = nullptr*;
9082
9083	VarArgI386Helper(Function &F, MemorySanitizer &MS,
9084	MemorySanitizerVisitor &MSV)
9085	: VarArgHelperBase (F, MS, MSV, /VAListTagSize=/`4`) {}
9086
9087	void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
9088	const DataLayout &DL = F.getDataLayout();
9089	unsigned IntptrSize = DL.getTypeStoreSize(Ty: MS.IntptrTy);
9090	unsigned VAArgOffset = `0`;
9091	for (const auto &[ArgNo, A] : llvm::enumerate(First: CB.args())) {
9092	bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
9093	bool IsByVal = CB.paramHasAttr(ArgNo, Kind: Attribute::ByVal);
9094	if (IsByVal) {
9095	assert(A->getType()->isPointerTy());
9096	Type *RealTy = CB.getParamByValType(ArgNo);
9097	uint64_t ArgSize = DL.getTypeAllocSize(Ty: RealTy);
9098	Align ArgAlign = CB.getParamAlign(ArgNo).value_or(u: Align (IntptrSize));
9099	if (ArgAlign < IntptrSize)
9100	ArgAlign = Align (IntptrSize);
9101	VAArgOffset = alignTo(Size: VAArgOffset, A: ArgAlign);
9102	if (!IsFixed) {
9103	Value *Base = getShadowPtrForVAArgument(IRB, ArgOffset: VAArgOffset, ArgSize);
9104	if (Base) {
9105	Value AShadowPtr, AOriginPtr;
9106	std::tie(args&: AShadowPtr, args&: AOriginPtr) =
9107	MSV.getShadowOriginPtr(Addr: A, IRB, ShadowTy: IRB.getInt8Ty(),
9108	Alignment: kShadowTLSAlignment, /isStore/ false);
9109
9110	IRB.CreateMemCpy(Dst: Base, DstAlign: kShadowTLSAlignment, Src: AShadowPtr,
9111	SrcAlign: kShadowTLSAlignment, Size: ArgSize);
9112	}
9113	VAArgOffset += alignTo(Size: ArgSize, A: Align (IntptrSize));
9114	}
9115	} else {
9116	Value *Base;
9117	uint64_t ArgSize = DL.getTypeAllocSize(Ty: A ->getType());
9118	Align ArgAlign = Align (IntptrSize);
9119	VAArgOffset = alignTo(Size: VAArgOffset, A: ArgAlign);
9120	if (DL.isBigEndian()) {
9121	// Adjusting the shadow for argument with size < IntptrSize to match
9122	// the placement of bits in big endian system
9123	if (ArgSize < IntptrSize)
9124	VAArgOffset += (IntptrSize - ArgSize);
9125	}
9126	if (!IsFixed) {
9127	Base = getShadowPtrForVAArgument(IRB, ArgOffset: VAArgOffset, ArgSize);
9128	if (Base)
9129	IRB.CreateAlignedStore(Val: MSV.getShadow(V: A), Ptr: Base, Align: kShadowTLSAlignment);
9130	VAArgOffset += ArgSize;
9131	VAArgOffset = alignTo(Size: VAArgOffset, A: Align (IntptrSize));
9132	}
9133	}
9134	}
9135
9136	Constant *TotalVAArgSize = ConstantInt::get(Ty: MS.IntptrTy, V: VAArgOffset);
9137	// Here using VAArgOverflowSizeTLS as VAArgSizeTLS to avoid creation of
9138	// a new class member i.e. it is the total size of all VarArgs.
9139	IRB.CreateStore(Val: TotalVAArgSize, Ptr: MS.VAArgOverflowSizeTLS);
9140	}
9141
9142	void finalizeInstrumentation() override {
9143	assert(!VAArgSize && !VAArgTLSCopy &&
9144	"finalizeInstrumentation called twice");
9145	IRBuilder<> IRB(MSV.FnPrologueEnd);
9146	VAArgSize = IRB.CreateLoad(Ty: MS.IntptrTy, Ptr: MS.VAArgOverflowSizeTLS);
9147	Value *CopySize = VAArgSize;
9148
9149	if (!VAStartInstrumentationList.empty()) {
9150	// If there is a va_start in this function, make a backup copy of
9151	// va_arg_tls somewhere in the function entry block.
9152	VAArgTLSCopy = IRB.CreateAlloca(Ty: Type::getInt8Ty(C&: *MS.C), ArraySize: CopySize);
9153	VAArgTLSCopy->setAlignment(kShadowTLSAlignment);
9154	IRB.CreateMemSet(Ptr: VAArgTLSCopy, Val: Constant::getNullValue(Ty: IRB.getInt8Ty()),
9155	Size: CopySize, Align: kShadowTLSAlignment, isVolatile: false);
9156
9157	Value *SrcSize = IRB.CreateBinaryIntrinsic(
9158	ID: Intrinsic::umin, LHS: CopySize,
9159	RHS: ConstantInt::get(Ty: MS.IntptrTy, V: kParamTLSSize));
9160	IRB.CreateMemCpy(Dst: VAArgTLSCopy, DstAlign: kShadowTLSAlignment, Src: MS.VAArgTLS,
9161	SrcAlign: kShadowTLSAlignment, Size: SrcSize);
9162	}
9163
9164	// Instrument va_start.
9165	// Copy va_list shadow from the backup copy of the TLS contents.
9166	for (CallInst *OrigInst : VAStartInstrumentationList) {
9167	NextNodeIRBuilder IRB(OrigInst);
9168	Value *VAListTag = OrigInst->getArgOperand(i: `0`);
9169	Type RegSaveAreaPtrTy = PointerType::getUnqual(C&: MS.C);
9170	Value *RegSaveAreaPtrPtr =
9171	IRB.CreateIntToPtr(V: IRB.CreatePtrToInt(V: VAListTag, DestTy: MS.IntptrTy),
9172	DestTy: PointerType::get(C&: *MS.C, AddressSpace: `0`));
9173	Value *RegSaveAreaPtr =
9174	IRB.CreateLoad(Ty: RegSaveAreaPtrTy, Ptr: RegSaveAreaPtrPtr);
9175	Value RegSaveAreaShadowPtr, RegSaveAreaOriginPtr;
9176	const DataLayout &DL = F.getDataLayout();
9177	unsigned IntptrSize = DL.getTypeStoreSize(Ty: MS.IntptrTy);
9178	const Align Alignment = Align (IntptrSize);
9179	std::tie(args&: RegSaveAreaShadowPtr, args&: RegSaveAreaOriginPtr) =
9180	MSV.getShadowOriginPtr(Addr: RegSaveAreaPtr, IRB, ShadowTy: IRB.getInt8Ty(),
9181	Alignment, /isStore/ true);
9182	IRB.CreateMemCpy(Dst: RegSaveAreaShadowPtr, DstAlign: Alignment, Src: VAArgTLSCopy, SrcAlign: Alignment,
9183	Size: CopySize);
9184	}
9185	}
9186	};
9187
9188	/// Implementation of VarArgHelper that is used for ARM32, MIPS, RISCV,
9189	/// LoongArch64.
9190	struct VarArgGenericHelper : public VarArgHelperBase {
9191	AllocaInst VAArgTLSCopy = nullptr*;
9192	Value VAArgSize = nullptr*;
9193
9194	VarArgGenericHelper(Function &F, MemorySanitizer &MS,
9195	MemorySanitizerVisitor &MSV, const unsigned VAListTagSize)
9196	: VarArgHelperBase (F, MS, MSV, VAListTagSize) {}
9197
9198	void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
9199	unsigned VAArgOffset = `0`;
9200	const DataLayout &DL = F.getDataLayout();
9201	unsigned IntptrSize = DL.getTypeStoreSize(Ty: MS.IntptrTy);
9202	for (const auto &[ArgNo, A] : llvm::enumerate(First: CB.args())) {
9203	bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
9204	if (IsFixed)
9205	continue;
9206	uint64_t ArgSize = DL.getTypeAllocSize(Ty: A ->getType());
9207	if (DL.isBigEndian()) {
9208	// Adjusting the shadow for argument with size < IntptrSize to match the
9209	// placement of bits in big endian system
9210	if (ArgSize < IntptrSize)
9211	VAArgOffset += (IntptrSize - ArgSize);
9212	}
9213	Value *Base = getShadowPtrForVAArgument(IRB, ArgOffset: VAArgOffset, ArgSize);
9214	VAArgOffset += ArgSize;
9215	VAArgOffset = alignTo(Value: VAArgOffset, Align: IntptrSize);
9216	if (!Base)
9217	continue;
9218	IRB.CreateAlignedStore(Val: MSV.getShadow(V: A), Ptr: Base, Align: kShadowTLSAlignment);
9219	}
9220
9221	Constant *TotalVAArgSize = ConstantInt::get(Ty: MS.IntptrTy, V: VAArgOffset);
9222	// Here using VAArgOverflowSizeTLS as VAArgSizeTLS to avoid creation of
9223	// a new class member i.e. it is the total size of all VarArgs.
9224	IRB.CreateStore(Val: TotalVAArgSize, Ptr: MS.VAArgOverflowSizeTLS);
9225	}
9226
9227	void finalizeInstrumentation() override {
9228	assert(!VAArgSize && !VAArgTLSCopy &&
9229	"finalizeInstrumentation called twice");
9230	IRBuilder<> IRB(MSV.FnPrologueEnd);
9231	VAArgSize = IRB.CreateLoad(Ty: MS.IntptrTy, Ptr: MS.VAArgOverflowSizeTLS);
9232	Value *CopySize = VAArgSize;
9233
9234	if (!VAStartInstrumentationList.empty()) {
9235	// If there is a va_start in this function, make a backup copy of
9236	// va_arg_tls somewhere in the function entry block.
9237	VAArgTLSCopy = IRB.CreateAlloca(Ty: Type::getInt8Ty(C&: *MS.C), ArraySize: CopySize);
9238	VAArgTLSCopy->setAlignment(kShadowTLSAlignment);
9239	IRB.CreateMemSet(Ptr: VAArgTLSCopy, Val: Constant::getNullValue(Ty: IRB.getInt8Ty()),
9240	Size: CopySize, Align: kShadowTLSAlignment, isVolatile: false);
9241
9242	Value *SrcSize = IRB.CreateBinaryIntrinsic(
9243	ID: Intrinsic::umin, LHS: CopySize,
9244	RHS: ConstantInt::get(Ty: MS.IntptrTy, V: kParamTLSSize));
9245	IRB.CreateMemCpy(Dst: VAArgTLSCopy, DstAlign: kShadowTLSAlignment, Src: MS.VAArgTLS,
9246	SrcAlign: kShadowTLSAlignment, Size: SrcSize);
9247	}
9248
9249	// Instrument va_start.
9250	// Copy va_list shadow from the backup copy of the TLS contents.
9251	for (CallInst *OrigInst : VAStartInstrumentationList) {
9252	NextNodeIRBuilder IRB(OrigInst);
9253	Value *VAListTag = OrigInst->getArgOperand(i: `0`);
9254	Type RegSaveAreaPtrTy = PointerType::getUnqual(C&: MS.C);
9255	Value *RegSaveAreaPtrPtr =
9256	IRB.CreateIntToPtr(V: IRB.CreatePtrToInt(V: VAListTag, DestTy: MS.IntptrTy),
9257	DestTy: PointerType::get(C&: *MS.C, AddressSpace: `0`));
9258	Value *RegSaveAreaPtr =
9259	IRB.CreateLoad(Ty: RegSaveAreaPtrTy, Ptr: RegSaveAreaPtrPtr);
9260	Value RegSaveAreaShadowPtr, RegSaveAreaOriginPtr;
9261	const DataLayout &DL = F.getDataLayout();
9262	unsigned IntptrSize = DL.getTypeStoreSize(Ty: MS.IntptrTy);
9263	const Align Alignment = Align (IntptrSize);
9264	std::tie(args&: RegSaveAreaShadowPtr, args&: RegSaveAreaOriginPtr) =
9265	MSV.getShadowOriginPtr(Addr: RegSaveAreaPtr, IRB, ShadowTy: IRB.getInt8Ty(),
9266	Alignment, /isStore/ true);
9267	IRB.CreateMemCpy(Dst: RegSaveAreaShadowPtr, DstAlign: Alignment, Src: VAArgTLSCopy, SrcAlign: Alignment,
9268	Size: CopySize);
9269	}
9270	}
9271	};
9272
9273	// ARM32, Loongarch64, MIPS and RISCV share the same calling conventions
9274	// regarding VAArgs.
9275	using VarArgARM32Helper = VarArgGenericHelper;
9276	using VarArgRISCVHelper = VarArgGenericHelper;
9277	using VarArgMIPSHelper = VarArgGenericHelper;
9278	using VarArgLoongArch64Helper = VarArgGenericHelper;
9279
9280	/// A no-op implementation of VarArgHelper.
9281	struct VarArgNoOpHelper : public VarArgHelper {
9282	VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
9283	MemorySanitizerVisitor &MSV) {}
9284
9285	void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {}
9286
9287	void visitVAStartInst(VAStartInst &I) override {}
9288
9289	void visitVACopyInst(VACopyInst &I) override {}
9290
9291	void finalizeInstrumentation() override {}
9292	};
9293
9294	} // end anonymous namespace
9295
9296	static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
9297	MemorySanitizerVisitor &Visitor) {
9298	// VarArg handling is only implemented on AMD64. False positives are possible
9299	// on other platforms.
9300	Triple TargetTriple(Func.getParent()->getTargetTriple());
9301
9302	if (TargetTriple.getArch() == Triple::x86)
9303	return new VarArgI386Helper (Func, Msan, Visitor);
9304
9305	if (TargetTriple.getArch() == Triple::x86_64)
9306	return new VarArgAMD64Helper (Func, Msan, Visitor);
9307
9308	if (TargetTriple.isARM())
9309	return new VarArgARM32Helper (Func, Msan, Visitor, /VAListTagSize=/`4`);
9310
9311	if (TargetTriple.isAArch64())
9312	return new VarArgAArch64Helper (Func, Msan, Visitor);
9313
9314	if (TargetTriple.isSystemZ())
9315	return new VarArgSystemZHelper (Func, Msan, Visitor);
9316
9317	// On PowerPC32 VAListTag is a struct
9318	// {char, char, i16 padding, char , char }
9319	if (TargetTriple.isPPC32())
9320	return new VarArgPowerPC32Helper (Func, Msan, Visitor);
9321
9322	if (TargetTriple.isPPC64())
9323	return new VarArgPowerPC64Helper (Func, Msan, Visitor);
9324
9325	if (TargetTriple.isRISCV32())
9326	return new VarArgRISCVHelper (Func, Msan, Visitor, /VAListTagSize=/`4`);
9327
9328	if (TargetTriple.isRISCV64())
9329	return new VarArgRISCVHelper (Func, Msan, Visitor, /VAListTagSize=/`8`);
9330
9331	if (TargetTriple.isMIPS32())
9332	return new VarArgMIPSHelper (Func, Msan, Visitor, /VAListTagSize=/`4`);
9333
9334	if (TargetTriple.isMIPS64())
9335	return new VarArgMIPSHelper (Func, Msan, Visitor, /VAListTagSize=/`8`);
9336
9337	if (TargetTriple.isLoongArch64())
9338	return new VarArgLoongArch64Helper (Func, Msan, Visitor,
9339	/VAListTagSize=/`8`);
9340
9341	return new VarArgNoOpHelper (Func, Msan, Visitor);
9342	}
9343
9344	bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) {
9345	if (!CompileKernel && F.getName() == kMsanModuleCtorName)
9346	return false;
9347
9348	if (F.hasFnAttribute(Kind: Attribute::DisableSanitizerInstrumentation))
9349	return false;
9350
9351	MemorySanitizerVisitor Visitor(F, *this, TLI);
9352
9353	// Clear out memory attributes.
9354	AttributeMask B;
9355	B.addAttribute(Val: Attribute::Memory).addAttribute(Val: Attribute::Speculatable);
9356	F.removeFnAttrs(Attrs: B);
9357
9358	return Visitor.runOnFunction();
9359	}
9360

Browse the source code of llvm_projects/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp