OpenMPOpt.cpp source code [llvm_projects/llvm/lib/Transforms/IPO/OpenMPOpt.cpp]

1	//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// OpenMP specific optimizations:
10	//
11	// - Deduplication of runtime calls, e.g., omp_get_thread_num.
12	// - Replacing globalized device memory with stack memory.
13	// - Replacing globalized device memory with shared memory.
14	// - Parallel region merging.
15	// - Transforming generic-mode device kernels to SPMD mode.
16	// - Specializing the state machine for generic-mode device kernels.
17	//
18	//===----------------------------------------------------------------------===//
19
20	#include "llvm/Transforms/IPO/OpenMPOpt.h"
21
22	#include "llvm/ADT/EnumeratedArray.h"
23	#include "llvm/ADT/PostOrderIterator.h"
24	#include "llvm/ADT/SetVector.h"
25	#include "llvm/ADT/SmallPtrSet.h"
26	#include "llvm/ADT/SmallVector.h"
27	#include "llvm/ADT/Statistic.h"
28	#include "llvm/ADT/StringExtras.h"
29	#include "llvm/ADT/StringRef.h"
30	#include "llvm/Analysis/CallGraph.h"
31	#include "llvm/Analysis/CallGraphSCCPass.h"
32	#include "llvm/Analysis/MemoryLocation.h"
33	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
34	#include "llvm/Analysis/ValueTracking.h"
35	#include "llvm/Frontend/OpenMP/OMPConstants.h"
36	#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
37	#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
38	#include "llvm/IR/Assumptions.h"
39	#include "llvm/IR/BasicBlock.h"
40	#include "llvm/IR/Constants.h"
41	#include "llvm/IR/DiagnosticInfo.h"
42	#include "llvm/IR/Dominators.h"
43	#include "llvm/IR/Function.h"
44	#include "llvm/IR/GlobalValue.h"
45	#include "llvm/IR/GlobalVariable.h"
46	#include "llvm/IR/InstrTypes.h"
47	#include "llvm/IR/Instruction.h"
48	#include "llvm/IR/Instructions.h"
49	#include "llvm/IR/IntrinsicInst.h"
50	#include "llvm/IR/IntrinsicsAMDGPU.h"
51	#include "llvm/IR/IntrinsicsNVPTX.h"
52	#include "llvm/IR/LLVMContext.h"
53	#include "llvm/Support/Casting.h"
54	#include "llvm/Support/CommandLine.h"
55	#include "llvm/Support/Debug.h"
56	#include "llvm/Transforms/IPO/Attributor.h"
57	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
58	#include "llvm/Transforms/Utils/CallGraphUpdater.h"
59
60	#include <algorithm>
61	#include <optional>
62	#include <string>
63
64	using namespace llvm;
65	using namespace omp;
66
67	#define DEBUG_TYPE "openmp-opt"
68
69	static cl::opt<bool> DisableOpenMPOptimizations(
70	"openmp-opt-disable", cl::desc ("Disable OpenMP specific optimizations."),
71	cl::Hidden, cl::init(Val: false));
72
73	static cl::opt<bool> EnableParallelRegionMerging(
74	"openmp-opt-enable-merging",
75	cl::desc ("Enable the OpenMP region merging optimization."), cl::Hidden,
76	cl::init(Val: false));
77
78	static cl::opt<bool>
79	DisableInternalization("openmp-opt-disable-internalization",
80	cl::desc ("Disable function internalization."),
81	cl::Hidden, cl::init(Val: false));
82
83	static cl::opt<bool> DeduceICVValues("openmp-deduce-icv-values",
84	cl::init(Val: false), cl::Hidden);
85	static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(Val: false),
86	cl::Hidden);
87	static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
88	cl::init(Val: false), cl::Hidden);
89
90	static cl::opt<bool> HideMemoryTransferLatency(
91	"openmp-hide-memory-transfer-latency",
92	cl::desc ("[WIP] Tries to hide the latency of host to device memory"
93	" transfers"),
94	cl::Hidden, cl::init(Val: false));
95
96	static cl::opt<bool> DisableOpenMPOptDeglobalization(
97	"openmp-opt-disable-deglobalization",
98	cl::desc ("Disable OpenMP optimizations involving deglobalization."),
99	cl::Hidden, cl::init(Val: false));
100
101	static cl::opt<bool> DisableOpenMPOptSPMDization(
102	"openmp-opt-disable-spmdization",
103	cl::desc ("Disable OpenMP optimizations involving SPMD-ization."),
104	cl::Hidden, cl::init(Val: false));
105
106	static cl::opt<bool> DisableOpenMPOptFolding(
107	"openmp-opt-disable-folding",
108	cl::desc ("Disable OpenMP optimizations involving folding."), cl::Hidden,
109	cl::init(Val: false));
110
111	static cl::opt<bool> DisableOpenMPOptStateMachineRewrite(
112	"openmp-opt-disable-state-machine-rewrite",
113	cl::desc ("Disable OpenMP optimizations that replace the state machine."),
114	cl::Hidden, cl::init(Val: false));
115
116	static cl::opt<bool> DisableOpenMPOptBarrierElimination(
117	"openmp-opt-disable-barrier-elimination",
118	cl::desc ("Disable OpenMP optimizations that eliminate barriers."),
119	cl::Hidden, cl::init(Val: false));
120
121	static cl::opt<bool> PrintModuleAfterOptimizations(
122	"openmp-opt-print-module-after",
123	cl::desc ("Print the current module after OpenMP optimizations."),
124	cl::Hidden, cl::init(Val: false));
125
126	static cl::opt<bool> PrintModuleBeforeOptimizations(
127	"openmp-opt-print-module-before",
128	cl::desc ("Print the current module before OpenMP optimizations."),
129	cl::Hidden, cl::init(Val: false));
130
131	static cl::opt<bool> AlwaysInlineDeviceFunctions(
132	"openmp-opt-inline-device",
133	cl::desc ("Inline all applicible functions on the device."), cl::Hidden,
134	cl::init(Val: false));
135
136	static cl::opt<bool>
137	EnableVerboseRemarks("openmp-opt-verbose-remarks",
138	cl::desc ("Enables more verbose remarks."), cl::Hidden,
139	cl::init(Val: false));
140
141	static cl::opt<unsigned>
142	SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
143	cl::desc ("Maximal number of attributor iterations."),
144	cl::init(Val: `256`));
145
146	static cl::opt<unsigned>
147	SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden,
148	cl::desc ("Maximum amount of shared memory to use."),
149	cl::init(Val: std::numeric_limits<unsigned>::max()));
150
151	STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
152	"Number of OpenMP runtime calls deduplicated");
153	STATISTIC(NumOpenMPParallelRegionsDeleted,
154	"Number of OpenMP parallel regions deleted");
155	STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
156	"Number of OpenMP runtime functions identified");
157	STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
158	"Number of OpenMP runtime function uses identified");
159	STATISTIC(NumOpenMPTargetRegionKernels,
160	"Number of OpenMP target region entry points (=kernels) identified");
161	STATISTIC(NumNonOpenMPTargetRegionKernels,
162	"Number of non-OpenMP target region kernels identified");
163	STATISTIC(NumOpenMPTargetRegionKernelsSPMD,
164	"Number of OpenMP target region entry points (=kernels) executed in "
165	"SPMD-mode instead of generic-mode");
166	STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
167	"Number of OpenMP target region entry points (=kernels) executed in "
168	"generic-mode without a state machines");
169	STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
170	"Number of OpenMP target region entry points (=kernels) executed in "
171	"generic-mode with customized state machines with fallback");
172	STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
173	"Number of OpenMP target region entry points (=kernels) executed in "
174	"generic-mode with customized state machines without fallback");
175	STATISTIC(
176	NumOpenMPParallelRegionsReplacedInGPUStateMachine,
177	"Number of OpenMP parallel regions replaced with ID in GPU state machines");
178	STATISTIC(NumOpenMPParallelRegionsMerged,
179	"Number of OpenMP parallel regions merged");
180	STATISTIC(NumBytesMovedToSharedMemory,
181	"Amount of memory pushed to shared memory");
182	STATISTIC(NumBarriersEliminated, "Number of redundant barriers eliminated");
183
184	#if !defined(NDEBUG)
185	static constexpr auto TAG = "[" DEBUG_TYPE "]";
186	#endif
187
188	namespace KernelInfo {
189
190	// struct ConfigurationEnvironmentTy {
191	// uint8_t UseGenericStateMachine;
192	// uint8_t MayUseNestedParallelism;
193	// llvm::omp::OMPTgtExecModeFlags ExecMode;
194	// int32_t MinThreads;
195	// int32_t MaxThreads;
196	// int32_t MinTeams;
197	// int32_t MaxTeams;
198	// };
199
200	// struct DynamicEnvironmentTy {
201	// uint16_t DebugIndentionLevel;
202	// };
203
204	// struct KernelEnvironmentTy {
205	// ConfigurationEnvironmentTy Configuration;
206	// IdentTy Ident;*
207	// DynamicEnvironmentTy DynamicEnv;*
208	// };
209
210	#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \
211	constexpr const unsigned MEMBER##Idx = IDX;
212
213	KERNEL_ENVIRONMENT_IDX(Configuration, `0`)
214	KERNEL_ENVIRONMENT_IDX(Ident, `1`)
215
216	#undef KERNEL_ENVIRONMENT_IDX
217
218	#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \
219	constexpr const unsigned MEMBER##Idx = IDX;
220
221	KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, `0`)
222	KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, `1`)
223	KERNEL_ENVIRONMENT_CONFIGURATION_IDX(ExecMode, `2`)
224	KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MinThreads, `3`)
225	KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MaxThreads, `4`)
226	KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MinTeams, `5`)
227	KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MaxTeams, `6`)
228
229	#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX
230
231	#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE) \
232	RETURNTYPE get##MEMBER##FromKernelEnvironment(ConstantStruct KernelEnvC) { \
233	return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx)); \
234	}
235
236	KERNEL_ENVIRONMENT_GETTER(Ident, Constant)
237	KERNEL_ENVIRONMENT_GETTER(Configuration, ConstantStruct)
238
239	#undef KERNEL_ENVIRONMENT_GETTER
240
241	#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER) \
242	ConstantInt *get##MEMBER##FromKernelEnvironment( \
243	ConstantStruct *KernelEnvC) { \
244	ConstantStruct *ConfigC = \
245	getConfigurationFromKernelEnvironment(KernelEnvC); \
246	return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx)); \
247	}
248
249	KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(UseGenericStateMachine)
250	KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MayUseNestedParallelism)
251	KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(ExecMode)
252	KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MinThreads)
253	KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxThreads)
254	KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MinTeams)
255	KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams)
256
257	#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER
258
259	GlobalVariable *
260	getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) {
261	constexpr const int InitKernelEnvironmentArgNo = `0`;
262	return cast<GlobalVariable>(
263	Val: KernelInitCB->getArgOperand(i: InitKernelEnvironmentArgNo)
264	->stripPointerCasts());
265	}
266
267	ConstantStruct getKernelEnvironementFromKernelInitCB(CallBase KernelInitCB) {
268	GlobalVariable *KernelEnvGV =
269	getKernelEnvironementGVFromKernelInitCB(KernelInitCB);
270	return cast<ConstantStruct>(Val: KernelEnvGV->getInitializer());
271	}
272	} // namespace KernelInfo
273
274	namespace {
275
276	struct AAHeapToShared;
277
278	struct AAICVTracker;
279
280	/// OpenMP specific information. For now, stores RFIs and ICVs also needed for
281	/// Attributor runs.
282	struct OMPInformationCache : public InformationCache {
283	OMPInformationCache(Module &M, AnalysisGetter &AG,
284	BumpPtrAllocator &Allocator, SetVector<Function > CGSCC,
285	bool OpenMPPostLink)
286	: InformationCache (M, AG, Allocator, CGSCC), OMPBuilder (M),
287	OpenMPPostLink(OpenMPPostLink) {
288
289	OMPBuilder.Config.IsTargetDevice = isOpenMPDevice(M&: OMPBuilder.M);
290	OMPBuilder.initialize();
291	initializeRuntimeFunctions(M);
292	initializeInternalControlVars();
293	}
294
295	/// Generic information that describes an internal control variable.
296	struct InternalControlVarInfo {
297	/// The kind, as described by InternalControlVar enum.
298	InternalControlVar Kind;
299
300	/// The name of the ICV.
301	StringRef Name;
302
303	/// Environment variable associated with this ICV.
304	StringRef EnvVarName;
305
306	/// Initial value kind.
307	ICVInitValue InitKind;
308
309	/// Initial value.
310	ConstantInt *InitValue;
311
312	/// Setter RTL function associated with this ICV.
313	RuntimeFunction Setter;
314
315	/// Getter RTL function associated with this ICV.
316	RuntimeFunction Getter;
317
318	/// RTL Function corresponding to the override clause of this ICV
319	RuntimeFunction Clause;
320	};
321
322	/// Generic information that describes a runtime function
323	struct RuntimeFunctionInfo {
324
325	/// The kind, as described by the RuntimeFunction enum.
326	RuntimeFunction Kind;
327
328	/// The name of the function.
329	StringRef Name;
330
331	/// Flag to indicate a variadic function.
332	bool IsVarArg;
333
334	/// The return type of the function.
335	Type *ReturnType;
336
337	/// The argument types of the function.
338	SmallVector<Type *, `8`> ArgumentTypes;
339
340	/// The declaration if available.
341	Function Declaration = nullptr*;
342
343	/// Uses of this runtime function per function containing the use.
344	using UseVector = SmallVector<Use *, `16`>;
345
346	/// Clear UsesMap for runtime function.
347	void clearUsesMap() { UsesMap.clear(); }
348
349	/// Boolean conversion that is true if the runtime function was found.
350	operator bool() const { return Declaration; }
351
352	/// Return the vector of uses in function \p F.
353	UseVector &getOrCreateUseVector(Function *F) {
354	std::shared_ptr<UseVector> &UV = UsesMap [F];
355	if (!UV)
356	UV = std::make_shared<UseVector>();
357	return *UV;
358	}
359
360	/// Return the vector of uses in function \p F or `nullptr` if there are
361	/// none.
362	const UseVector getUseVector(Function &F) const* {
363	auto I = UsesMap.find(Val: &F);
364	if (I != UsesMap.end())
365	return I ->second.get();
366	return nullptr;
367	}
368
369	/// Return how many functions contain uses of this runtime function.
370	size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
371
372	/// Return the number of arguments (or the minimal number for variadic
373	/// functions).
374	size_t getNumArgs() const { return ArgumentTypes.size(); }
375
376	/// Run the callback \p CB on each use and forget the use if the result is
377	/// true. The callback will be fed the function in which the use was
378	/// encountered as second argument.
379	void foreachUse(SmallVectorImpl<Function *> &SCC,
380	function_ref<bool(Use &, Function &)> CB) {
381	for (Function *F : SCC)
382	foreachUse(CB, F);
383	}
384
385	/// Run the callback \p CB on each use within the function \p F and forget
386	/// the use if the result is true.
387	void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
388	SmallVector<unsigned, `8`> ToBeDeleted;
389	ToBeDeleted.clear();
390
391	unsigned Idx = `0`;
392	UseVector &UV = getOrCreateUseVector(F);
393
394	for (Use *U : UV) {
395	if (CB (U, F))
396	ToBeDeleted.push_back(Elt: Idx);
397	++Idx;
398	}
399
400	// Remove the to-be-deleted indices in reverse order as prior
401	// modifications will not modify the smaller indices.
402	while (!ToBeDeleted.empty()) {
403	unsigned Idx = ToBeDeleted.pop_back_val();
404	UV [Idx] = UV.back();
405	UV.pop_back();
406	}
407	}
408
409	private:
410	/// Map from functions to all uses of this runtime function contained in
411	/// them.
412	DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
413
414	public:
415	/// Iterators for the uses of this runtime function.
416	decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
417	decltype(UsesMap)::iterator end() { return UsesMap.end(); }
418	};
419
420	/// An OpenMP-IR-Builder instance
421	OpenMPIRBuilder OMPBuilder;
422
423	/// Map from runtime function kind to the runtime function description.
424	EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
425	RuntimeFunction::OMPRTL___last>
426	RFIs;
427
428	/// Map from function declarations/definitions to their runtime enum type.
429	DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;
430
431	/// Map from ICV kind to the ICV description.
432	EnumeratedArray<InternalControlVarInfo, InternalControlVar,
433	InternalControlVar::ICV___last>
434	ICVs;
435
436	/// Helper to initialize all internal control variable information for those
437	/// defined in OMPKinds.def.
438	void initializeInternalControlVars() {
439	#define ICV_RT_SET(_Name, RTL) \
440	{ \
441	auto &ICV = ICVs[_Name]; \
442	ICV.Setter = RTL; \
443	}
444	#define ICV_RT_GET(Name, RTL) \
445	{ \
446	auto &ICV = ICVs[Name]; \
447	ICV.Getter = RTL; \
448	}
449	#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
450	{ \
451	auto &ICV = ICVs[Enum]; \
452	ICV.Name = _Name; \
453	ICV.Kind = Enum; \
454	ICV.InitKind = Init; \
455	ICV.EnvVarName = _EnvVarName; \
456	switch (ICV.InitKind) { \
457	case ICV_IMPLEMENTATION_DEFINED: \
458	ICV.InitValue = nullptr; \
459	break; \
460	case ICV_ZERO: \
461	ICV.InitValue = ConstantInt::get( \
462	Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
463	break; \
464	case ICV_FALSE: \
465	ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
466	break; \
467	case ICV_LAST: \
468	break; \
469	} \
470	}
471	#include "llvm/Frontend/OpenMP/OMPKinds.def"
472	}
473
474	/// Returns true if the function declaration \p F matches the runtime
475	/// function types, that is, return type \p RTFRetType, and argument types
476	/// \p RTFArgTypes.
477	static bool declMatchesRTFTypes(Function F, Type RTFRetType,
478	SmallVector<Type *, `8`> &RTFArgTypes) {
479	// TODO: We should output information to the user (under debug output
480	// and via remarks).
481
482	if (!F)
483	return false;
484	if (F->getReturnType() != RTFRetType)
485	return false;
486	if (F->arg_size() != RTFArgTypes.size())
487	return false;
488
489	auto *RTFTyIt = RTFArgTypes.begin();
490	for (Argument &Arg : F->args()) {
491	if (Arg.getType() != *RTFTyIt)
492	return false;
493
494	++RTFTyIt;
495	}
496
497	return true;
498	}
499
500	// Helper to collect all uses of the declaration in the UsesMap.
501	unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
502	unsigned NumUses = `0`;
503	if (!RFI.Declaration)
504	return NumUses;
505	OMPBuilder.addAttributes(FnID: RFI.Kind, Fn&: *RFI.Declaration);
506
507	if (CollectStats) {
508	NumOpenMPRuntimeFunctionsIdentified += `1`;
509	NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
510	}
511
512	// TODO: We directly convert uses into proper calls and unknown uses.
513	for (Use &U : RFI.Declaration->uses()) {
514	if (Instruction *UserI = dyn_cast<Instruction>(Val: U.getUser())) {
515	if (!CGSCC \|\| CGSCC->empty() \|\| CGSCC->contains(key: UserI->getFunction())) {
516	RFI.getOrCreateUseVector(F: UserI->getFunction()).push_back(Elt: &U);
517	++NumUses;
518	}
519	} else {
520	RFI.getOrCreateUseVector(F: nullptr).push_back(Elt: &U);
521	++NumUses;
522	}
523	}
524	return NumUses;
525	}
526
527	// Helper function to recollect uses of a runtime function.
528	void recollectUsesForFunction(RuntimeFunction RTF) {
529	auto &RFI = RFIs [RTF];
530	RFI.clearUsesMap();
531	collectUses(RFI, /CollectStats/ false);
532	}
533
534	// Helper function to recollect uses of all runtime functions.
535	void recollectUses() {
536	for (int Idx = `0`; Idx < RFIs.size(); ++Idx)
537	recollectUsesForFunction(RTF: static_cast<RuntimeFunction>(Idx));
538	}
539
540	// Helper function to inherit the calling convention of the function callee.
541	void setCallingConvention(FunctionCallee Callee, CallInst *CI) {
542	if (Function *Fn = dyn_cast<Function>(Val: Callee.getCallee()))
543	CI->setCallingConv(Fn->getCallingConv());
544	}
545
546	// Helper function to determine if it's legal to create a call to the runtime
547	// functions.
548	bool runtimeFnsAvailable(ArrayRef<RuntimeFunction> Fns) {
549	// We can always emit calls if we haven't yet linked in the runtime.
550	if (!OpenMPPostLink)
551	return true;
552
553	// Once the runtime has been already been linked in we cannot emit calls to
554	// any undefined functions.
555	for (RuntimeFunction Fn : Fns) {
556	RuntimeFunctionInfo &RFI = RFIs [Fn];
557
558	if (RFI.Declaration && RFI.Declaration->isDeclaration())
559	return false;
560	}
561	return true;
562	}
563
564	/// Helper to initialize all runtime function information for those defined
565	/// in OpenMPKinds.def.
566	void initializeRuntimeFunctions(Module &M) {
567
568	// Helper macros for handling __VA_ARGS__ in OMP_RTL
569	#define OMP_TYPE(VarName, ...) \
570	Type *VarName = OMPBuilder.VarName; \
571	(void)VarName;
572
573	#define OMP_ARRAY_TYPE(VarName, ...) \
574	ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
575	(void)VarName##Ty; \
576	PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
577	(void)VarName##PtrTy;
578
579	#define OMP_FUNCTION_TYPE(VarName, ...) \
580	FunctionType *VarName = OMPBuilder.VarName; \
581	(void)VarName; \
582	PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
583	(void)VarName##Ptr;
584
585	#define OMP_STRUCT_TYPE(VarName, ...) \
586	StructType *VarName = OMPBuilder.VarName; \
587	(void)VarName; \
588	PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
589	(void)VarName##Ptr;
590
591	#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
592	{ \
593	SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
594	Function *F = M.getFunction(_Name); \
595	RTLFunctions.insert(F); \
596	if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
597	RuntimeFunctionIDMap[F] = _Enum; \
598	auto &RFI = RFIs[_Enum]; \
599	RFI.Kind = _Enum; \
600	RFI.Name = _Name; \
601	RFI.IsVarArg = _IsVarArg; \
602	RFI.ReturnType = OMPBuilder._ReturnType; \
603	RFI.ArgumentTypes = std::move(ArgsTypes); \
604	RFI.Declaration = F; \
605	unsigned NumUses = collectUses(RFI); \
606	(void)NumUses; \
607	LLVM_DEBUG({ \
608	dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
609	<< " found\n"; \
610	if (RFI.Declaration) \
611	dbgs() << TAG << "-> got " << NumUses << " uses in " \
612	<< RFI.getNumFunctionsWithUses() \
613	<< " different functions.\n"; \
614	}); \
615	} \
616	}
617	#include "llvm/Frontend/OpenMP/OMPKinds.def"
618
619	// Remove the `noinline` attribute from `__kmpc`, `ompx::` and `omp_`
620	// functions, except if `optnone` is present.
621	if (isOpenMPDevice(M)) {
622	for (Function &F : M) {
623	for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"})
624	if (F.hasFnAttribute(Kind: Attribute::NoInline) &&
625	F.getName().starts_with(Prefix) &&
626	!F.hasFnAttribute(Kind: Attribute::OptimizeNone))
627	F.removeFnAttr(Kind: Attribute::NoInline);
628	}
629	}
630
631	// TODO: We should attach the attributes defined in OMPKinds.def.
632	}
633
634	/// Collection of known OpenMP runtime functions..
635	DenseSet<const Function *> RTLFunctions;
636
637	/// Indicates if we have already linked in the OpenMP device library.
638	bool OpenMPPostLink = false;
639	};
640
641	template <typename Ty, bool InsertInvalidates = true>
642	struct BooleanStateWithSetVector : public BooleanState {
643	bool contains(const Ty &Elem) const { return Set.contains(Elem); }
644	bool insert(const Ty &Elem) {
645	if (InsertInvalidates)
646	BooleanState::indicatePessimisticFixpoint();
647	return Set.insert(Elem);
648	}
649
650	const Ty &operator[](int Idx) const { return Set[Idx]; }
651	bool operator==(const BooleanStateWithSetVector &RHS) const {
652	return BooleanState::operator==(R: RHS) && Set == RHS.Set;
653	}
654	bool operator!=(const BooleanStateWithSetVector &RHS) const {
655	return !(*this == RHS);
656	}
657
658	bool empty() const { return Set.empty(); }
659	size_t size() const { return Set.size(); }
660
661	/// "Clamp" this state with \p RHS.
662	BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {
663	BooleanState::operator^=(R: RHS);
664	Set.insert(RHS.Set.begin(), RHS.Set.end());
665	return *this;
666	}
667
668	private:
669	/// A set to keep track of elements.
670	SetVector<Ty> Set;
671
672	public:
673	typename decltype(Set)::iterator begin() { return Set.begin(); }
674	typename decltype(Set)::iterator end() { return Set.end(); }
675	typename decltype(Set)::const_iterator begin() const { return Set.begin(); }
676	typename decltype(Set)::const_iterator end() const { return Set.end(); }
677	};
678
679	template <typename Ty, bool InsertInvalidates = true>
680	using BooleanStateWithPtrSetVector =
681	BooleanStateWithSetVector<Ty *, InsertInvalidates>;
682
683	struct KernelInfoState : AbstractState {
684	/// Flag to track if we reached a fixpoint.
685	bool IsAtFixpoint = false;
686
687	/// The parallel regions (identified by the outlined parallel functions) that
688	/// can be reached from the associated function.
689	BooleanStateWithPtrSetVector<CallBase, / InsertInvalidates / false>
690	ReachedKnownParallelRegions;
691
692	/// State to track what parallel region we might reach.
693	BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
694
695	/// State to track if we are in SPMD-mode, assumed or know, and why we decided
696	/// we cannot be. If it is assumed, then RequiresFullRuntime should also be
697	/// false.
698	BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
699
700	/// The __kmpc_target_init call in this kernel, if any. If we find more than
701	/// one we abort as the kernel is malformed.
702	CallBase KernelInitCB = nullptr*;
703
704	/// The constant kernel environement as taken from and passed to
705	/// __kmpc_target_init.
706	ConstantStruct KernelEnvC = nullptr*;
707
708	/// The __kmpc_target_deinit call in this kernel, if any. If we find more than
709	/// one we abort as the kernel is malformed.
710	CallBase KernelDeinitCB = nullptr*;
711
712	/// Flag to indicate if the associated function is a kernel entry.
713	bool IsKernelEntry = false;
714
715	/// State to track what kernel entries can reach the associated function.
716	BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;
717
718	/// State to indicate if we can track parallel level of the associated
719	/// function. We will give up tracking if we encounter unknown caller or the
720	/// caller is __kmpc_parallel_51.
721	BooleanStateWithSetVector<uint8_t> ParallelLevels;
722
723	/// Flag that indicates if the kernel has nested Parallelism
724	bool NestedParallelism = false;
725
726	/// Abstract State interface
727	///{
728
729	KernelInfoState() = default;
730	KernelInfoState(bool BestState) {
731	if (!BestState)
732	indicatePessimisticFixpoint();
733	}
734
735	/// See AbstractState::isValidState(...)
736	bool isValidState() const override { return true; }
737
738	/// See AbstractState::isAtFixpoint(...)
739	bool isAtFixpoint() const override { return IsAtFixpoint; }
740
741	/// See AbstractState::indicatePessimisticFixpoint(...)
742	ChangeStatus indicatePessimisticFixpoint() override {
743	IsAtFixpoint = true;
744	ParallelLevels.indicatePessimisticFixpoint();
745	ReachingKernelEntries.indicatePessimisticFixpoint();
746	SPMDCompatibilityTracker.indicatePessimisticFixpoint();
747	ReachedKnownParallelRegions.indicatePessimisticFixpoint();
748	ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
749	NestedParallelism = true;
750	return ChangeStatus::CHANGED;
751	}
752
753	/// See AbstractState::indicateOptimisticFixpoint(...)
754	ChangeStatus indicateOptimisticFixpoint() override {
755	IsAtFixpoint = true;
756	ParallelLevels.indicateOptimisticFixpoint();
757	ReachingKernelEntries.indicateOptimisticFixpoint();
758	SPMDCompatibilityTracker.indicateOptimisticFixpoint();
759	ReachedKnownParallelRegions.indicateOptimisticFixpoint();
760	ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
761	return ChangeStatus::UNCHANGED;
762	}
763
764	/// Return the assumed state
765	KernelInfoState &getAssumed() { return *this; }
766	const KernelInfoState &getAssumed() const { return *this; }
767
768	bool operator==(const KernelInfoState &RHS) const {
769	if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
770	return false;
771	if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
772	return false;
773	if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
774	return false;
775	if (ReachingKernelEntries != RHS.ReachingKernelEntries)
776	return false;
777	if (ParallelLevels != RHS.ParallelLevels)
778	return false;
779	if (NestedParallelism != RHS.NestedParallelism)
780	return false;
781	return true;
782	}
783
784	/// Returns true if this kernel contains any OpenMP parallel regions.
785	bool mayContainParallelRegion() {
786	return !ReachedKnownParallelRegions.empty() \|\|
787	!ReachedUnknownParallelRegions.empty();
788	}
789
790	/// Return empty set as the best state of potential values.
791	static KernelInfoState getBestState() { return KernelInfoState (true); }
792
793	static KernelInfoState getBestState(KernelInfoState &KIS) {
794	return getBestState();
795	}
796
797	/// Return full set as the worst state of potential values.
798	static KernelInfoState getWorstState() { return KernelInfoState (false); }
799
800	/// "Clamp" this state with \p KIS.
801	KernelInfoState operator^=(const KernelInfoState &KIS) {
802	// Do not merge two different _init and _deinit call sites.
803	if (KIS.KernelInitCB) {
804	if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
805	llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
806	"assumptions.");
807	KernelInitCB = KIS.KernelInitCB;
808	}
809	if (KIS.KernelDeinitCB) {
810	if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
811	llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
812	"assumptions.");
813	KernelDeinitCB = KIS.KernelDeinitCB;
814	}
815	if (KIS.KernelEnvC) {
816	if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
817	llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
818	"assumptions.");
819	KernelEnvC = KIS.KernelEnvC;
820	}
821	SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
822	ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
823	ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
824	NestedParallelism \|= KIS.NestedParallelism;
825	return *this;
826	}
827
828	KernelInfoState operator&=(const KernelInfoState &KIS) {
829	return (*this ^= KIS);
830	}
831
832	///}
833	};
834
835	/// Used to map the values physically (in the IR) stored in an offload
836	/// array, to a vector in memory.
837	struct OffloadArray {
838	/// Physical array (in the IR).
839	AllocaInst Array = nullptr*;
840	/// Mapped values.
841	SmallVector<Value *, `8`> StoredValues;
842	/// Last stores made in the offload array.
843	SmallVector<StoreInst *, `8`> LastAccesses;
844
845	OffloadArray() = default;
846
847	/// Initializes the OffloadArray with the values stored in \p Array before
848	/// instruction \p Before is reached. Returns false if the initialization
849	/// fails.
850	/// This MUST be used immediately after the construction of the object.
851	bool initialize(AllocaInst &Array, Instruction &Before) {
852	if (!Array.getAllocatedType()->isArrayTy())
853	return false;
854
855	if (!getValues(Array, Before))
856	return false;
857
858	this->Array = &Array;
859	return true;
860	}
861
862	static const unsigned DeviceIDArgNum = `1`;
863	static const unsigned BasePtrsArgNum = `3`;
864	static const unsigned PtrsArgNum = `4`;
865	static const unsigned SizesArgNum = `5`;
866
867	private:
868	/// Traverses the BasicBlock where \p Array is, collecting the stores made to
869	/// \p Array, leaving StoredValues with the values stored before the
870	/// instruction \p Before is reached.
871	bool getValues(AllocaInst &Array, Instruction &Before) {
872	// Initialize container.
873	const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
874	StoredValues.assign(NumElts: NumValues, Elt: nullptr);
875	LastAccesses.assign(NumElts: NumValues, Elt: nullptr);
876
877	// TODO: This assumes the instruction \p Before is in the same
878	// BasicBlock as Array. Make it general, for any control flow graph.
879	BasicBlock *BB = Array.getParent();
880	if (BB != Before.getParent())
881	return false;
882
883	const DataLayout &DL = Array.getDataLayout();
884	const unsigned int PointerSize = DL.getPointerSize();
885
886	for (Instruction &I : *BB) {
887	if (&I == &Before)
888	break;
889
890	if (!isa<StoreInst>(Val: &I))
891	continue;
892
893	auto *S = cast<StoreInst>(Val: &I);
894	int64_t Offset = -`1`;
895	auto *Dst =
896	GetPointerBaseWithConstantOffset(Ptr: S->getPointerOperand(), Offset, DL);
897	if (Dst == &Array) {
898	int64_t Idx = Offset / PointerSize;
899	StoredValues [Idx] = getUnderlyingObject(V: S->getValueOperand());
900	LastAccesses [Idx] = S;
901	}
902	}
903
904	return isFilled();
905	}
906
907	/// Returns true if all values in StoredValues and
908	/// LastAccesses are not nullptrs.
909	bool isFilled() {
910	const unsigned NumValues = StoredValues.size();
911	for (unsigned I = `0`; I < NumValues; ++I) {
912	if (!StoredValues [I] \|\| !LastAccesses [I])
913	return false;
914	}
915
916	return true;
917	}
918	};
919
920	struct OpenMPOpt {
921
922	using OptimizationRemarkGetter =
923	function_ref<OptimizationRemarkEmitter &(Function *)>;
924
925	OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
926	OptimizationRemarkGetter OREGetter,
927	OMPInformationCache &OMPInfoCache, Attributor &A)
928	: M((SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
929	OREGetter (OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
930
931	/// Check if any remarks are enabled for openmp-opt
932	bool remarksEnabled() {
933	auto &Ctx = M.getContext();
934	return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
935	}
936
937	/// Run all OpenMP optimizations on the underlying SCC.
938	bool run(bool IsModulePass) {
939	if (SCC.empty())
940	return false;
941
942	bool Changed = false;
943
944	LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
945	<< " functions\n");
946
947	if (IsModulePass) {
948	Changed \|= runAttributor(IsModulePass);
949
950	// Recollect uses, in case Attributor deleted any.
951	OMPInfoCache.recollectUses();
952
953	// TODO: This should be folded into buildCustomStateMachine.
954	Changed \|= rewriteDeviceCodeStateMachine();
955
956	if (remarksEnabled())
957	analysisGlobalization();
958	} else {
959	if (PrintICVValues)
960	printICVs();
961	if (PrintOpenMPKernels)
962	printKernels();
963
964	Changed \|= runAttributor(IsModulePass);
965
966	// Recollect uses, in case Attributor deleted any.
967	OMPInfoCache.recollectUses();
968
969	Changed \|= deleteParallelRegions();
970
971	if (HideMemoryTransferLatency)
972	Changed \|= hideMemTransfersLatency();
973	Changed \|= deduplicateRuntimeCalls();
974	if (EnableParallelRegionMerging) {
975	if (mergeParallelRegions()) {
976	deduplicateRuntimeCalls();
977	Changed = true;
978	}
979	}
980	}
981
982	if (OMPInfoCache.OpenMPPostLink)
983	Changed \|= removeRuntimeSymbols();
984
985	return Changed;
986	}
987
988	/// Print initial ICV values for testing.
989	/// FIXME: This should be done from the Attributor once it is added.
990	void printICVs() const {
991	InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
992	ICV_proc_bind};
993
994	for (Function *F : SCC) {
995	for (auto ICV : ICVs) {
996	auto ICVInfo = OMPInfoCache.ICVs [ICV];
997	auto Remark = [&](OptimizationRemarkAnalysis ORA) {
998	return ORA << "OpenMP ICV " << ore::NV ("OpenMPICV", ICVInfo.Name)
999	<< " Value: "
1000	<< (ICVInfo.InitValue
1001	? toString(I: ICVInfo.InitValue->getValue(), Radix: `10`, Signed: true)
1002	: "IMPLEMENTATION_DEFINED");
1003	};
1004
1005	emitRemark<OptimizationRemarkAnalysis>(F, RemarkName: "OpenMPICVTracker", RemarkCB&: Remark);
1006	}
1007	}
1008	}
1009
1010	/// Print OpenMP GPU kernels for testing.
1011	void printKernels() const {
1012	for (Function *F : SCC) {
1013	if (!omp::isOpenMPKernel(Fn&: *F))
1014	continue;
1015
1016	auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1017	return ORA << "OpenMP GPU kernel "
1018	<< ore::NV ("OpenMPGPUKernel", F->getName()) << "\n";
1019	};
1020
1021	emitRemark<OptimizationRemarkAnalysis>(F, RemarkName: "OpenMPGPU", RemarkCB&: Remark);
1022	}
1023	}
1024
1025	/// Return the call if \p U is a callee use in a regular call. If \p RFI is
1026	/// given it has to be the callee or a nullptr is returned.
1027	static CallInst *getCallIfRegularCall(
1028	Use &U, OMPInformationCache::RuntimeFunctionInfo RFI = nullptr*) {
1029	CallInst *CI = dyn_cast<CallInst>(Val: U.getUser());
1030	if (CI && CI->isCallee(U: &U) && !CI->hasOperandBundles() &&
1031	(!RFI \|\|
1032	(RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
1033	return CI;
1034	return nullptr;
1035	}
1036
1037	/// Return the call if \p V is a regular call. If \p RFI is given it has to be
1038	/// the callee or a nullptr is returned.
1039	static CallInst *getCallIfRegularCall(
1040	Value &V, OMPInformationCache::RuntimeFunctionInfo RFI = nullptr*) {
1041	CallInst *CI = dyn_cast<CallInst>(Val: &V);
1042	if (CI && !CI->hasOperandBundles() &&
1043	(!RFI \|\|
1044	(RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
1045	return CI;
1046	return nullptr;
1047	}
1048
1049	private:
1050	/// Merge parallel regions when it is safe.
1051	bool mergeParallelRegions() {
1052	const unsigned CallbackCalleeOperand = `2`;
1053	const unsigned CallbackFirstArgOperand = `3`;
1054	using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
1055
1056	// Check if there are any __kmpc_fork_call calls to merge.
1057	OMPInformationCache::RuntimeFunctionInfo &RFI =
1058	OMPInfoCache.RFIs [OMPRTL___kmpc_fork_call];
1059
1060	if (!RFI.Declaration)
1061	return false;
1062
1063	// Unmergable calls that prevent merging a parallel region.
1064	OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
1065	OMPInfoCache.RFIs [OMPRTL___kmpc_push_proc_bind],
1066	OMPInfoCache.RFIs [OMPRTL___kmpc_push_num_threads],
1067	};
1068
1069	bool Changed = false;
1070	LoopInfo LI = nullptr*;
1071	DominatorTree DT = nullptr*;
1072
1073	SmallDenseMap<BasicBlock , SmallPtrSet<Instruction , `4`>> BB2PRMap;
1074
1075	BasicBlock StartBB = nullptr, EndBB = nullptr;
1076	auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
1077	BasicBlock *CGStartBB = CodeGenIP.getBlock();
1078	BasicBlock *CGEndBB =
1079	SplitBlock(Old: CGStartBB, SplitPt: &*CodeGenIP.getPoint(), DT, LI);
1080	assert(StartBB != nullptr && "StartBB should not be null");
1081	CGStartBB->getTerminator()->setSuccessor(Idx: `0`, BB: StartBB);
1082	assert(EndBB != nullptr && "EndBB should not be null");
1083	EndBB->getTerminator()->setSuccessor(Idx: `0`, BB: CGEndBB);
1084	};
1085
1086	auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
1087	Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
1088	ReplacementValue = &Inner;
1089	return CodeGenIP;
1090	};
1091
1092	auto FiniCB = [&](InsertPointTy CodeGenIP) {};
1093
1094	/// Create a sequential execution region within a merged parallel region,
1095	/// encapsulated in a master construct with a barrier for synchronization.
1096	auto CreateSequentialRegion = [&](Function *OuterFn,
1097	BasicBlock *OuterPredBB,
1098	Instruction *SeqStartI,
1099	Instruction *SeqEndI) {
1100	// Isolate the instructions of the sequential region to a separate
1101	// block.
1102	BasicBlock *ParentBB = SeqStartI->getParent();
1103	BasicBlock *SeqEndBB =
1104	SplitBlock(Old: ParentBB, SplitPt: SeqEndI->getNextNode(), DT, LI);
1105	BasicBlock *SeqAfterBB =
1106	SplitBlock(Old: SeqEndBB, SplitPt: &*SeqEndBB->getFirstInsertionPt(), DT, LI);
1107	BasicBlock *SeqStartBB =
1108	SplitBlock(Old: ParentBB, SplitPt: SeqStartI, DT, LI, MSSAU: nullptr, BBName: "seq.par.merged");
1109
1110	assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
1111	"Expected a different CFG");
1112	const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
1113	ParentBB->getTerminator()->eraseFromParent();
1114
1115	auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
1116	BasicBlock *CGStartBB = CodeGenIP.getBlock();
1117	BasicBlock *CGEndBB =
1118	SplitBlock(Old: CGStartBB, SplitPt: &*CodeGenIP.getPoint(), DT, LI);
1119	assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
1120	CGStartBB->getTerminator()->setSuccessor(Idx: `0`, BB: SeqStartBB);
1121	assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
1122	SeqEndBB->getTerminator()->setSuccessor(Idx: `0`, BB: CGEndBB);
1123	};
1124	auto FiniCB = [&](InsertPointTy CodeGenIP) {};
1125
1126	// Find outputs from the sequential region to outside users and
1127	// broadcast their values to them.
1128	for (Instruction &I : *SeqStartBB) {
1129	SmallPtrSet<Instruction *, `4`> OutsideUsers;
1130	for (User *Usr : I.users()) {
1131	Instruction &UsrI = *cast<Instruction>(Val: Usr);
1132	// Ignore outputs to LT intrinsics, code extraction for the merged
1133	// parallel region will fix them.
1134	if (UsrI.isLifetimeStartOrEnd())
1135	continue;
1136
1137	if (UsrI.getParent() != SeqStartBB)
1138	OutsideUsers.insert(Ptr: &UsrI);
1139	}
1140
1141	if (OutsideUsers.empty())
1142	continue;
1143
1144	// Emit an alloca in the outer region to store the broadcasted
1145	// value.
1146	const DataLayout &DL = M.getDataLayout();
1147	AllocaInst AllocaI = new* AllocaInst (
1148	I.getType(), DL.getAllocaAddrSpace(), nullptr,
1149	I.getName() + ".seq.output.alloc", OuterFn->front().begin());
1150
1151	// Emit a store instruction in the sequential BB to update the
1152	// value.
1153	new StoreInst (&I, AllocaI, SeqStartBB->getTerminator()->getIterator());
1154
1155	// Emit a load instruction and replace the use of the output value
1156	// with it.
1157	for (Instruction *UsrI : OutsideUsers) {
1158	LoadInst LoadI = new* LoadInst (I.getType(), AllocaI,
1159	I.getName() + ".seq.output.load",
1160	UsrI->getIterator());
1161	UsrI->replaceUsesOfWith(From: &I, To: LoadI);
1162	}
1163	}
1164
1165	OpenMPIRBuilder::LocationDescription Loc(
1166	InsertPointTy (ParentBB, ParentBB->end()), DL);
1167	InsertPointTy SeqAfterIP =
1168	OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
1169
1170	OMPInfoCache.OMPBuilder.createBarrier(Loc: SeqAfterIP, Kind: OMPD_parallel);
1171
1172	BranchInst::Create(IfTrue: SeqAfterBB, InsertBefore: SeqAfterIP.getBlock());
1173
1174	LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
1175	<< "\n");
1176	};
1177
1178	// Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
1179	// contained in BB and only separated by instructions that can be
1180	// redundantly executed in parallel. The block BB is split before the first
1181	// call (in MergableCIs) and after the last so the entire region we merge
1182	// into a single parallel region is contained in a single basic block
1183	// without any other instructions. We use the OpenMPIRBuilder to outline
1184	// that block and call the resulting function via __kmpc_fork_call.
1185	auto Merge = [&](const SmallVectorImpl<CallInst *> &MergableCIs,
1186	BasicBlock *BB) {
1187	// TODO: Change the interface to allow single CIs expanded, e.g, to
1188	// include an outer loop.
1189	assert(MergableCIs.size() > `1` && "Assumed multiple mergable CIs");
1190
1191	auto Remark = [&](OptimizationRemark OR) {
1192	OR << "Parallel region merged with parallel region"
1193	<< (MergableCIs.size() > `2` ? "s" : "") << " at ";
1194	for (auto *CI : llvm::drop_begin(RangeOrContainer: MergableCIs)) {
1195	OR << ore::NV ("OpenMPParallelMerge", CI->getDebugLoc());
1196	if (CI != MergableCIs.back())
1197	OR << ", ";
1198	}
1199	return OR << ".";
1200	};
1201
1202	emitRemark<OptimizationRemark>(I: MergableCIs.front(), RemarkName: "OMP150", RemarkCB&: Remark);
1203
1204	Function *OriginalFn = BB->getParent();
1205	LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
1206	<< " parallel regions in " << OriginalFn->getName()
1207	<< "\n");
1208
1209	// Isolate the calls to merge in a separate block.
1210	EndBB = SplitBlock(Old: BB, SplitPt: MergableCIs.back()->getNextNode(), DT, LI);
1211	BasicBlock *AfterBB =
1212	SplitBlock(Old: EndBB, SplitPt: &*EndBB->getFirstInsertionPt(), DT, LI);
1213	StartBB = SplitBlock(Old: BB, SplitPt: MergableCIs.front(), DT, LI, MSSAU: nullptr,
1214	BBName: "omp.par.merged");
1215
1216	assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
1217	const DebugLoc DL = BB->getTerminator()->getDebugLoc();
1218	BB->getTerminator()->eraseFromParent();
1219
1220	// Create sequential regions for sequential instructions that are
1221	// in-between mergable parallel regions.
1222	for (auto It = MergableCIs.begin(), End = MergableCIs.end() - `1`;
1223	It != End; ++It) {
1224	Instruction ForkCI = It;
1225	Instruction NextForkCI = (It + `1`);
1226
1227	// Continue if there are not in-between instructions.
1228	if (ForkCI->getNextNode() == NextForkCI)
1229	continue;
1230
1231	CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
1232	NextForkCI->getPrevNode());
1233	}
1234
1235	OpenMPIRBuilder::LocationDescription Loc(InsertPointTy (BB, BB->end()),
1236	DL);
1237	IRBuilder<>::InsertPoint AllocaIP(
1238	&OriginalFn->getEntryBlock(),
1239	OriginalFn->getEntryBlock().getFirstInsertionPt());
1240	// Create the merged parallel region with default proc binding, to
1241	// avoid overriding binding settings, and without explicit cancellation.
1242	InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
1243	Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, IfCondition: nullptr, NumThreads: nullptr,
1244	ProcBind: OMP_PROC_BIND_default, / IsCancellable / false);
1245	BranchInst::Create(IfTrue: AfterBB, InsertBefore: AfterIP.getBlock());
1246
1247	// Perform the actual outlining.
1248	OMPInfoCache.OMPBuilder.finalize(Fn: OriginalFn);
1249
1250	Function *OutlinedFn = MergableCIs.front()->getCaller();
1251
1252	// Replace the __kmpc_fork_call calls with direct calls to the outlined
1253	// callbacks.
1254	SmallVector<Value *, `8`> Args;
1255	for (auto *CI : MergableCIs) {
1256	Value *Callee = CI->getArgOperand(i: CallbackCalleeOperand);
1257	FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
1258	Args.clear();
1259	Args.push_back(Elt: OutlinedFn->getArg(i: `0`));
1260	Args.push_back(Elt: OutlinedFn->getArg(i: `1`));
1261	for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
1262	++U)
1263	Args.push_back(Elt: CI->getArgOperand(i: U));
1264
1265	CallInst *NewCI =
1266	CallInst::Create(Ty: FT, Func: Callee, Args, NameStr: "", InsertBefore: CI->getIterator());
1267	if (CI->getDebugLoc())
1268	NewCI->setDebugLoc(CI->getDebugLoc());
1269
1270	// Forward parameter attributes from the callback to the callee.
1271	for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
1272	++U)
1273	for (const Attribute &A : CI->getAttributes().getParamAttrs(ArgNo: U))
1274	NewCI->addParamAttr(
1275	ArgNo: U - (CallbackFirstArgOperand - CallbackCalleeOperand), Attr: A);
1276
1277	// Emit an explicit barrier to replace the implicit fork-join barrier.
1278	if (CI != MergableCIs.back()) {
1279	// TODO: Remove barrier if the merged parallel region includes the
1280	// 'nowait' clause.
1281	OMPInfoCache.OMPBuilder.createBarrier(
1282	Loc: InsertPointTy (NewCI->getParent(),
1283	NewCI->getNextNode()->getIterator()),
1284	Kind: OMPD_parallel);
1285	}
1286
1287	CI->eraseFromParent();
1288	}
1289
1290	assert(OutlinedFn != OriginalFn && "Outlining failed");
1291	CGUpdater.registerOutlinedFunction(OriginalFn&: OriginalFn, NewFn&: OutlinedFn);
1292	CGUpdater.reanalyzeFunction(Fn&: *OriginalFn);
1293
1294	NumOpenMPParallelRegionsMerged += MergableCIs.size();
1295
1296	return true;
1297	};
1298
1299	// Helper function that identifes sequences of
1300	// __kmpc_fork_call uses in a basic block.
1301	auto DetectPRsCB = [&](Use &U, Function &F) {
1302	CallInst *CI = getCallIfRegularCall(U, RFI: &RFI);
1303	BB2PRMap [CI->getParent()].insert(Ptr: CI);
1304
1305	return false;
1306	};
1307
1308	BB2PRMap.clear();
1309	RFI.foreachUse(SCC, CB: DetectPRsCB);
1310	SmallVector<SmallVector<CallInst *, `4`>, `4`> MergableCIsVector;
1311	// Find mergable parallel regions within a basic block that are
1312	// safe to merge, that is any in-between instructions can safely
1313	// execute in parallel after merging.
1314	// TODO: support merging across basic-blocks.
1315	for (auto &It : BB2PRMap) {
1316	auto &CIs = It.getSecond();
1317	if (CIs.size() < `2`)
1318	continue;
1319
1320	BasicBlock *BB = It.getFirst();
1321	SmallVector<CallInst *, `4`> MergableCIs;
1322
1323	/// Returns true if the instruction is mergable, false otherwise.
1324	/// A terminator instruction is unmergable by definition since merging
1325	/// works within a BB. Instructions before the mergable region are
1326	/// mergable if they are not calls to OpenMP runtime functions that may
1327	/// set different execution parameters for subsequent parallel regions.
1328	/// Instructions in-between parallel regions are mergable if they are not
1329	/// calls to any non-intrinsic function since that may call a non-mergable
1330	/// OpenMP runtime function.
1331	auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
1332	// We do not merge across BBs, hence return false (unmergable) if the
1333	// instruction is a terminator.
1334	if (I.isTerminator())
1335	return false;
1336
1337	if (!isa<CallInst>(Val: &I))
1338	return true;
1339
1340	CallInst *CI = cast<CallInst>(Val: &I);
1341	if (IsBeforeMergableRegion) {
1342	Function *CalledFunction = CI->getCalledFunction();
1343	if (!CalledFunction)
1344	return false;
1345	// Return false (unmergable) if the call before the parallel
1346	// region calls an explicit affinity (proc_bind) or number of
1347	// threads (num_threads) compiler-generated function. Those settings
1348	// may be incompatible with following parallel regions.
1349	// TODO: ICV tracking to detect compatibility.
1350	for (const auto &RFI : UnmergableCallsInfo) {
1351	if (CalledFunction == RFI.Declaration)
1352	return false;
1353	}
1354	} else {
1355	// Return false (unmergable) if there is a call instruction
1356	// in-between parallel regions when it is not an intrinsic. It
1357	// may call an unmergable OpenMP runtime function in its callpath.
1358	// TODO: Keep track of possible OpenMP calls in the callpath.
1359	if (!isa<IntrinsicInst>(Val: CI))
1360	return false;
1361	}
1362
1363	return true;
1364	};
1365	// Find maximal number of parallel region CIs that are safe to merge.
1366	for (auto It = BB->begin(), End = BB->end(); It != End;) {
1367	Instruction &I = *It;
1368	++It;
1369
1370	if (CIs.count(Ptr: &I)) {
1371	MergableCIs.push_back(Elt: cast<CallInst>(Val: &I));
1372	continue;
1373	}
1374
1375	// Continue expanding if the instruction is mergable.
1376	if (IsMergable(I, MergableCIs.empty()))
1377	continue;
1378
1379	// Forward the instruction iterator to skip the next parallel region
1380	// since there is an unmergable instruction which can affect it.
1381	for (; It != End; ++It) {
1382	Instruction &SkipI = *It;
1383	if (CIs.count(Ptr: &SkipI)) {
1384	LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
1385	<< " due to " << I << "\n");
1386	++It;
1387	break;
1388	}
1389	}
1390
1391	// Store mergable regions found.
1392	if (MergableCIs.size() > `1`) {
1393	MergableCIsVector.push_back(Elt: MergableCIs);
1394	LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
1395	<< " parallel regions in block " << BB->getName()
1396	<< " of function " << BB->getParent()->getName()
1397	<< "\n";);
1398	}
1399
1400	MergableCIs.clear();
1401	}
1402
1403	if (!MergableCIsVector.empty()) {
1404	Changed = true;
1405
1406	for (auto &MergableCIs : MergableCIsVector)
1407	Merge(MergableCIs, BB);
1408	MergableCIsVector.clear();
1409	}
1410	}
1411
1412	if (Changed) {
1413	/// Re-collect use for fork calls, emitted barrier calls, and
1414	/// any emitted master/end_master calls.
1415	OMPInfoCache.recollectUsesForFunction(RTF: OMPRTL___kmpc_fork_call);
1416	OMPInfoCache.recollectUsesForFunction(RTF: OMPRTL___kmpc_barrier);
1417	OMPInfoCache.recollectUsesForFunction(RTF: OMPRTL___kmpc_master);
1418	OMPInfoCache.recollectUsesForFunction(RTF: OMPRTL___kmpc_end_master);
1419	}
1420
1421	return Changed;
1422	}
1423
1424	/// Try to delete parallel regions if possible.
1425	bool deleteParallelRegions() {
1426	const unsigned CallbackCalleeOperand = `2`;
1427
1428	OMPInformationCache::RuntimeFunctionInfo &RFI =
1429	OMPInfoCache.RFIs [OMPRTL___kmpc_fork_call];
1430
1431	if (!RFI.Declaration)
1432	return false;
1433
1434	bool Changed = false;
1435	auto DeleteCallCB = [&](Use &U, Function &) {
1436	CallInst *CI = getCallIfRegularCall(U);
1437	if (!CI)
1438	return false;
1439	auto *Fn = dyn_cast<Function>(
1440	Val: CI->getArgOperand(i: CallbackCalleeOperand)->stripPointerCasts());
1441	if (!Fn)
1442	return false;
1443	if (!Fn->onlyReadsMemory())
1444	return false;
1445	if (!Fn->hasFnAttribute(Kind: Attribute::WillReturn))
1446	return false;
1447
1448	LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
1449	<< CI->getCaller()->getName() << "\n");
1450
1451	auto Remark = [&](OptimizationRemark OR) {
1452	return OR << "Removing parallel region with no side-effects.";
1453	};
1454	emitRemark<OptimizationRemark>(I: CI, RemarkName: "OMP160", RemarkCB&: Remark);
1455
1456	CI->eraseFromParent();
1457	Changed = true;
1458	++NumOpenMPParallelRegionsDeleted;
1459	return true;
1460	};
1461
1462	RFI.foreachUse(SCC, CB: DeleteCallCB);
1463
1464	return Changed;
1465	}
1466
1467	/// Try to eliminate runtime calls by reusing existing ones.
1468	bool deduplicateRuntimeCalls() {
1469	bool Changed = false;
1470
1471	RuntimeFunction DeduplicableRuntimeCallIDs[] = {
1472	OMPRTL_omp_get_num_threads,
1473	OMPRTL_omp_in_parallel,
1474	OMPRTL_omp_get_cancellation,
1475	OMPRTL_omp_get_supported_active_levels,
1476	OMPRTL_omp_get_level,
1477	OMPRTL_omp_get_ancestor_thread_num,
1478	OMPRTL_omp_get_team_size,
1479	OMPRTL_omp_get_active_level,
1480	OMPRTL_omp_in_final,
1481	OMPRTL_omp_get_proc_bind,
1482	OMPRTL_omp_get_num_places,
1483	OMPRTL_omp_get_num_procs,
1484	OMPRTL_omp_get_place_num,
1485	OMPRTL_omp_get_partition_num_places,
1486	OMPRTL_omp_get_partition_place_nums};
1487
1488	// Global-tid is handled separately.
1489	SmallSetVector<Value *, `16`> GTIdArgs;
1490	collectGlobalThreadIdArguments(GTIdArgs);
1491	LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
1492	<< " global thread ID arguments\n");
1493
1494	for (Function *F : SCC) {
1495	for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1496	Changed \|= deduplicateRuntimeCalls(
1497	F&: *F, RFI&: OMPInfoCache.RFIs [DeduplicableRuntimeCallID]);
1498
1499	// __kmpc_global_thread_num is special as we can replace it with an
1500	// argument in enough cases to make it worth trying.
1501	Value GTIdArg = nullptr*;
1502	for (Argument &Arg : F->args())
1503	if (GTIdArgs.count(key: &Arg)) {
1504	GTIdArg = &Arg;
1505	break;
1506	}
1507	Changed \|= deduplicateRuntimeCalls(
1508	F&: *F, RFI&: OMPInfoCache.RFIs [OMPRTL___kmpc_global_thread_num], ReplVal: GTIdArg);
1509	}
1510
1511	return Changed;
1512	}
1513
1514	/// Tries to remove known runtime symbols that are optional from the module.
1515	bool removeRuntimeSymbols() {
1516	// The RPC client symbol is defined in `libc` and indicates that something
1517	// required an RPC server. If its users were all optimized out then we can
1518	// safely remove it.
1519	// TODO: This should be somewhere more common in the future.
1520	if (GlobalVariable *GV = M.getNamedGlobal(Name: "__llvm_libc_rpc_client")) {
1521	if (!GV->getType()->isPointerTy())
1522	return false;
1523
1524	Constant *C = GV->getInitializer();
1525	if (!C)
1526	return false;
1527
1528	// Check to see if the only user of the RPC client is the external handle.
1529	GlobalVariable *Client = dyn_cast<GlobalVariable>(Val: C->stripPointerCasts());
1530	if (!Client \|\| Client->getNumUses() > `1` \|\|
1531	Client->user_back() != GV->getInitializer())
1532	return false;
1533
1534	Client->replaceAllUsesWith(V: PoisonValue::get(T: Client->getType()));
1535	Client->eraseFromParent();
1536
1537	GV->replaceAllUsesWith(V: PoisonValue::get(T: GV->getType()));
1538	GV->eraseFromParent();
1539
1540	return true;
1541	}
1542	return false;
1543	}
1544
1545	/// Tries to hide the latency of runtime calls that involve host to
1546	/// device memory transfers by splitting them into their "issue" and "wait"
1547	/// versions. The "issue" is moved upwards as much as possible. The "wait" is
1548	/// moved downards as much as possible. The "issue" issues the memory transfer
1549	/// asynchronously, returning a handle. The "wait" waits in the returned
1550	/// handle for the memory transfer to finish.
1551	bool hideMemTransfersLatency() {
1552	auto &RFI = OMPInfoCache.RFIs [OMPRTL___tgt_target_data_begin_mapper];
1553	bool Changed = false;
1554	auto SplitMemTransfers = [&](Use &U, Function &Decl) {
1555	auto *RTCall = getCallIfRegularCall(U, RFI: &RFI);
1556	if (!RTCall)
1557	return false;
1558
1559	OffloadArray OffloadArrays[`3`];
1560	if (!getValuesInOffloadArrays(RuntimeCall&: *RTCall, OAs: OffloadArrays))
1561	return false;
1562
1563	LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
1564
1565	// TODO: Check if can be moved upwards.
1566	bool WasSplit = false;
1567	Instruction WaitMovementPoint = canBeMovedDownwards(RuntimeCall&: RTCall);
1568	if (WaitMovementPoint)
1569	WasSplit = splitTargetDataBeginRTC(RuntimeCall&: RTCall, WaitMovementPoint&: WaitMovementPoint);
1570
1571	Changed \|= WasSplit;
1572	return WasSplit;
1573	};
1574	if (OMPInfoCache.runtimeFnsAvailable(
1575	Fns: {OMPRTL___tgt_target_data_begin_mapper_issue,
1576	OMPRTL___tgt_target_data_begin_mapper_wait}))
1577	RFI.foreachUse(SCC, CB: SplitMemTransfers);
1578
1579	return Changed;
1580	}
1581
1582	void analysisGlobalization() {
1583	auto &RFI = OMPInfoCache.RFIs [OMPRTL___kmpc_alloc_shared];
1584
1585	auto CheckGlobalization = [&](Use &U, Function &Decl) {
1586	if (CallInst *CI = getCallIfRegularCall(U, RFI: &RFI)) {
1587	auto Remark = [&](OptimizationRemarkMissed ORM) {
1588	return ORM
1589	<< "Found thread data sharing on the GPU. "
1590	<< "Expect degraded performance due to data globalization.";
1591	};
1592	emitRemark<OptimizationRemarkMissed>(I: CI, RemarkName: "OMP112", RemarkCB&: Remark);
1593	}
1594
1595	return false;
1596	};
1597
1598	RFI.foreachUse(SCC, CB: CheckGlobalization);
1599	}
1600
1601	/// Maps the values stored in the offload arrays passed as arguments to
1602	/// \p RuntimeCall into the offload arrays in \p OAs.
1603	bool getValuesInOffloadArrays(CallInst &RuntimeCall,
1604	MutableArrayRef<OffloadArray> OAs) {
1605	assert(OAs.size() == `3` && "Need space for three offload arrays!");
1606
1607	// A runtime call that involves memory offloading looks something like:
1608	// call void @__tgt_target_data_begin_mapper(arg0, arg1,
1609	// i8* %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,*
1610	// ...)
1611	// So, the idea is to access the allocas that allocate space for these
1612	// offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
1613	// Therefore:
1614	// i8* %offload_baseptrs.*
1615	Value *BasePtrsArg =
1616	RuntimeCall.getArgOperand(i: OffloadArray::BasePtrsArgNum);
1617	// i8* %offload_ptrs.*
1618	Value *PtrsArg = RuntimeCall.getArgOperand(i: OffloadArray::PtrsArgNum);
1619	// i8* %offload_sizes.*
1620	Value *SizesArg = RuntimeCall.getArgOperand(i: OffloadArray::SizesArgNum);
1621
1622	// Get values stored in offload_baseptrs.
1623	auto *V = getUnderlyingObject(V: BasePtrsArg);
1624	if (!isa<AllocaInst>(Val: V))
1625	return false;
1626	auto *BasePtrsArray = cast<AllocaInst>(Val: V);
1627	if (!OAs [`0`].initialize(Array&: *BasePtrsArray, Before&: RuntimeCall))
1628	return false;
1629
1630	// Get values stored in offload_baseptrs.
1631	V = getUnderlyingObject(V: PtrsArg);
1632	if (!isa<AllocaInst>(Val: V))
1633	return false;
1634	auto *PtrsArray = cast<AllocaInst>(Val: V);
1635	if (!OAs [`1`].initialize(Array&: *PtrsArray, Before&: RuntimeCall))
1636	return false;
1637
1638	// Get values stored in offload_sizes.
1639	V = getUnderlyingObject(V: SizesArg);
1640	// If it's a [constant] global array don't analyze it.
1641	if (isa<GlobalValue>(Val: V))
1642	return isa<Constant>(Val: V);
1643	if (!isa<AllocaInst>(Val: V))
1644	return false;
1645
1646	auto *SizesArray = cast<AllocaInst>(Val: V);
1647	if (!OAs [`2`].initialize(Array&: *SizesArray, Before&: RuntimeCall))
1648	return false;
1649
1650	return true;
1651	}
1652
1653	/// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
1654	/// For now this is a way to test that the function getValuesInOffloadArrays
1655	/// is working properly.
1656	/// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
1657	void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
1658	assert(OAs.size() == `3` && "There are three offload arrays to debug!");
1659
1660	LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
1661	std::string ValuesStr;
1662	raw_string_ostream Printer(ValuesStr);
1663	std::string Separator = " --- ";
1664
1665	for (auto *BP : OAs [`0`].StoredValues) {
1666	BP->print(O&: Printer);
1667	Printer << Separator;
1668	}
1669	LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << ValuesStr << "\n");
1670	ValuesStr.clear();
1671
1672	for (auto *P : OAs [`1`].StoredValues) {
1673	P->print(O&: Printer);
1674	Printer << Separator;
1675	}
1676	LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << ValuesStr << "\n");
1677	ValuesStr.clear();
1678
1679	for (auto *S : OAs [`2`].StoredValues) {
1680	S->print(O&: Printer);
1681	Printer << Separator;
1682	}
1683	LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << ValuesStr << "\n");
1684	}
1685
1686	/// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
1687	/// moved. Returns nullptr if the movement is not possible, or not worth it.
1688	Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
1689	// FIXME: This traverses only the BasicBlock where RuntimeCall is.
1690	// Make it traverse the CFG.
1691
1692	Instruction *CurrentI = &RuntimeCall;
1693	bool IsWorthIt = false;
1694	while ((CurrentI = CurrentI->getNextNode())) {
1695
1696	// TODO: Once we detect the regions to be offloaded we should use the
1697	// alias analysis manager to check if CurrentI may modify one of
1698	// the offloaded regions.
1699	if (CurrentI->mayHaveSideEffects() \|\| CurrentI->mayReadFromMemory()) {
1700	if (IsWorthIt)
1701	return CurrentI;
1702
1703	return nullptr;
1704	}
1705
1706	// FIXME: For now if we move it over anything without side effect
1707	// is worth it.
1708	IsWorthIt = true;
1709	}
1710
1711	// Return end of BasicBlock.
1712	return RuntimeCall.getParent()->getTerminator();
1713	}
1714
1715	/// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
1716	bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
1717	Instruction &WaitMovementPoint) {
1718	// Create stack allocated handle (__tgt_async_info) at the beginning of the
1719	// function. Used for storing information of the async transfer, allowing to
1720	// wait on it later.
1721	auto &IRBuilder = OMPInfoCache.OMPBuilder;
1722	Function *F = RuntimeCall.getCaller();
1723	BasicBlock &Entry = F->getEntryBlock();
1724	IRBuilder.Builder.SetInsertPoint(TheBB: &Entry,
1725	IP: Entry.getFirstNonPHIOrDbgOrAlloca());
1726	Value *Handle = IRBuilder.Builder.CreateAlloca(
1727	Ty: IRBuilder.AsyncInfo, /ArraySize=/nullptr, Name: "handle");
1728	Handle =
1729	IRBuilder.Builder.CreateAddrSpaceCast(V: Handle, DestTy: IRBuilder.AsyncInfoPtr);
1730
1731	// Add "issue" runtime call declaration:
1732	// declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
1733	// i8, i8, i64, i64)
1734	FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
1735	M, FnID: OMPRTL___tgt_target_data_begin_mapper_issue);
1736
1737	// Change RuntimeCall call site for its asynchronous version.
1738	SmallVector<Value *, `16`> Args;
1739	for (auto &Arg : RuntimeCall.args())
1740	Args.push_back(Elt: Arg.get());
1741	Args.push_back(Elt: Handle);
1742
1743	CallInst IssueCallsite = CallInst::Create(Func: IssueDecl, Args, /NameStr=/*"",
1744	InsertBefore: RuntimeCall.getIterator());
1745	OMPInfoCache.setCallingConvention(Callee: IssueDecl, CI: IssueCallsite);
1746	RuntimeCall.eraseFromParent();
1747
1748	// Add "wait" runtime call declaration:
1749	// declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
1750	FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
1751	M, FnID: OMPRTL___tgt_target_data_begin_mapper_wait);
1752
1753	Value *WaitParams[`2`] = {
1754	IssueCallsite->getArgOperand(
1755	i: OffloadArray::DeviceIDArgNum), // device_id.
1756	Handle // handle to wait on.
1757	};
1758	CallInst *WaitCallsite = CallInst::Create(
1759	Func: WaitDecl, Args: WaitParams, /NameStr=/"", InsertBefore: WaitMovementPoint.getIterator());
1760	OMPInfoCache.setCallingConvention(Callee: WaitDecl, CI: WaitCallsite);
1761
1762	return true;
1763	}
1764
1765	static Value combinedIdentStruct(Value CurrentIdent, Value *NextIdent,
1766	bool GlobalOnly, bool &SingleChoice) {
1767	if (CurrentIdent == NextIdent)
1768	return CurrentIdent;
1769
1770	// TODO: Figure out how to actually combine multiple debug locations. For
1771	// now we just keep an existing one if there is a single choice.
1772	if (!GlobalOnly \|\| isa<GlobalValue>(Val: NextIdent)) {
1773	SingleChoice = !CurrentIdent;
1774	return NextIdent;
1775	}
1776	return nullptr;
1777	}
1778
1779	/// Return an `struct ident_t` value that represents the ones used in the*
1780	/// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
1781	/// return a local `struct ident_t`. For now, if we cannot find a suitable*
1782	/// return value we create one from scratch. We also do not yet combine
1783	/// information, e.g., the source locations, see combinedIdentStruct.
1784	Value *
1785	getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1786	Function &F, bool GlobalOnly) {
1787	bool SingleChoice = true;
1788	Value Ident = nullptr*;
1789	auto CombineIdentStruct = [&](Use &U, Function &Caller) {
1790	CallInst *CI = getCallIfRegularCall(U, RFI: &RFI);
1791	if (!CI \|\| &F != &Caller)
1792	return false;
1793	Ident = combinedIdentStruct(CurrentIdent: Ident, NextIdent: CI->getArgOperand(i: `0`),
1794	/ GlobalOnly / true, SingleChoice);
1795	return false;
1796	};
1797	RFI.foreachUse(SCC, CB: CombineIdentStruct);
1798
1799	if (!Ident \|\| !SingleChoice) {
1800	// The IRBuilder uses the insertion block to get to the module, this is
1801	// unfortunate but we work around it for now.
1802	if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1803	OMPInfoCache.OMPBuilder.updateToLocation(Loc: OpenMPIRBuilder::InsertPointTy (
1804	&F.getEntryBlock(), F.getEntryBlock().begin()));
1805	// Create a fallback location if non was found.
1806	// TODO: Use the debug locations of the calls instead.
1807	uint32_t SrcLocStrSize;
1808	Constant *Loc =
1809	OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1810	Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr: Loc, SrcLocStrSize);
1811	}
1812	return Ident;
1813	}
1814
1815	/// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
1816	/// \p ReplVal if given.
1817	bool deduplicateRuntimeCalls(Function &F,
1818	OMPInformationCache::RuntimeFunctionInfo &RFI,
1819	Value ReplVal = nullptr*) {
1820	auto *UV = RFI.getUseVector(F);
1821	if (!UV \|\| UV->size() + (ReplVal != nullptr) < `2`)
1822	return false;
1823
1824	LLVM_DEBUG(
1825	dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
1826	<< (ReplVal ? " with an existing value\n" : "\n") << "\n");
1827
1828	assert((!ReplVal \|\| (isa<Argument>(ReplVal) &&
1829	cast<Argument>(ReplVal)->getParent() == &F)) &&
1830	"Unexpected replacement value!");
1831
1832	// TODO: Use dominance to find a good position instead.
1833	auto CanBeMoved = [this](CallBase &CB) {
1834	unsigned NumArgs = CB.arg_size();
1835	if (NumArgs == `0`)
1836	return true;
1837	if (CB.getArgOperand(i: `0`)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1838	return false;
1839	for (unsigned U = `1`; U < NumArgs; ++U)
1840	if (isa<Instruction>(Val: CB.getArgOperand(i: U)))
1841	return false;
1842	return true;
1843	};
1844
1845	if (!ReplVal) {
1846	auto *DT =
1847	OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F);
1848	if (!DT)
1849	return false;
1850	Instruction IP = nullptr*;
1851	for (Use U : UV) {
1852	if (CallInst CI = getCallIfRegularCall(U&: U, RFI: &RFI)) {
1853	if (IP)
1854	IP = DT->findNearestCommonDominator(I1: IP, I2: CI);
1855	else
1856	IP = CI;
1857	if (!CanBeMoved(*CI))
1858	continue;
1859	if (!ReplVal)
1860	ReplVal = CI;
1861	}
1862	}
1863	if (!ReplVal)
1864	return false;
1865	assert(IP && "Expected insertion point!");
1866	cast<Instruction>(Val: ReplVal)->moveBefore(MovePos: IP);
1867	}
1868
1869	// If we use a call as a replacement value we need to make sure the ident is
1870	// valid at the new location. For now we just pick a global one, either
1871	// existing and used by one of the calls, or created from scratch.
1872	if (CallBase *CI = dyn_cast<CallBase>(Val: ReplVal)) {
1873	if (!CI->arg_empty() &&
1874	CI->getArgOperand(i: `0`)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
1875	Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
1876	/ GlobalOnly / true);
1877	CI->setArgOperand(i: `0`, v: Ident);
1878	}
1879	}
1880
1881	bool Changed = false;
1882	auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
1883	CallInst *CI = getCallIfRegularCall(U, RFI: &RFI);
1884	if (!CI \|\| CI == ReplVal \|\| &F != &Caller)
1885	return false;
1886	assert(CI->getCaller() == &F && "Unexpected call!");
1887
1888	auto Remark = [&](OptimizationRemark OR) {
1889	return OR << "OpenMP runtime call "
1890	<< ore::NV ("OpenMPOptRuntime", RFI.Name) << " deduplicated.";
1891	};
1892	if (CI->getDebugLoc())
1893	emitRemark<OptimizationRemark>(I: CI, RemarkName: "OMP170", RemarkCB&: Remark);
1894	else
1895	emitRemark<OptimizationRemark>(F: &F, RemarkName: "OMP170", RemarkCB&: Remark);
1896
1897	CI->replaceAllUsesWith(V: ReplVal);
1898	CI->eraseFromParent();
1899	++NumOpenMPRuntimeCallsDeduplicated;
1900	Changed = true;
1901	return true;
1902	};
1903	RFI.foreachUse(SCC, CB: ReplaceAndDeleteCB);
1904
1905	return Changed;
1906	}
1907
1908	/// Collect arguments that represent the global thread id in \p GTIdArgs.
1909	void collectGlobalThreadIdArguments(SmallSetVector<Value *, `16`> &GTIdArgs) {
1910	// TODO: Below we basically perform a fixpoint iteration with a pessimistic
1911	// initialization. We could define an AbstractAttribute instead and
1912	// run the Attributor here once it can be run as an SCC pass.
1913
1914	// Helper to check the argument \p ArgNo at all call sites of \p F for
1915	// a GTId.
1916	auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
1917	if (!F.hasLocalLinkage())
1918	return false;
1919	for (Use &U : F.uses()) {
1920	if (CallInst *CI = getCallIfRegularCall(U)) {
1921	Value *ArgOp = CI->getArgOperand(i: ArgNo);
1922	if (CI == &RefCI \|\| GTIdArgs.count(key: ArgOp) \|\|
1923	getCallIfRegularCall(
1924	V&: *ArgOp, RFI: &OMPInfoCache.RFIs [OMPRTL___kmpc_global_thread_num]))
1925	continue;
1926	}
1927	return false;
1928	}
1929	return true;
1930	};
1931
1932	// Helper to identify uses of a GTId as GTId arguments.
1933	auto AddUserArgs = [&](Value &GTId) {
1934	for (Use &U : GTId.uses())
1935	if (CallInst *CI = dyn_cast<CallInst>(Val: U.getUser()))
1936	if (CI->isArgOperand(U: &U))
1937	if (Function *Callee = CI->getCalledFunction())
1938	if (CallArgOpIsGTId(Callee, U.getOperandNo(), CI))
1939	GTIdArgs.insert(X: Callee->getArg(i: U.getOperandNo()));
1940	};
1941
1942	// The argument users of __kmpc_global_thread_num calls are GTIds.
1943	OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1944	OMPInfoCache.RFIs [OMPRTL___kmpc_global_thread_num];
1945
1946	GlobThreadNumRFI.foreachUse(SCC, CB: [&](Use &U, Function &F) {
1947	if (CallInst *CI = getCallIfRegularCall(U, RFI: &GlobThreadNumRFI))
1948	AddUserArgs(*CI);
1949	return false;
1950	});
1951
1952	// Transitively search for more arguments by looking at the users of the
1953	// ones we know already. During the search the GTIdArgs vector is extended
1954	// so we cannot cache the size nor can we use a range based for.
1955	for (unsigned U = `0`; U < GTIdArgs.size(); ++U)
1956	AddUserArgs(*GTIdArgs [U]);
1957	}
1958
1959	/// Kernel (=GPU) optimizations and utility functions
1960	///
1961	///{{
1962
1963	/// Cache to remember the unique kernel for a function.
1964	DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap;
1965
1966	/// Find the unique kernel that will execute \p F, if any.
1967	Kernel getUniqueKernelFor(Function &F);
1968
1969	/// Find the unique kernel that will execute \p I, if any.
1970	Kernel getUniqueKernelFor(Instruction &I) {
1971	return getUniqueKernelFor(F&: *I.getFunction());
1972	}
1973
1974	/// Rewrite the device (=GPU) code state machine create in non-SPMD mode in
1975	/// the cases we can avoid taking the address of a function.
1976	bool rewriteDeviceCodeStateMachine();
1977
1978	///
1979	///}}
1980
1981	/// Emit a remark generically
1982	///
1983	/// This template function can be used to generically emit a remark. The
1984	/// RemarkKind should be one of the following:
1985	/// - OptimizationRemark to indicate a successful optimization attempt
1986	/// - OptimizationRemarkMissed to report a failed optimization attempt
1987	/// - OptimizationRemarkAnalysis to provide additional information about an
1988	/// optimization attempt
1989	///
1990	/// The remark is built using a callback function provided by the caller that
1991	/// takes a RemarkKind as input and returns a RemarkKind.
1992	template <typename RemarkKind, typename RemarkCallBack>
1993	void emitRemark(Instruction *I, StringRef RemarkName,
1994	RemarkCallBack &&RemarkCB) const {
1995	Function *F = I->getParent()->getParent();
1996	auto &ORE = OREGetter (F);
1997
1998	if (RemarkName.starts_with(Prefix: "OMP"))
1999	ORE.emit([&]() {
2000	return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I))
2001	<< " [" << RemarkName << "]";
2002	});
2003	else
2004	ORE.emit(
2005	[&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); });
2006	}
2007
2008	/// Emit a remark on a function.
2009	template <typename RemarkKind, typename RemarkCallBack>
2010	void emitRemark(Function *F, StringRef RemarkName,
2011	RemarkCallBack &&RemarkCB) const {
2012	auto &ORE = OREGetter (F);
2013
2014	if (RemarkName.starts_with(Prefix: "OMP"))
2015	ORE.emit([&]() {
2016	return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F))
2017	<< " [" << RemarkName << "]";
2018	});
2019	else
2020	ORE.emit(
2021	[&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });
2022	}
2023
2024	/// The underlying module.
2025	Module &M;
2026
2027	/// The SCC we are operating on.
2028	SmallVectorImpl<Function *> &SCC;
2029
2030	/// Callback to update the call graph, the first argument is a removed call,
2031	/// the second an optional replacement call.
2032	CallGraphUpdater &CGUpdater;
2033
2034	/// Callback to get an OptimizationRemarkEmitter from a Function *
2035	OptimizationRemarkGetter OREGetter;
2036
2037	/// OpenMP-specific information cache. Also Used for Attributor runs.
2038	OMPInformationCache &OMPInfoCache;
2039
2040	/// Attributor instance.
2041	Attributor &A;
2042
2043	/// Helper function to run Attributor on SCC.
2044	bool runAttributor(bool IsModulePass) {
2045	if (SCC.empty())
2046	return false;
2047
2048	registerAAs(IsModulePass);
2049
2050	ChangeStatus Changed = A.run();
2051
2052	LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
2053	<< " functions, result: " << Changed << ".\n");
2054
2055	if (Changed == ChangeStatus::CHANGED)
2056	OMPInfoCache.invalidateAnalyses();
2057
2058	return Changed == ChangeStatus::CHANGED;
2059	}
2060
2061	void registerFoldRuntimeCall(RuntimeFunction RF);
2062
2063	/// Populate the Attributor with abstract attribute opportunities in the
2064	/// functions.
2065	void registerAAs(bool IsModulePass);
2066
2067	public:
2068	/// Callback to register AAs for live functions, including internal functions
2069	/// marked live during the traversal.
2070	static void registerAAsForFunction(Attributor &A, const Function &F);
2071	};
2072
2073	Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
2074	if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
2075	!OMPInfoCache.CGSCC->contains(key: &F))
2076	return nullptr;
2077
2078	// Use a scope to keep the lifetime of the CachedKernel short.
2079	{
2080	std::optional<Kernel> &CachedKernel = UniqueKernelMap [&F];
2081	if (CachedKernel)
2082	return *CachedKernel;
2083
2084	// TODO: We should use an AA to create an (optimistic and callback
2085	// call-aware) call graph. For now we stick to simple patterns that
2086	// are less powerful, basically the worst fixpoint.
2087	if (isOpenMPKernel(Fn&: F)) {
2088	CachedKernel = Kernel(&F);
2089	return *CachedKernel;
2090	}
2091
2092	CachedKernel = nullptr;
2093	if (!F.hasLocalLinkage()) {
2094
2095	// See https://openmp.llvm.org/remarks/OptimizationRemarks.html
2096	auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2097	return ORA << "Potentially unknown OpenMP target region caller.";
2098	};
2099	emitRemark<OptimizationRemarkAnalysis>(F: &F, RemarkName: "OMP100", RemarkCB&: Remark);
2100
2101	return nullptr;
2102	}
2103	}
2104
2105	auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
2106	if (auto *Cmp = dyn_cast<ICmpInst>(Val: U.getUser())) {
2107	// Allow use in equality comparisons.
2108	if (Cmp->isEquality())
2109	return getUniqueKernelFor(I&: *Cmp);
2110	return nullptr;
2111	}
2112	if (auto *CB = dyn_cast<CallBase>(Val: U.getUser())) {
2113	// Allow direct calls.
2114	if (CB->isCallee(U: &U))
2115	return getUniqueKernelFor(I&: *CB);
2116
2117	OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2118	OMPInfoCache.RFIs [OMPRTL___kmpc_parallel_51];
2119	// Allow the use in __kmpc_parallel_51 calls.
2120	if (OpenMPOpt::getCallIfRegularCall(V&: *U.getUser(), RFI: &KernelParallelRFI))
2121	return getUniqueKernelFor(I&: *CB);
2122	return nullptr;
2123	}
2124	// Disallow every other use.
2125	return nullptr;
2126	};
2127
2128	// TODO: In the future we want to track more than just a unique kernel.
2129	SmallPtrSet<Kernel, `2`> PotentialKernels;
2130	OMPInformationCache::foreachUse(F, CB: [&](const Use &U) {
2131	PotentialKernels.insert(Ptr: GetUniqueKernelForUse (U));
2132	});
2133
2134	Kernel K = nullptr;
2135	if (PotentialKernels.size() == `1`)
2136	K = *PotentialKernels.begin();
2137
2138	// Cache the result.
2139	UniqueKernelMap [&F] = K;
2140
2141	return K;
2142	}
2143
2144	bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
2145	OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2146	OMPInfoCache.RFIs [OMPRTL___kmpc_parallel_51];
2147
2148	bool Changed = false;
2149	if (!KernelParallelRFI)
2150	return Changed;
2151
2152	// If we have disabled state machine changes, exit
2153	if (DisableOpenMPOptStateMachineRewrite)
2154	return Changed;
2155
2156	for (Function *F : SCC) {
2157
2158	// Check if the function is a use in a __kmpc_parallel_51 call at
2159	// all.
2160	bool UnknownUse = false;
2161	bool KernelParallelUse = false;
2162	unsigned NumDirectCalls = `0`;
2163
2164	SmallVector<Use *, `2`> ToBeReplacedStateMachineUses;
2165	OMPInformationCache::foreachUse(F&: *F, CB: [&](Use &U) {
2166	if (auto *CB = dyn_cast<CallBase>(Val: U.getUser()))
2167	if (CB->isCallee(U: &U)) {
2168	++NumDirectCalls;
2169	return;
2170	}
2171
2172	if (isa<ICmpInst>(Val: U.getUser())) {
2173	ToBeReplacedStateMachineUses.push_back(Elt: &U);
2174	return;
2175	}
2176
2177	// Find wrapper functions that represent parallel kernels.
2178	CallInst *CI =
2179	OpenMPOpt::getCallIfRegularCall(V&: *U.getUser(), RFI: &KernelParallelRFI);
2180	const unsigned int WrapperFunctionArgNo = `6`;
2181	if (!KernelParallelUse && CI &&
2182	CI->getArgOperandNo(U: &U) == WrapperFunctionArgNo) {
2183	KernelParallelUse = true;
2184	ToBeReplacedStateMachineUses.push_back(Elt: &U);
2185	return;
2186	}
2187	UnknownUse = true;
2188	});
2189
2190	// Do not emit a remark if we haven't seen a __kmpc_parallel_51
2191	// use.
2192	if (!KernelParallelUse)
2193	continue;
2194
2195	// If this ever hits, we should investigate.
2196	// TODO: Checking the number of uses is not a necessary restriction and
2197	// should be lifted.
2198	if (UnknownUse \|\| NumDirectCalls != `1` \|\|
2199	ToBeReplacedStateMachineUses.size() > `2`) {
2200	auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2201	return ORA << "Parallel region is used in "
2202	<< (UnknownUse ? "unknown" : "unexpected")
2203	<< " ways. Will not attempt to rewrite the state machine.";
2204	};
2205	emitRemark<OptimizationRemarkAnalysis>(F, RemarkName: "OMP101", RemarkCB&: Remark);
2206	continue;
2207	}
2208
2209	// Even if we have __kmpc_parallel_51 calls, we (for now) give
2210	// up if the function is not called from a unique kernel.
2211	Kernel K = getUniqueKernelFor(F&: *F);
2212	if (!K) {
2213	auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2214	return ORA << "Parallel region is not called from a unique kernel. "
2215	"Will not attempt to rewrite the state machine.";
2216	};
2217	emitRemark<OptimizationRemarkAnalysis>(F, RemarkName: "OMP102", RemarkCB&: Remark);
2218	continue;
2219	}
2220
2221	// We now know F is a parallel body function called only from the kernel K.
2222	// We also identified the state machine uses in which we replace the
2223	// function pointer by a new global symbol for identification purposes. This
2224	// ensures only direct calls to the function are left.
2225
2226	Module &M = *F->getParent();
2227	Type *Int8Ty = Type::getInt8Ty(C&: M.getContext());
2228
2229	auto ID = new* GlobalVariable (
2230	M, Int8Ty, / isConstant / true, GlobalValue::PrivateLinkage,
2231	UndefValue::get(T: Int8Ty), F->getName() + ".ID");
2232
2233	for (Use *U : ToBeReplacedStateMachineUses)
2234	U->set(ConstantExpr::getPointerBitCastOrAddrSpaceCast(
2235	C: ID, Ty: U->get()->getType()));
2236
2237	++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
2238
2239	Changed = true;
2240	}
2241
2242	return Changed;
2243	}
2244
2245	/// Abstract Attribute for tracking ICV values.
2246	struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
2247	using Base = StateWrapper<BooleanState, AbstractAttribute>;
2248	AAICVTracker(const IRPosition &IRP, Attributor &A) : Base (IRP) {}
2249
2250	/// Returns true if value is assumed to be tracked.
2251	bool isAssumedTracked() const { return getAssumed(); }
2252
2253	/// Returns true if value is known to be tracked.
2254	bool isKnownTracked() const { return getAssumed(); }
2255
2256	/// Create an abstract attribute biew for the position \p IRP.
2257	static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
2258
2259	/// Return the value with which \p I can be replaced for specific \p ICV.
2260	virtual std::optional<Value *> getReplacementValue(InternalControlVar ICV,
2261	const Instruction *I,
2262	Attributor &A) const {
2263	return std::nullopt;
2264	}
2265
2266	/// Return an assumed unique ICV value if a single candidate is found. If
2267	/// there cannot be one, return a nullptr. If it is not clear yet, return
2268	/// std::nullopt.
2269	virtual std::optional<Value *>
2270	getUniqueReplacementValue(InternalControlVar ICV) const = `0`;
2271
2272	// Currently only nthreads is being tracked.
2273	// this array will only grow with time.
2274	InternalControlVar TrackableICVs[`1`] = {ICV_nthreads};
2275
2276	/// See AbstractAttribute::getName()
2277	const std::string getName() const override { return "AAICVTracker"; }
2278
2279	/// See AbstractAttribute::getIdAddr()
2280	const char getIdAddr() const* override { return &ID; }
2281
2282	/// This function should return true if the type of the \p AA is AAICVTracker
2283	static bool classof(const AbstractAttribute *AA) {
2284	return (AA->getIdAddr() == &ID);
2285	}
2286
2287	static const char ID;
2288	};
2289
2290	struct AAICVTrackerFunction : public AAICVTracker {
2291	AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
2292	: AAICVTracker (IRP, A) {}
2293
2294	// FIXME: come up with better string.
2295	const std::string getAsStr(Attributor ) const* override {
2296	return "ICVTrackerFunction";
2297	}
2298
2299	// FIXME: come up with some stats.
2300	void trackStatistics() const override {}
2301
2302	/// We don't manifest anything for this AA.
2303	ChangeStatus manifest(Attributor &A) override {
2304	return ChangeStatus::UNCHANGED;
2305	}
2306
2307	// Map of ICV to their values at specific program point.
2308	EnumeratedArray<DenseMap<Instruction , Value >, InternalControlVar,
2309	InternalControlVar::ICV___last>
2310	ICVReplacementValuesMap;
2311
2312	ChangeStatus updateImpl(Attributor &A) override {
2313	ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
2314
2315	Function *F = getAnchorScope();
2316
2317	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2318
2319	for (InternalControlVar ICV : TrackableICVs) {
2320	auto &SetterRFI = OMPInfoCache.RFIs [OMPInfoCache.ICVs [ICV].Setter];
2321
2322	auto &ValuesMap = ICVReplacementValuesMap [ICV];
2323	auto TrackValues = [&](Use &U, Function &) {
2324	CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
2325	if (!CI)
2326	return false;
2327
2328	// FIXME: handle setters with more that 1 arguments.
2329	/// Track new value.
2330	if (ValuesMap.insert(KV: std::make_pair(x&: CI, y: CI->getArgOperand(i: `0`))).second)
2331	HasChanged = ChangeStatus::CHANGED;
2332
2333	return false;
2334	};
2335
2336	auto CallCheck = [&](Instruction &I) {
2337	std::optional<Value *> ReplVal = getValueForCall(A, I, ICV);
2338	if (ReplVal && ValuesMap.insert(KV: std::make_pair(x: &I, y&: *ReplVal)).second)
2339	HasChanged = ChangeStatus::CHANGED;
2340
2341	return true;
2342	};
2343
2344	// Track all changes of an ICV.
2345	SetterRFI.foreachUse(CB: TrackValues, F);
2346
2347	bool UsedAssumedInformation = false;
2348	A.checkForAllInstructions(Pred: CallCheck, QueryingAA: *this, Opcodes: {Instruction::Call},
2349	UsedAssumedInformation,
2350	/ CheckBBLivenessOnly / true);
2351
2352	/// TODO: Figure out a way to avoid adding entry in
2353	/// ICVReplacementValuesMap
2354	Instruction *Entry = &F->getEntryBlock().front();
2355	if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Val: Entry))
2356	ValuesMap.insert(KV: std::make_pair(x&: Entry, y: nullptr));
2357	}
2358
2359	return HasChanged;
2360	}
2361
2362	/// Helper to check if \p I is a call and get the value for it if it is
2363	/// unique.
2364	std::optional<Value > getValueForCall(Attributor &A, const* Instruction &I,
2365	InternalControlVar &ICV) const {
2366
2367	const auto *CB = dyn_cast<CallBase>(Val: &I);
2368	if (!CB \|\| CB->hasFnAttr(Kind: "no_openmp") \|\|
2369	CB->hasFnAttr(Kind: "no_openmp_routines"))
2370	return std::nullopt;
2371
2372	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2373	auto &GetterRFI = OMPInfoCache.RFIs [OMPInfoCache.ICVs [ICV].Getter];
2374	auto &SetterRFI = OMPInfoCache.RFIs [OMPInfoCache.ICVs [ICV].Setter];
2375	Function *CalledFunction = CB->getCalledFunction();
2376
2377	// Indirect call, assume ICV changes.
2378	if (CalledFunction == nullptr)
2379	return nullptr;
2380	if (CalledFunction == GetterRFI.Declaration)
2381	return std::nullopt;
2382	if (CalledFunction == SetterRFI.Declaration) {
2383	if (ICVReplacementValuesMap [ICV].count(Val: &I))
2384	return ICVReplacementValuesMap [ICV].lookup(Val: &I);
2385
2386	return nullptr;
2387	}
2388
2389	// Since we don't know, assume it changes the ICV.
2390	if (CalledFunction->isDeclaration())
2391	return nullptr;
2392
2393	const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
2394	QueryingAA: *this, IRP: IRPosition::callsite_returned(CB: *CB), DepClass: DepClassTy::REQUIRED);
2395
2396	if (ICVTrackingAA->isAssumedTracked()) {
2397	std::optional<Value *> URV =
2398	ICVTrackingAA->getUniqueReplacementValue(ICV);
2399	if (!URV \|\| (URV && AA::isValidAtPosition(VAC: AA::ValueAndContext (*URV, I),
2400	InfoCache&: OMPInfoCache)))
2401	return URV;
2402	}
2403
2404	// If we don't know, assume it changes.
2405	return nullptr;
2406	}
2407
2408	// We don't check unique value for a function, so return std::nullopt.
2409	std::optional<Value *>
2410	getUniqueReplacementValue(InternalControlVar ICV) const override {
2411	return std::nullopt;
2412	}
2413
2414	/// Return the value with which \p I can be replaced for specific \p ICV.
2415	std::optional<Value *> getReplacementValue(InternalControlVar ICV,
2416	const Instruction *I,
2417	Attributor &A) const override {
2418	const auto &ValuesMap = ICVReplacementValuesMap [ICV];
2419	if (ValuesMap.count(Val: I))
2420	return ValuesMap.lookup(Val: I);
2421
2422	SmallVector<const Instruction *, `16`> Worklist;
2423	SmallPtrSet<const Instruction *, `16`> Visited;
2424	Worklist.push_back(Elt: I);
2425
2426	std::optional<Value *> ReplVal;
2427
2428	while (!Worklist.empty()) {
2429	const Instruction *CurrInst = Worklist.pop_back_val();
2430	if (!Visited.insert(Ptr: CurrInst).second)
2431	continue;
2432
2433	const BasicBlock *CurrBB = CurrInst->getParent();
2434
2435	// Go up and look for all potential setters/calls that might change the
2436	// ICV.
2437	while ((CurrInst = CurrInst->getPrevNode())) {
2438	if (ValuesMap.count(Val: CurrInst)) {
2439	std::optional<Value *> NewReplVal = ValuesMap.lookup(Val: CurrInst);
2440	// Unknown value, track new.
2441	if (!ReplVal) {
2442	ReplVal = NewReplVal;
2443	break;
2444	}
2445
2446	// If we found a new value, we can't know the icv value anymore.
2447	if (NewReplVal)
2448	if (ReplVal != NewReplVal)
2449	return nullptr;
2450
2451	break;
2452	}
2453
2454	std::optional<Value > NewReplVal = getValueForCall(A, I: CurrInst, ICV);
2455	if (!NewReplVal)
2456	continue;
2457
2458	// Unknown value, track new.
2459	if (!ReplVal) {
2460	ReplVal = NewReplVal;
2461	break;
2462	}
2463
2464	// if (NewReplVal.hasValue())
2465	// We found a new value, we can't know the icv value anymore.
2466	if (ReplVal != NewReplVal)
2467	return nullptr;
2468	}
2469
2470	// If we are in the same BB and we have a value, we are done.
2471	if (CurrBB == I->getParent() && ReplVal)
2472	return ReplVal;
2473
2474	// Go through all predecessors and add terminators for analysis.
2475	for (const BasicBlock *Pred : predecessors(BB: CurrBB))
2476	if (const Instruction *Terminator = Pred->getTerminator())
2477	Worklist.push_back(Elt: Terminator);
2478	}
2479
2480	return ReplVal;
2481	}
2482	};
2483
2484	struct AAICVTrackerFunctionReturned : AAICVTracker {
2485	AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
2486	: AAICVTracker (IRP, A) {}
2487
2488	// FIXME: come up with better string.
2489	const std::string getAsStr(Attributor ) const* override {
2490	return "ICVTrackerFunctionReturned";
2491	}
2492
2493	// FIXME: come up with some stats.
2494	void trackStatistics() const override {}
2495
2496	/// We don't manifest anything for this AA.
2497	ChangeStatus manifest(Attributor &A) override {
2498	return ChangeStatus::UNCHANGED;
2499	}
2500
2501	// Map of ICV to their values at specific program point.
2502	EnumeratedArray<std::optional<Value *>, InternalControlVar,
2503	InternalControlVar::ICV___last>
2504	ICVReplacementValuesMap;
2505
2506	/// Return the value with which \p I can be replaced for specific \p ICV.
2507	std::optional<Value *>
2508	getUniqueReplacementValue(InternalControlVar ICV) const override {
2509	return ICVReplacementValuesMap [ICV];
2510	}
2511
2512	ChangeStatus updateImpl(Attributor &A) override {
2513	ChangeStatus Changed = ChangeStatus::UNCHANGED;
2514	const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
2515	QueryingAA: *this, IRP: IRPosition::function(F: *getAnchorScope()), DepClass: DepClassTy::REQUIRED);
2516
2517	if (!ICVTrackingAA->isAssumedTracked())
2518	return indicatePessimisticFixpoint();
2519
2520	for (InternalControlVar ICV : TrackableICVs) {
2521	std::optional<Value *> &ReplVal = ICVReplacementValuesMap [ICV];
2522	std::optional<Value *> UniqueICVValue;
2523
2524	auto CheckReturnInst = [&](Instruction &I) {
2525	std::optional<Value *> NewReplVal =
2526	ICVTrackingAA->getReplacementValue(ICV, I: &I, A);
2527
2528	// If we found a second ICV value there is no unique returned value.
2529	if (UniqueICVValue && UniqueICVValue != NewReplVal)
2530	return false;
2531
2532	UniqueICVValue = NewReplVal;
2533
2534	return true;
2535	};
2536
2537	bool UsedAssumedInformation = false;
2538	if (!A.checkForAllInstructions(Pred: CheckReturnInst, QueryingAA: *this, Opcodes: {Instruction::Ret},
2539	UsedAssumedInformation,
2540	/ CheckBBLivenessOnly / true))
2541	UniqueICVValue = nullptr;
2542
2543	if (UniqueICVValue == ReplVal)
2544	continue;
2545
2546	ReplVal = UniqueICVValue;
2547	Changed = ChangeStatus::CHANGED;
2548	}
2549
2550	return Changed;
2551	}
2552	};
2553
2554	struct AAICVTrackerCallSite : AAICVTracker {
2555	AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
2556	: AAICVTracker (IRP, A) {}
2557
2558	void initialize(Attributor &A) override {
2559	assert(getAnchorScope() && "Expected anchor function");
2560
2561	// We only initialize this AA for getters, so we need to know which ICV it
2562	// gets.
2563	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2564	for (InternalControlVar ICV : TrackableICVs) {
2565	auto ICVInfo = OMPInfoCache.ICVs [ICV];
2566	auto &Getter = OMPInfoCache.RFIs [ICVInfo.Getter];
2567	if (Getter.Declaration == getAssociatedFunction()) {
2568	AssociatedICV = ICVInfo.Kind;
2569	return;
2570	}
2571	}
2572
2573	/// Unknown ICV.
2574	indicatePessimisticFixpoint();
2575	}
2576
2577	ChangeStatus manifest(Attributor &A) override {
2578	if (!ReplVal \|\| !*ReplVal)
2579	return ChangeStatus::UNCHANGED;
2580
2581	A.changeAfterManifest(IRP: IRPosition::inst(I: getCtxI()), NV&: *ReplVal);
2582	A.deleteAfterManifest(I&: *getCtxI());
2583
2584	return ChangeStatus::CHANGED;
2585	}
2586
2587	// FIXME: come up with better string.
2588	const std::string getAsStr(Attributor ) const* override {
2589	return "ICVTrackerCallSite";
2590	}
2591
2592	// FIXME: come up with some stats.
2593	void trackStatistics() const override {}
2594
2595	InternalControlVar AssociatedICV;
2596	std::optional<Value *> ReplVal;
2597
2598	ChangeStatus updateImpl(Attributor &A) override {
2599	const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
2600	QueryingAA: *this, IRP: IRPosition::function(F: *getAnchorScope()), DepClass: DepClassTy::REQUIRED);
2601
2602	// We don't have any information, so we assume it changes the ICV.
2603	if (!ICVTrackingAA->isAssumedTracked())
2604	return indicatePessimisticFixpoint();
2605
2606	std::optional<Value *> NewReplVal =
2607	ICVTrackingAA->getReplacementValue(ICV: AssociatedICV, I: getCtxI(), A);
2608
2609	if (ReplVal == NewReplVal)
2610	return ChangeStatus::UNCHANGED;
2611
2612	ReplVal = NewReplVal;
2613	return ChangeStatus::CHANGED;
2614	}
2615
2616	// Return the value with which associated value can be replaced for specific
2617	// \p ICV.
2618	std::optional<Value *>
2619	getUniqueReplacementValue(InternalControlVar ICV) const override {
2620	return ReplVal;
2621	}
2622	};
2623
2624	struct AAICVTrackerCallSiteReturned : AAICVTracker {
2625	AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
2626	: AAICVTracker (IRP, A) {}
2627
2628	// FIXME: come up with better string.
2629	const std::string getAsStr(Attributor ) const* override {
2630	return "ICVTrackerCallSiteReturned";
2631	}
2632
2633	// FIXME: come up with some stats.
2634	void trackStatistics() const override {}
2635
2636	/// We don't manifest anything for this AA.
2637	ChangeStatus manifest(Attributor &A) override {
2638	return ChangeStatus::UNCHANGED;
2639	}
2640
2641	// Map of ICV to their values at specific program point.
2642	EnumeratedArray<std::optional<Value *>, InternalControlVar,
2643	InternalControlVar::ICV___last>
2644	ICVReplacementValuesMap;
2645
2646	/// Return the value with which associated value can be replaced for specific
2647	/// \p ICV.
2648	std::optional<Value *>
2649	getUniqueReplacementValue(InternalControlVar ICV) const override {
2650	return ICVReplacementValuesMap [ICV];
2651	}
2652
2653	ChangeStatus updateImpl(Attributor &A) override {
2654	ChangeStatus Changed = ChangeStatus::UNCHANGED;
2655	const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
2656	QueryingAA: *this, IRP: IRPosition::returned(F: *getAssociatedFunction()),
2657	DepClass: DepClassTy::REQUIRED);
2658
2659	// We don't have any information, so we assume it changes the ICV.
2660	if (!ICVTrackingAA->isAssumedTracked())
2661	return indicatePessimisticFixpoint();
2662
2663	for (InternalControlVar ICV : TrackableICVs) {
2664	std::optional<Value *> &ReplVal = ICVReplacementValuesMap [ICV];
2665	std::optional<Value *> NewReplVal =
2666	ICVTrackingAA->getUniqueReplacementValue(ICV);
2667
2668	if (ReplVal == NewReplVal)
2669	continue;
2670
2671	ReplVal = NewReplVal;
2672	Changed = ChangeStatus::CHANGED;
2673	}
2674	return Changed;
2675	}
2676	};
2677
2678	/// Determines if \p BB exits the function unconditionally itself or reaches a
2679	/// block that does through only unique successors.
2680	static bool hasFunctionEndAsUniqueSuccessor(const BasicBlock *BB) {
2681	if (succ_empty(BB))
2682	return true;
2683	const BasicBlock *const Successor = BB->getUniqueSuccessor();
2684	if (!Successor)
2685	return false;
2686	return hasFunctionEndAsUniqueSuccessor(BB: Successor);
2687	}
2688
2689	struct AAExecutionDomainFunction : public AAExecutionDomain {
2690	AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
2691	: AAExecutionDomain (IRP, A) {}
2692
2693	~AAExecutionDomainFunction() { delete RPOT; }
2694
2695	void initialize(Attributor &A) override {
2696	Function *F = getAnchorScope();
2697	assert(F && "Expected anchor function");
2698	RPOT = new ReversePostOrderTraversal<Function *>(F);
2699	}
2700
2701	const std::string getAsStr(Attributor ) const* override {
2702	unsigned TotalBlocks = `0`, InitialThreadBlocks = `0`, AlignedBlocks = `0`;
2703	for (auto &It : BEDMap) {
2704	if (!It.getFirst())
2705	continue;
2706	TotalBlocks++;
2707	InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
2708	AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
2709	It.getSecond().IsReachingAlignedBarrierOnly;
2710	}
2711	return "[AAExecutionDomain] " + std::to_string(val: InitialThreadBlocks) + "/" +
2712	std::to_string(val: AlignedBlocks) + " of " +
2713	std::to_string(val: TotalBlocks) +
2714	" executed by initial thread / aligned";
2715	}
2716
2717	/// See AbstractAttribute::trackStatistics().
2718	void trackStatistics() const override {}
2719
2720	ChangeStatus manifest(Attributor &A) override {
2721	LLVM_DEBUG({
2722	for (const BasicBlock &BB : *getAnchorScope()) {
2723	if (!isExecutedByInitialThreadOnly(BB))
2724	continue;
2725	dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "
2726	<< BB.getName() << " is executed by a single thread.\n";
2727	}
2728	});
2729
2730	ChangeStatus Changed = ChangeStatus::UNCHANGED;
2731
2732	if (DisableOpenMPOptBarrierElimination)
2733	return Changed;
2734
2735	SmallPtrSet<CallBase *, `16`> DeletedBarriers;
2736	auto HandleAlignedBarrier = [&](CallBase *CB) {
2737	const ExecutionDomainTy &ED = CB ? CEDMap [{CB, PRE}] : BEDMap [nullptr];
2738	if (!ED.IsReachedFromAlignedBarrierOnly \|\|
2739	ED.EncounteredNonLocalSideEffect)
2740	return;
2741	if (!ED.EncounteredAssumes.empty() && !A.isModulePass())
2742	return;
2743
2744	// We can remove this barrier, if it is one, or aligned barriers reaching
2745	// the kernel end (if CB is nullptr). Aligned barriers reaching the kernel
2746	// end should only be removed if the kernel end is their unique successor;
2747	// otherwise, they may have side-effects that aren't accounted for in the
2748	// kernel end in their other successors. If those barriers have other
2749	// barriers reaching them, those can be transitively removed as well as
2750	// long as the kernel end is also their unique successor.
2751	if (CB) {
2752	DeletedBarriers.insert(Ptr: CB);
2753	A.deleteAfterManifest(I&: *CB);
2754	++NumBarriersEliminated;
2755	Changed = ChangeStatus::CHANGED;
2756	} else if (!ED.AlignedBarriers.empty()) {
2757	Changed = ChangeStatus::CHANGED;
2758	SmallVector<CallBase *> Worklist(ED.AlignedBarriers.begin(),
2759	ED.AlignedBarriers.end());
2760	SmallSetVector<CallBase *, `16`> Visited;
2761	while (!Worklist.empty()) {
2762	CallBase *LastCB = Worklist.pop_back_val();
2763	if (!Visited.insert(X: LastCB))
2764	continue;
2765	if (LastCB->getFunction() != getAnchorScope())
2766	continue;
2767	if (!hasFunctionEndAsUniqueSuccessor(BB: LastCB->getParent()))
2768	continue;
2769	if (!DeletedBarriers.count(Ptr: LastCB)) {
2770	++NumBarriersEliminated;
2771	A.deleteAfterManifest(I&: *LastCB);
2772	continue;
2773	}
2774	// The final aligned barrier (LastCB) reaching the kernel end was
2775	// removed already. This means we can go one step further and remove
2776	// the barriers encoutered last before (LastCB).
2777	const ExecutionDomainTy &LastED = CEDMap [{LastCB, PRE}];
2778	Worklist.append(in_start: LastED.AlignedBarriers.begin(),
2779	in_end: LastED.AlignedBarriers.end());
2780	}
2781	}
2782
2783	// If we actually eliminated a barrier we need to eliminate the associated
2784	// llvm.assumes as well to avoid creating UB.
2785	if (!ED.EncounteredAssumes.empty() && (CB \|\| !ED.AlignedBarriers.empty()))
2786	for (auto *AssumeCB : ED.EncounteredAssumes)
2787	A.deleteAfterManifest(I&: *AssumeCB);
2788	};
2789
2790	for (auto *CB : AlignedBarriers)
2791	HandleAlignedBarrier(CB);
2792
2793	// Handle the "kernel end barrier" for kernels too.
2794	if (omp::isOpenMPKernel(Fn&: *getAnchorScope()))
2795	HandleAlignedBarrier(nullptr);
2796
2797	return Changed;
2798	}
2799
2800	bool isNoOpFence(const FenceInst &FI) const override {
2801	return getState().isValidState() && !NonNoOpFences.count(Ptr: &FI);
2802	}
2803
2804	/// Merge barrier and assumption information from \p PredED into the successor
2805	/// \p ED.
2806	void
2807	mergeInPredecessorBarriersAndAssumptions(Attributor &A, ExecutionDomainTy &ED,
2808	const ExecutionDomainTy &PredED);
2809
2810	/// Merge all information from \p PredED into the successor \p ED. If
2811	/// \p InitialEdgeOnly is set, only the initial edge will enter the block
2812	/// represented by \p ED from this predecessor.
2813	bool mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED,
2814	const ExecutionDomainTy &PredED,
2815	bool InitialEdgeOnly = false);
2816
2817	/// Accumulate information for the entry block in \p EntryBBED.
2818	bool handleCallees(Attributor &A, ExecutionDomainTy &EntryBBED);
2819
2820	/// See AbstractAttribute::updateImpl.
2821	ChangeStatus updateImpl(Attributor &A) override;
2822
2823	/// Query interface, see AAExecutionDomain
2824	///{
2825	bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
2826	if (!isValidState())
2827	return false;
2828	assert(BB.getParent() == getAnchorScope() && "Block is out of scope!");
2829	return BEDMap.lookup(Val: &BB).IsExecutedByInitialThreadOnly;
2830	}
2831
2832	bool isExecutedInAlignedRegion(Attributor &A,
2833	const Instruction &I) const override {
2834	assert(I.getFunction() == getAnchorScope() &&
2835	"Instruction is out of scope!");
2836	if (!isValidState())
2837	return false;
2838
2839	bool ForwardIsOk = true;
2840	const Instruction *CurI;
2841
2842	// Check forward until a call or the block end is reached.
2843	CurI = &I;
2844	do {
2845	auto *CB = dyn_cast<CallBase>(Val: CurI);
2846	if (!CB)
2847	continue;
2848	if (CB != &I && AlignedBarriers.contains(key: const_cast<CallBase *>(CB)))
2849	return true;
2850	const auto &It = CEDMap.find(Val: {CB, PRE});
2851	if (It == CEDMap.end())
2852	continue;
2853	if (!It ->getSecond().IsReachingAlignedBarrierOnly)
2854	ForwardIsOk = false;
2855	break;
2856	} while ((CurI = CurI->getNextNonDebugInstruction()));
2857
2858	if (!CurI && !BEDMap.lookup(Val: I.getParent()).IsReachingAlignedBarrierOnly)
2859	ForwardIsOk = false;
2860
2861	// Check backward until a call or the block beginning is reached.
2862	CurI = &I;
2863	do {
2864	auto *CB = dyn_cast<CallBase>(Val: CurI);
2865	if (!CB)
2866	continue;
2867	if (CB != &I && AlignedBarriers.contains(key: const_cast<CallBase *>(CB)))
2868	return true;
2869	const auto &It = CEDMap.find(Val: {CB, POST});
2870	if (It == CEDMap.end())
2871	continue;
2872	if (It ->getSecond().IsReachedFromAlignedBarrierOnly)
2873	break;
2874	return false;
2875	} while ((CurI = CurI->getPrevNonDebugInstruction()));
2876
2877	// Delayed decision on the forward pass to allow aligned barrier detection
2878	// in the backwards traversal.
2879	if (!ForwardIsOk)
2880	return false;
2881
2882	if (!CurI) {
2883	const BasicBlock *BB = I.getParent();
2884	if (BB == &BB->getParent()->getEntryBlock())
2885	return BEDMap.lookup(Val: nullptr).IsReachedFromAlignedBarrierOnly;
2886	if (!llvm::all_of(Range: predecessors(BB), P: [&](const BasicBlock *PredBB) {
2887	return BEDMap.lookup(Val: PredBB).IsReachedFromAlignedBarrierOnly;
2888	})) {
2889	return false;
2890	}
2891	}
2892
2893	// On neither traversal we found a anything but aligned barriers.
2894	return true;
2895	}
2896
2897	ExecutionDomainTy getExecutionDomain(const BasicBlock &BB) const override {
2898	assert(isValidState() &&
2899	"No request should be made against an invalid state!");
2900	return BEDMap.lookup(Val: &BB);
2901	}
2902	std::pair<ExecutionDomainTy, ExecutionDomainTy>
2903	getExecutionDomain(const CallBase &CB) const override {
2904	assert(isValidState() &&
2905	"No request should be made against an invalid state!");
2906	return {CEDMap.lookup(Val: {&CB, PRE}), CEDMap.lookup(Val: {&CB, POST})};
2907	}
2908	ExecutionDomainTy getFunctionExecutionDomain() const override {
2909	assert(isValidState() &&
2910	"No request should be made against an invalid state!");
2911	return InterProceduralED;
2912	}
2913	///}
2914
2915	// Check if the edge into the successor block contains a condition that only
2916	// lets the main thread execute it.
2917	static bool isInitialThreadOnlyEdge(Attributor &A, BranchInst *Edge,
2918	BasicBlock &SuccessorBB) {
2919	if (!Edge \|\| !Edge->isConditional())
2920	return false;
2921	if (Edge->getSuccessor(i: `0`) != &SuccessorBB)
2922	return false;
2923
2924	auto *Cmp = dyn_cast<CmpInst>(Val: Edge->getCondition());
2925	if (!Cmp \|\| !Cmp->isTrueWhenEqual() \|\| !Cmp->isEquality())
2926	return false;
2927
2928	ConstantInt *C = dyn_cast<ConstantInt>(Val: Cmp->getOperand(i_nocapture: `1`));
2929	if (!C)
2930	return false;
2931
2932	// Match: -1 == __kmpc_target_init (for non-SPMD kernels only!)
2933	if (C->isAllOnesValue()) {
2934	auto *CB = dyn_cast<CallBase>(Val: Cmp->getOperand(i_nocapture: `0`));
2935	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2936	auto &RFI = OMPInfoCache.RFIs [OMPRTL___kmpc_target_init];
2937	CB = CB ? OpenMPOpt::getCallIfRegularCall(V&: CB, RFI: &RFI) : nullptr*;
2938	if (!CB)
2939	return false;
2940	ConstantStruct *KernelEnvC =
2941	KernelInfo::getKernelEnvironementFromKernelInitCB(KernelInitCB: CB);
2942	ConstantInt *ExecModeC =
2943	KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
2944	return ExecModeC->getSExtValue() & OMP_TGT_EXEC_MODE_GENERIC;
2945	}
2946
2947	if (C->isZero()) {
2948	// Match: 0 == llvm.nvvm.read.ptx.sreg.tid.x()
2949	if (auto *II = dyn_cast<IntrinsicInst>(Val: Cmp->getOperand(i_nocapture: `0`)))
2950	if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2951	return true;
2952
2953	// Match: 0 == llvm.amdgcn.workitem.id.x()
2954	if (auto *II = dyn_cast<IntrinsicInst>(Val: Cmp->getOperand(i_nocapture: `0`)))
2955	if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2956	return true;
2957	}
2958
2959	return false;
2960	};
2961
2962	/// Mapping containing information about the function for other AAs.
2963	ExecutionDomainTy InterProceduralED;
2964
2965	enum Direction { PRE = `0`, POST = `1` };
2966	/// Mapping containing information per block.
2967	DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;
2968	DenseMap<PointerIntPair<const CallBase *, `1`, Direction>, ExecutionDomainTy>
2969	CEDMap;
2970	SmallSetVector<CallBase *, `16`> AlignedBarriers;
2971
2972	ReversePostOrderTraversal<Function > RPOT = nullptr;
2973
2974	/// Set \p R to \V and report true if that changed \p R.
2975	static bool setAndRecord(bool &R, bool V) {
2976	bool Eq = (R == V);
2977	R = V;
2978	return !Eq;
2979	}
2980
2981	/// Collection of fences known to be non-no-opt. All fences not in this set
2982	/// can be assumed no-opt.
2983	SmallPtrSet<const FenceInst *, `8`> NonNoOpFences;
2984	};
2985
2986	void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
2987	Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) {
2988	for (auto *EA : PredED.EncounteredAssumes)
2989	ED.addAssumeInst(A, AI&: *EA);
2990
2991	for (auto *AB : PredED.AlignedBarriers)
2992	ED.addAlignedBarrier(A, CB&: *AB);
2993	}
2994
2995	bool AAExecutionDomainFunction::mergeInPredecessor(
2996	Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,
2997	bool InitialEdgeOnly) {
2998
2999	bool Changed = false;
3000	Changed \|=
3001	setAndRecord(R&: ED.IsExecutedByInitialThreadOnly,
3002	V: InitialEdgeOnly \|\| (PredED.IsExecutedByInitialThreadOnly &&
3003	ED.IsExecutedByInitialThreadOnly));
3004
3005	Changed \|= setAndRecord(R&: ED.IsReachedFromAlignedBarrierOnly,
3006	V: ED.IsReachedFromAlignedBarrierOnly &&
3007	PredED.IsReachedFromAlignedBarrierOnly);
3008	Changed \|= setAndRecord(R&: ED.EncounteredNonLocalSideEffect,
3009	V: ED.EncounteredNonLocalSideEffect \|
3010	PredED.EncounteredNonLocalSideEffect);
3011	// Do not track assumptions and barriers as part of Changed.
3012	if (ED.IsReachedFromAlignedBarrierOnly)
3013	mergeInPredecessorBarriersAndAssumptions(A, ED, PredED);
3014	else
3015	ED.clearAssumeInstAndAlignedBarriers();
3016	return Changed;
3017	}
3018
3019	bool AAExecutionDomainFunction::handleCallees(Attributor &A,
3020	ExecutionDomainTy &EntryBBED) {
3021	SmallVector<std::pair<ExecutionDomainTy, ExecutionDomainTy>, `4`> CallSiteEDs;
3022	auto PredForCallSite = [&](AbstractCallSite ACS) {
3023	const auto *EDAA = A.getAAFor<AAExecutionDomain>(
3024	QueryingAA: *this, IRP: IRPosition::function(F: *ACS.getInstruction()->getFunction()),
3025	DepClass: DepClassTy::OPTIONAL);
3026	if (!EDAA \|\| !EDAA->getState().isValidState())
3027	return false;
3028	CallSiteEDs.emplace_back(
3029	Args: EDAA->getExecutionDomain(CB: *cast<CallBase>(Val: ACS.getInstruction())));
3030	return true;
3031	};
3032
3033	ExecutionDomainTy ExitED;
3034	bool AllCallSitesKnown;
3035	if (A.checkForAllCallSites(Pred: PredForCallSite, QueryingAA: *this,
3036	/ RequiresAllCallSites / RequireAllCallSites: true,
3037	UsedAssumedInformation&: AllCallSitesKnown)) {
3038	for (const auto &[CSInED, CSOutED] : CallSiteEDs) {
3039	mergeInPredecessor(A, ED&: EntryBBED, PredED: CSInED);
3040	ExitED.IsReachingAlignedBarrierOnly &=
3041	CSOutED.IsReachingAlignedBarrierOnly;
3042	}
3043
3044	} else {
3045	// We could not find all predecessors, so this is either a kernel or a
3046	// function with external linkage (or with some other weird uses).
3047	if (omp::isOpenMPKernel(Fn&: *getAnchorScope())) {
3048	EntryBBED.IsExecutedByInitialThreadOnly = false;
3049	EntryBBED.IsReachedFromAlignedBarrierOnly = true;
3050	EntryBBED.EncounteredNonLocalSideEffect = false;
3051	ExitED.IsReachingAlignedBarrierOnly = false;
3052	} else {
3053	EntryBBED.IsExecutedByInitialThreadOnly = false;
3054	EntryBBED.IsReachedFromAlignedBarrierOnly = false;
3055	EntryBBED.EncounteredNonLocalSideEffect = true;
3056	ExitED.IsReachingAlignedBarrierOnly = false;
3057	}
3058	}
3059
3060	bool Changed = false;
3061	auto &FnED = BEDMap [nullptr];
3062	Changed \|= setAndRecord(R&: FnED.IsReachedFromAlignedBarrierOnly,
3063	V: FnED.IsReachedFromAlignedBarrierOnly &
3064	EntryBBED.IsReachedFromAlignedBarrierOnly);
3065	Changed \|= setAndRecord(R&: FnED.IsReachingAlignedBarrierOnly,
3066	V: FnED.IsReachingAlignedBarrierOnly &
3067	ExitED.IsReachingAlignedBarrierOnly);
3068	Changed \|= setAndRecord(R&: FnED.IsExecutedByInitialThreadOnly,
3069	V: EntryBBED.IsExecutedByInitialThreadOnly);
3070	return Changed;
3071	}
3072
3073	ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
3074
3075	bool Changed = false;
3076
3077	// Helper to deal with an aligned barrier encountered during the forward
3078	// traversal. \p CB is the aligned barrier, \p ED is the execution domain when
3079	// it was encountered.
3080	auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) {
3081	Changed \|= AlignedBarriers.insert(X: &CB);
3082	// First, update the barrier ED kept in the separate CEDMap.
3083	auto &CallInED = CEDMap [{&CB, PRE}];
3084	Changed \|= mergeInPredecessor(A, ED&: CallInED, PredED: ED);
3085	CallInED.IsReachingAlignedBarrierOnly = true;
3086	// Next adjust the ED we use for the traversal.
3087	ED.EncounteredNonLocalSideEffect = false;
3088	ED.IsReachedFromAlignedBarrierOnly = true;
3089	// Aligned barrier collection has to come last.
3090	ED.clearAssumeInstAndAlignedBarriers();
3091	ED.addAlignedBarrier(A, CB);
3092	auto &CallOutED = CEDMap [{&CB, POST}];
3093	Changed \|= mergeInPredecessor(A, ED&: CallOutED, PredED: ED);
3094	};
3095
3096	auto *LivenessAA =
3097	A.getAAFor<AAIsDead>(QueryingAA: *this, IRP: getIRPosition(), DepClass: DepClassTy::OPTIONAL);
3098
3099	Function *F = getAnchorScope();
3100	BasicBlock &EntryBB = F->getEntryBlock();
3101	bool IsKernel = omp::isOpenMPKernel(Fn&: *F);
3102
3103	SmallVector<Instruction *> SyncInstWorklist;
3104	for (auto &RIt : *RPOT) {
3105	BasicBlock &BB = *RIt;
3106
3107	bool IsEntryBB = &BB == &EntryBB;
3108	// TODO: We use local reasoning since we don't have a divergence analysis
3109	// running as well. We could basically allow uniform branches here.
3110	bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
3111	bool IsExplicitlyAligned = IsEntryBB && IsKernel;
3112	ExecutionDomainTy ED;
3113	// Propagate "incoming edges" into information about this block.
3114	if (IsEntryBB) {
3115	Changed \|= handleCallees(A, EntryBBED&: ED);
3116	} else {
3117	// For live non-entry blocks we only propagate
3118	// information via live edges.
3119	if (LivenessAA && LivenessAA->isAssumedDead(BB: &BB))
3120	continue;
3121
3122	for (auto *PredBB : predecessors(BB: &BB)) {
3123	if (LivenessAA && LivenessAA->isEdgeDead(From: PredBB, To: &BB))
3124	continue;
3125	bool InitialEdgeOnly = isInitialThreadOnlyEdge(
3126	A, Edge: dyn_cast<BranchInst>(Val: PredBB->getTerminator()), SuccessorBB&: BB);
3127	mergeInPredecessor(A, ED, PredED: BEDMap [PredBB], InitialEdgeOnly);
3128	}
3129	}
3130
3131	// Now we traverse the block, accumulate effects in ED and attach
3132	// information to calls.
3133	for (Instruction &I : BB) {
3134	bool UsedAssumedInformation;
3135	if (A.isAssumedDead(I, QueryingAA: *this, LivenessAA, UsedAssumedInformation,
3136	/ CheckBBLivenessOnly / false, DepClass: DepClassTy::OPTIONAL,
3137	/ CheckForDeadStore / true))
3138	continue;
3139
3140	// Asummes and "assume-like" (dbg, lifetime, ...) are handled first, the
3141	// former is collected the latter is ignored.
3142	if (auto *II = dyn_cast<IntrinsicInst>(Val: &I)) {
3143	if (auto *AI = dyn_cast_or_null<AssumeInst>(Val: II)) {
3144	ED.addAssumeInst(A, AI&: *AI);
3145	continue;
3146	}
3147	// TODO: Should we also collect and delete lifetime markers?
3148	if (II->isAssumeLikeIntrinsic())
3149	continue;
3150	}
3151
3152	if (auto *FI = dyn_cast<FenceInst>(Val: &I)) {
3153	if (!ED.EncounteredNonLocalSideEffect) {
3154	// An aligned fence without non-local side-effects is a no-op.
3155	if (ED.IsReachedFromAlignedBarrierOnly)
3156	continue;
3157	// A non-aligned fence without non-local side-effects is a no-op
3158	// if the ordering only publishes non-local side-effects (or less).
3159	switch (FI->getOrdering()) {
3160	case AtomicOrdering::NotAtomic:
3161	continue;
3162	case AtomicOrdering::Unordered:
3163	continue;
3164	case AtomicOrdering::Monotonic:
3165	continue;
3166	case AtomicOrdering::Acquire:
3167	break;
3168	case AtomicOrdering::Release:
3169	continue;
3170	case AtomicOrdering::AcquireRelease:
3171	break;
3172	case AtomicOrdering::SequentiallyConsistent:
3173	break;
3174	};
3175	}
3176	NonNoOpFences.insert(Ptr: FI);
3177	}
3178
3179	auto *CB = dyn_cast<CallBase>(Val: &I);
3180	bool IsNoSync = AA::isNoSyncInst(A, I, QueryingAA: *this);
3181	bool IsAlignedBarrier =
3182	!IsNoSync && CB &&
3183	AANoSync::isAlignedBarrier(CB: *CB, ExecutedAligned: AlignedBarrierLastInBlock);
3184
3185	AlignedBarrierLastInBlock &= IsNoSync;
3186	IsExplicitlyAligned &= IsNoSync;
3187
3188	// Next we check for calls. Aligned barriers are handled
3189	// explicitly, everything else is kept for the backward traversal and will
3190	// also affect our state.
3191	if (CB) {
3192	if (IsAlignedBarrier) {
3193	HandleAlignedBarrier (*CB, ED);
3194	AlignedBarrierLastInBlock = true;
3195	IsExplicitlyAligned = true;
3196	continue;
3197	}
3198
3199	// Check the pointer(s) of a memory intrinsic explicitly.
3200	if (isa<MemIntrinsic>(Val: &I)) {
3201	if (!ED.EncounteredNonLocalSideEffect &&
3202	AA::isPotentiallyAffectedByBarrier(A, I, QueryingAA: *this))
3203	ED.EncounteredNonLocalSideEffect = true;
3204	if (!IsNoSync) {
3205	ED.IsReachedFromAlignedBarrierOnly = false;
3206	SyncInstWorklist.push_back(Elt: &I);
3207	}
3208	continue;
3209	}
3210
3211	// Record how we entered the call, then accumulate the effect of the
3212	// call in ED for potential use by the callee.
3213	auto &CallInED = CEDMap [{CB, PRE}];
3214	Changed \|= mergeInPredecessor(A, ED&: CallInED, PredED: ED);
3215
3216	// If we have a sync-definition we can check if it starts/ends in an
3217	// aligned barrier. If we are unsure we assume any sync breaks
3218	// alignment.
3219	Function *Callee = CB->getCalledFunction();
3220	if (!IsNoSync && Callee && !Callee->isDeclaration()) {
3221	const auto *EDAA = A.getAAFor<AAExecutionDomain>(
3222	QueryingAA: *this, IRP: IRPosition::function(F: *Callee), DepClass: DepClassTy::OPTIONAL);
3223	if (EDAA && EDAA->getState().isValidState()) {
3224	const auto &CalleeED = EDAA->getFunctionExecutionDomain();
3225	ED.IsReachedFromAlignedBarrierOnly =
3226	CalleeED.IsReachedFromAlignedBarrierOnly;
3227	AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
3228	if (IsNoSync \|\| !CalleeED.IsReachedFromAlignedBarrierOnly)
3229	ED.EncounteredNonLocalSideEffect \|=
3230	CalleeED.EncounteredNonLocalSideEffect;
3231	else
3232	ED.EncounteredNonLocalSideEffect =
3233	CalleeED.EncounteredNonLocalSideEffect;
3234	if (!CalleeED.IsReachingAlignedBarrierOnly) {
3235	Changed \|=
3236	setAndRecord(R&: CallInED.IsReachingAlignedBarrierOnly, V: false);
3237	SyncInstWorklist.push_back(Elt: &I);
3238	}
3239	if (CalleeED.IsReachedFromAlignedBarrierOnly)
3240	mergeInPredecessorBarriersAndAssumptions(A, ED, PredED: CalleeED);
3241	auto &CallOutED = CEDMap [{CB, POST}];
3242	Changed \|= mergeInPredecessor(A, ED&: CallOutED, PredED: ED);
3243	continue;
3244	}
3245	}
3246	if (!IsNoSync) {
3247	ED.IsReachedFromAlignedBarrierOnly = false;
3248	Changed \|= setAndRecord(R&: CallInED.IsReachingAlignedBarrierOnly, V: false);
3249	SyncInstWorklist.push_back(Elt: &I);
3250	}
3251	AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
3252	ED.EncounteredNonLocalSideEffect \|= !CB->doesNotAccessMemory();
3253	auto &CallOutED = CEDMap [{CB, POST}];
3254	Changed \|= mergeInPredecessor(A, ED&: CallOutED, PredED: ED);
3255	}
3256
3257	if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())
3258	continue;
3259
3260	// If we have a callee we try to use fine-grained information to
3261	// determine local side-effects.
3262	if (CB) {
3263	const auto *MemAA = A.getAAFor<AAMemoryLocation>(
3264	QueryingAA: *this, IRP: IRPosition::callsite_function(CB: *CB), DepClass: DepClassTy::OPTIONAL);
3265
3266	auto AccessPred = [&](const Instruction I, const* Value *Ptr,
3267	AAMemoryLocation::AccessKind,
3268	AAMemoryLocation::MemoryLocationsKind) {
3269	return !AA::isPotentiallyAffectedByBarrier(A, Ptrs: {Ptr}, QueryingAA: *this, CtxI: I);
3270	};
3271	if (MemAA && MemAA->getState().isValidState() &&
3272	MemAA->checkForAllAccessesToMemoryKind(
3273	Pred: AccessPred, MLK: AAMemoryLocation::ALL_LOCATIONS))
3274	continue;
3275	}
3276
3277	auto &InfoCache = A.getInfoCache();
3278	if (!I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(I))
3279	continue;
3280
3281	if (auto *LI = dyn_cast<LoadInst>(Val: &I))
3282	if (LI->hasMetadata(KindID: LLVMContext::MD_invariant_load))
3283	continue;
3284
3285	if (!ED.EncounteredNonLocalSideEffect &&
3286	AA::isPotentiallyAffectedByBarrier(A, I, QueryingAA: *this))
3287	ED.EncounteredNonLocalSideEffect = true;
3288	}
3289
3290	bool IsEndAndNotReachingAlignedBarriersOnly = false;
3291	if (!isa<UnreachableInst>(Val: BB.getTerminator()) &&
3292	!BB.getTerminator()->getNumSuccessors()) {
3293
3294	Changed \|= mergeInPredecessor(A, ED&: InterProceduralED, PredED: ED);
3295
3296	auto &FnED = BEDMap [nullptr];
3297	if (IsKernel && !IsExplicitlyAligned)
3298	FnED.IsReachingAlignedBarrierOnly = false;
3299	Changed \|= mergeInPredecessor(A, ED&: FnED, PredED: ED);
3300
3301	if (!FnED.IsReachingAlignedBarrierOnly) {
3302	IsEndAndNotReachingAlignedBarriersOnly = true;
3303	SyncInstWorklist.push_back(Elt: BB.getTerminator());
3304	auto &BBED = BEDMap [&BB];
3305	Changed \|= setAndRecord(R&: BBED.IsReachingAlignedBarrierOnly, V: false);
3306	}
3307	}
3308
3309	ExecutionDomainTy &StoredED = BEDMap [&BB];
3310	ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
3311	!IsEndAndNotReachingAlignedBarriersOnly;
3312
3313	// Check if we computed anything different as part of the forward
3314	// traversal. We do not take assumptions and aligned barriers into account
3315	// as they do not influence the state we iterate. Backward traversal values
3316	// are handled later on.
3317	if (ED.IsExecutedByInitialThreadOnly !=
3318	StoredED.IsExecutedByInitialThreadOnly \|\|
3319	ED.IsReachedFromAlignedBarrierOnly !=
3320	StoredED.IsReachedFromAlignedBarrierOnly \|\|
3321	ED.EncounteredNonLocalSideEffect !=
3322	StoredED.EncounteredNonLocalSideEffect)
3323	Changed = true;
3324
3325	// Update the state with the new value.
3326	StoredED = std::move(ED);
3327	}
3328
3329	// Propagate (non-aligned) sync instruction effects backwards until the
3330	// entry is hit or an aligned barrier.
3331	SmallSetVector<BasicBlock *, `16`> Visited;
3332	while (!SyncInstWorklist.empty()) {
3333	Instruction *SyncInst = SyncInstWorklist.pop_back_val();
3334	Instruction *CurInst = SyncInst;
3335	bool HitAlignedBarrierOrKnownEnd = false;
3336	while ((CurInst = CurInst->getPrevNode())) {
3337	auto *CB = dyn_cast<CallBase>(Val: CurInst);
3338	if (!CB)
3339	continue;
3340	auto &CallOutED = CEDMap [{CB, POST}];
3341	Changed \|= setAndRecord(R&: CallOutED.IsReachingAlignedBarrierOnly, V: false);
3342	auto &CallInED = CEDMap [{CB, PRE}];
3343	HitAlignedBarrierOrKnownEnd =
3344	AlignedBarriers.count(key: CB) \|\| !CallInED.IsReachingAlignedBarrierOnly;
3345	if (HitAlignedBarrierOrKnownEnd)
3346	break;
3347	Changed \|= setAndRecord(R&: CallInED.IsReachingAlignedBarrierOnly, V: false);
3348	}
3349	if (HitAlignedBarrierOrKnownEnd)
3350	continue;
3351	BasicBlock *SyncBB = SyncInst->getParent();
3352	for (auto *PredBB : predecessors(BB: SyncBB)) {
3353	if (LivenessAA && LivenessAA->isEdgeDead(From: PredBB, To: SyncBB))
3354	continue;
3355	if (!Visited.insert(X: PredBB))
3356	continue;
3357	auto &PredED = BEDMap [PredBB];
3358	if (setAndRecord(R&: PredED.IsReachingAlignedBarrierOnly, V: false)) {
3359	Changed = true;
3360	SyncInstWorklist.push_back(Elt: PredBB->getTerminator());
3361	}
3362	}
3363	if (SyncBB != &EntryBB)
3364	continue;
3365	Changed \|=
3366	setAndRecord(R&: InterProceduralED.IsReachingAlignedBarrierOnly, V: false);
3367	}
3368
3369	return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
3370	}
3371
3372	/// Try to replace memory allocation calls called by a single thread with a
3373	/// static buffer of shared memory.
3374	struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {
3375	using Base = StateWrapper<BooleanState, AbstractAttribute>;
3376	AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base (IRP) {}
3377
3378	/// Create an abstract attribute view for the position \p IRP.
3379	static AAHeapToShared &createForPosition(const IRPosition &IRP,
3380	Attributor &A);
3381
3382	/// Returns true if HeapToShared conversion is assumed to be possible.
3383	virtual bool isAssumedHeapToShared(CallBase &CB) const = `0`;
3384
3385	/// Returns true if HeapToShared conversion is assumed and the CB is a
3386	/// callsite to a free operation to be removed.
3387	virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const = `0`;
3388
3389	/// See AbstractAttribute::getName().
3390	const std::string getName() const override { return "AAHeapToShared"; }
3391
3392	/// See AbstractAttribute::getIdAddr().
3393	const char getIdAddr() const* override { return &ID; }
3394
3395	/// This function should return true if the type of the \p AA is
3396	/// AAHeapToShared.
3397	static bool classof(const AbstractAttribute *AA) {
3398	return (AA->getIdAddr() == &ID);
3399	}
3400
3401	/// Unique ID (due to the unique address)
3402	static const char ID;
3403	};
3404
3405	struct AAHeapToSharedFunction : public AAHeapToShared {
3406	AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)
3407	: AAHeapToShared (IRP, A) {}
3408
3409	const std::string getAsStr(Attributor ) const* override {
3410	return "[AAHeapToShared] " + std::to_string(val: MallocCalls.size()) +
3411	" malloc calls eligible.";
3412	}
3413
3414	/// See AbstractAttribute::trackStatistics().
3415	void trackStatistics() const override {}
3416
3417	/// This functions finds free calls that will be removed by the
3418	/// HeapToShared transformation.
3419	void findPotentialRemovedFreeCalls(Attributor &A) {
3420	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3421	auto &FreeRFI = OMPInfoCache.RFIs [OMPRTL___kmpc_free_shared];
3422
3423	PotentialRemovedFreeCalls.clear();
3424	// Update free call users of found malloc calls.
3425	for (CallBase *CB : MallocCalls) {
3426	SmallVector<CallBase *, `4`> FreeCalls;
3427	for (auto *U : CB->users()) {
3428	CallBase *C = dyn_cast<CallBase>(Val: U);
3429	if (C && C->getCalledFunction() == FreeRFI.Declaration)
3430	FreeCalls.push_back(Elt: C);
3431	}
3432
3433	if (FreeCalls.size() != `1`)
3434	continue;
3435
3436	PotentialRemovedFreeCalls.insert(Ptr: FreeCalls.front());
3437	}
3438	}
3439
3440	void initialize(Attributor &A) override {
3441	if (DisableOpenMPOptDeglobalization) {
3442	indicatePessimisticFixpoint();
3443	return;
3444	}
3445
3446	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3447	auto &RFI = OMPInfoCache.RFIs [OMPRTL___kmpc_alloc_shared];
3448	if (!RFI.Declaration)
3449	return;
3450
3451	Attributor::SimplifictionCallbackTy SCB =
3452	[](const IRPosition &, const AbstractAttribute *,
3453	bool &) -> std::optional<Value > { return* nullptr; };
3454
3455	Function *F = getAnchorScope();
3456	for (User *U : RFI.Declaration->users())
3457	if (CallBase *CB = dyn_cast<CallBase>(Val: U)) {
3458	if (CB->getFunction() != F)
3459	continue;
3460	MallocCalls.insert(X: CB);
3461	A.registerSimplificationCallback(IRP: IRPosition::callsite_returned(CB: *CB),
3462	CB: SCB);
3463	}
3464
3465	findPotentialRemovedFreeCalls(A);
3466	}
3467
3468	bool isAssumedHeapToShared(CallBase &CB) const override {
3469	return isValidState() && MallocCalls.count(key: &CB);
3470	}
3471
3472	bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const override {
3473	return isValidState() && PotentialRemovedFreeCalls.count(Ptr: &CB);
3474	}
3475
3476	ChangeStatus manifest(Attributor &A) override {
3477	if (MallocCalls.empty())
3478	return ChangeStatus::UNCHANGED;
3479
3480	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3481	auto &FreeCall = OMPInfoCache.RFIs [OMPRTL___kmpc_free_shared];
3482
3483	Function *F = getAnchorScope();
3484	auto HS = A.lookupAAFor<AAHeapToStack>(IRP: IRPosition::function(F: F), QueryingAA: this,
3485	DepClass: DepClassTy::OPTIONAL);
3486
3487	ChangeStatus Changed = ChangeStatus::UNCHANGED;
3488	for (CallBase *CB : MallocCalls) {
3489	// Skip replacing this if HeapToStack has already claimed it.
3490	if (HS && HS->isAssumedHeapToStack(CB: *CB))
3491	continue;
3492
3493	// Find the unique free call to remove it.
3494	SmallVector<CallBase *, `4`> FreeCalls;
3495	for (auto *U : CB->users()) {
3496	CallBase *C = dyn_cast<CallBase>(Val: U);
3497	if (C && C->getCalledFunction() == FreeCall.Declaration)
3498	FreeCalls.push_back(Elt: C);
3499	}
3500	if (FreeCalls.size() != `1`)
3501	continue;
3502
3503	auto *AllocSize = cast<ConstantInt>(Val: CB->getArgOperand(i: `0`));
3504
3505	if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) {
3506	LLVM_DEBUG(dbgs() << TAG << "Cannot replace call " << *CB
3507	<< " with shared memory."
3508	<< " Shared memory usage is limited to "
3509	<< SharedMemoryLimit << " bytes\n");
3510	continue;
3511	}
3512
3513	LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB
3514	<< " with " << AllocSize->getZExtValue()
3515	<< " bytes of shared memory\n");
3516
3517	// Create a new shared memory buffer of the same size as the allocation
3518	// and replace all the uses of the original allocation with it.
3519	Module *M = CB->getModule();
3520	Type *Int8Ty = Type::getInt8Ty(C&: M->getContext());
3521	Type *Int8ArrTy = ArrayType::get(ElementType: Int8Ty, NumElements: AllocSize->getZExtValue());
3522	auto SharedMem = new* GlobalVariable (
3523	M, Int8ArrTy, /* IsConstant / false, GlobalValue::InternalLinkage,
3524	PoisonValue::get(T: Int8ArrTy), CB->getName() + "_shared", nullptr,
3525	GlobalValue::NotThreadLocal,
3526	static_cast<unsigned>(AddressSpace::Shared));
3527	auto *NewBuffer =
3528	ConstantExpr::getPointerCast(C: SharedMem, Ty: Int8Ty->getPointerTo());
3529
3530	auto Remark = [&](OptimizationRemark OR) {
3531	return OR << "Replaced globalized variable with "
3532	<< ore::NV ("SharedMemory", AllocSize->getZExtValue())
3533	<< (AllocSize->isOne() ? " byte " : " bytes ")
3534	<< "of shared memory.";
3535	};
3536	A.emitRemark<OptimizationRemark>(I: CB, RemarkName: "OMP111", RemarkCB&: Remark);
3537
3538	MaybeAlign Alignment = CB->getRetAlign();
3539	assert(Alignment &&
3540	"HeapToShared on allocation without alignment attribute");
3541	SharedMem->setAlignment(*Alignment);
3542
3543	A.changeAfterManifest(IRP: IRPosition::callsite_returned(CB: CB), NV&: NewBuffer);
3544	A.deleteAfterManifest(I&: *CB);
3545	A.deleteAfterManifest(I&: *FreeCalls.front());
3546
3547	SharedMemoryUsed += AllocSize->getZExtValue();
3548	NumBytesMovedToSharedMemory = SharedMemoryUsed;
3549	Changed = ChangeStatus::CHANGED;
3550	}
3551
3552	return Changed;
3553	}
3554
3555	ChangeStatus updateImpl(Attributor &A) override {
3556	if (MallocCalls.empty())
3557	return indicatePessimisticFixpoint();
3558	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3559	auto &RFI = OMPInfoCache.RFIs [OMPRTL___kmpc_alloc_shared];
3560	if (!RFI.Declaration)
3561	return ChangeStatus::UNCHANGED;
3562
3563	Function *F = getAnchorScope();
3564
3565	auto NumMallocCalls = MallocCalls.size();
3566
3567	// Only consider malloc calls executed by a single thread with a constant.
3568	for (User *U : RFI.Declaration->users()) {
3569	if (CallBase *CB = dyn_cast<CallBase>(Val: U)) {
3570	if (CB->getCaller() != F)
3571	continue;
3572	if (!MallocCalls.count(key: CB))
3573	continue;
3574	if (!isa<ConstantInt>(Val: CB->getArgOperand(i: `0`))) {
3575	MallocCalls.remove(X: CB);
3576	continue;
3577	}
3578	const auto *ED = A.getAAFor<AAExecutionDomain>(
3579	QueryingAA: *this, IRP: IRPosition::function(F: *F), DepClass: DepClassTy::REQUIRED);
3580	if (!ED \|\| !ED->isExecutedByInitialThreadOnly(I: *CB))
3581	MallocCalls.remove(X: CB);
3582	}
3583	}
3584
3585	findPotentialRemovedFreeCalls(A);
3586
3587	if (NumMallocCalls != MallocCalls.size())
3588	return ChangeStatus::CHANGED;
3589
3590	return ChangeStatus::UNCHANGED;
3591	}
3592
3593	/// Collection of all malloc calls in a function.
3594	SmallSetVector<CallBase *, `4`> MallocCalls;
3595	/// Collection of potentially removed free calls in a function.
3596	SmallPtrSet<CallBase *, `4`> PotentialRemovedFreeCalls;
3597	/// The total amount of shared memory that has been used for HeapToShared.
3598	unsigned SharedMemoryUsed = `0`;
3599	};
3600
3601	struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
3602	using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
3603	AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base (IRP) {}
3604
3605	/// The callee value is tracked beyond a simple stripPointerCasts, so we allow
3606	/// unknown callees.
3607	static bool requiresCalleeForCallBase() { return false; }
3608
3609	/// Statistics are tracked as part of manifest for now.
3610	void trackStatistics() const override {}
3611
3612	/// See AbstractAttribute::getAsStr()
3613	const std::string getAsStr(Attributor ) const* override {
3614	if (!isValidState())
3615	return "<invalid>";
3616	return std::string (SPMDCompatibilityTracker.isAssumed() ? "SPMD"
3617	: "generic") +
3618	std::string (SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
3619	: "") +
3620	std::string (" #PRs: ") +
3621	(ReachedKnownParallelRegions.isValidState()
3622	? std::to_string(val: ReachedKnownParallelRegions.size())
3623	: "<invalid>") +
3624	", #Unknown PRs: " +
3625	(ReachedUnknownParallelRegions.isValidState()
3626	? std::to_string(val: ReachedUnknownParallelRegions.size())
3627	: "<invalid>") +
3628	", #Reaching Kernels: " +
3629	(ReachingKernelEntries.isValidState()
3630	? std::to_string(val: ReachingKernelEntries.size())
3631	: "<invalid>") +
3632	", #ParLevels: " +
3633	(ParallelLevels.isValidState()
3634	? std::to_string(val: ParallelLevels.size())
3635	: "<invalid>") +
3636	", NestedPar: " + (NestedParallelism ? "yes" : "no");
3637	}
3638
3639	/// Create an abstract attribute biew for the position \p IRP.
3640	static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);
3641
3642	/// See AbstractAttribute::getName()
3643	const std::string getName() const override { return "AAKernelInfo"; }
3644
3645	/// See AbstractAttribute::getIdAddr()
3646	const char getIdAddr() const* override { return &ID; }
3647
3648	/// This function should return true if the type of the \p AA is AAKernelInfo
3649	static bool classof(const AbstractAttribute *AA) {
3650	return (AA->getIdAddr() == &ID);
3651	}
3652
3653	static const char ID;
3654	};
3655
3656	/// The function kernel info abstract attribute, basically, what can we say
3657	/// about a function with regards to the KernelInfoState.
3658	struct AAKernelInfoFunction : AAKernelInfo {
3659	AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
3660	: AAKernelInfo (IRP, A) {}
3661
3662	SmallPtrSet<Instruction *, `4`> GuardedInstructions;
3663
3664	SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
3665	return GuardedInstructions;
3666	}
3667
3668	void setConfigurationOfKernelEnvironment(ConstantStruct *ConfigC) {
3669	Constant *NewKernelEnvC = ConstantFoldInsertValueInstruction(
3670	Agg: KernelEnvC, Val: ConfigC, Idxs: {KernelInfo::ConfigurationIdx});
3671	assert(NewKernelEnvC && "Failed to create new kernel environment");
3672	KernelEnvC = cast<ConstantStruct>(Val: NewKernelEnvC);
3673	}
3674
3675	#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \
3676	void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \
3677	ConstantStruct *ConfigC = \
3678	KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \
3679	Constant *NewConfigC = ConstantFoldInsertValueInstruction( \
3680	ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \
3681	assert(NewConfigC && "Failed to create new configuration environment"); \
3682	setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC)); \
3683	}
3684
3685	KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(UseGenericStateMachine)
3686	KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MayUseNestedParallelism)
3687	KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(ExecMode)
3688	KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MinThreads)
3689	KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MaxThreads)
3690	KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MinTeams)
3691	KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MaxTeams)
3692
3693	#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER
3694
3695	/// See AbstractAttribute::initialize(...).
3696	void initialize(Attributor &A) override {
3697	// This is a high-level transform that might change the constant arguments
3698	// of the init and dinit calls. We need to tell the Attributor about this
3699	// to avoid other parts using the current constant value for simpliication.
3700	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3701
3702	Function *Fn = getAnchorScope();
3703
3704	OMPInformationCache::RuntimeFunctionInfo &InitRFI =
3705	OMPInfoCache.RFIs [OMPRTL___kmpc_target_init];
3706	OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
3707	OMPInfoCache.RFIs [OMPRTL___kmpc_target_deinit];
3708
3709	// For kernels we perform more initialization work, first we find the init
3710	// and deinit calls.
3711	auto StoreCallBase = [](Use &U,
3712	OMPInformationCache::RuntimeFunctionInfo &RFI,
3713	CallBase *&Storage) {
3714	CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, RFI: &RFI);
3715	assert(CB &&
3716	"Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
3717	assert(!Storage &&
3718	"Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
3719	Storage = CB;
3720	return false;
3721	};
3722	InitRFI.foreachUse(
3723	CB: [&](Use &U, Function &) {
3724	StoreCallBase(U, InitRFI, KernelInitCB);
3725	return false;
3726	},
3727	F: Fn);
3728	DeinitRFI.foreachUse(
3729	CB: [&](Use &U, Function &) {
3730	StoreCallBase(U, DeinitRFI, KernelDeinitCB);
3731	return false;
3732	},
3733	F: Fn);
3734
3735	// Ignore kernels without initializers such as global constructors.
3736	if (!KernelInitCB \|\| !KernelDeinitCB)
3737	return;
3738
3739	// Add itself to the reaching kernel and set IsKernelEntry.
3740	ReachingKernelEntries.insert(Elem: Fn);
3741	IsKernelEntry = true;
3742
3743	KernelEnvC =
3744	KernelInfo::getKernelEnvironementFromKernelInitCB(KernelInitCB);
3745	GlobalVariable *KernelEnvGV =
3746	KernelInfo::getKernelEnvironementGVFromKernelInitCB(KernelInitCB);
3747
3748	Attributor::GlobalVariableSimplifictionCallbackTy
3749	KernelConfigurationSimplifyCB =
3750	[&](const GlobalVariable &GV, const AbstractAttribute *AA,
3751	bool &UsedAssumedInformation) -> std::optional<Constant *> {
3752	if (!isAtFixpoint()) {
3753	if (!AA)
3754	return nullptr;
3755	UsedAssumedInformation = true;
3756	A.recordDependence(FromAA: *this, ToAA: *AA, DepClass: DepClassTy::OPTIONAL);
3757	}
3758	return KernelEnvC;
3759	};
3760
3761	A.registerGlobalVariableSimplificationCallback(
3762	GV: *KernelEnvGV, CB: KernelConfigurationSimplifyCB);
3763
3764	// Check if we know we are in SPMD-mode already.
3765	ConstantInt *ExecModeC =
3766	KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
3767	ConstantInt *AssumedExecModeC = ConstantInt::get(
3768	Ty: ExecModeC->getIntegerType(),
3769	V: ExecModeC->getSExtValue() \| OMP_TGT_EXEC_MODE_GENERIC_SPMD);
3770	if (ExecModeC->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)
3771	SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3772	else if (DisableOpenMPOptSPMDization)
3773	// This is a generic region but SPMDization is disabled so stop
3774	// tracking.
3775	SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3776	else
3777	setExecModeOfKernelEnvironment(AssumedExecModeC);
3778
3779	const Triple T(Fn->getParent()->getTargetTriple());
3780	auto *Int32Ty = Type::getInt32Ty(C&: Fn->getContext());
3781	auto [MinThreads, MaxThreads] =
3782	OpenMPIRBuilder::readThreadBoundsForKernel(T, Kernel&: *Fn);
3783	if (MinThreads)
3784	setMinThreadsOfKernelEnvironment(ConstantInt::get(Ty: Int32Ty, V: MinThreads));
3785	if (MaxThreads)
3786	setMaxThreadsOfKernelEnvironment(ConstantInt::get(Ty: Int32Ty, V: MaxThreads));
3787	auto [MinTeams, MaxTeams] =
3788	OpenMPIRBuilder::readTeamBoundsForKernel(T, Kernel&: *Fn);
3789	if (MinTeams)
3790	setMinTeamsOfKernelEnvironment(ConstantInt::get(Ty: Int32Ty, V: MinTeams));
3791	if (MaxTeams)
3792	setMaxTeamsOfKernelEnvironment(ConstantInt::get(Ty: Int32Ty, V: MaxTeams));
3793
3794	ConstantInt *MayUseNestedParallelismC =
3795	KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
3796	ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(
3797	Ty: MayUseNestedParallelismC->getIntegerType(), V: NestedParallelism);
3798	setMayUseNestedParallelismOfKernelEnvironment(
3799	AssumedMayUseNestedParallelismC);
3800
3801	if (!DisableOpenMPOptStateMachineRewrite) {
3802	ConstantInt *UseGenericStateMachineC =
3803	KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3804	KernelEnvC);
3805	ConstantInt *AssumedUseGenericStateMachineC =
3806	ConstantInt::get(Ty: UseGenericStateMachineC->getIntegerType(), V: false);
3807	setUseGenericStateMachineOfKernelEnvironment(
3808	AssumedUseGenericStateMachineC);
3809	}
3810
3811	// Register virtual uses of functions we might need to preserve.
3812	auto RegisterVirtualUse = [&](RuntimeFunction RFKind,
3813	Attributor::VirtualUseCallbackTy &CB) {
3814	if (!OMPInfoCache.RFIs [RFKind].Declaration)
3815	return;
3816	A.registerVirtualUseCallback(V: *OMPInfoCache.RFIs [RFKind].Declaration, CB);
3817	};
3818
3819	// Add a dependence to ensure updates if the state changes.
3820	auto AddDependence = [](Attributor &A, const AAKernelInfo *KI,
3821	const AbstractAttribute *QueryingAA) {
3822	if (QueryingAA) {
3823	A.recordDependence(FromAA: KI, ToAA: QueryingAA, DepClass: DepClassTy::OPTIONAL);
3824	}
3825	return true;
3826	};
3827
3828	Attributor::VirtualUseCallbackTy CustomStateMachineUseCB =
3829	[&](Attributor &A, const AbstractAttribute *QueryingAA) {
3830	// Whenever we create a custom state machine we will insert calls to
3831	// __kmpc_get_hardware_num_threads_in_block,
3832	// __kmpc_get_warp_size,
3833	// __kmpc_barrier_simple_generic,
3834	// __kmpc_kernel_parallel, and
3835	// __kmpc_kernel_end_parallel.
3836	// Not needed if we are on track for SPMDzation.
3837	if (SPMDCompatibilityTracker.isValidState())
3838	return AddDependence(A, this, QueryingAA);
3839	// Not needed if we can't rewrite due to an invalid state.
3840	if (!ReachedKnownParallelRegions.isValidState())
3841	return AddDependence(A, this, QueryingAA);
3842	return false;
3843	};
3844
3845	// Not needed if we are pre-runtime merge.
3846	if (!KernelInitCB->getCalledFunction()->isDeclaration()) {
3847	RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
3848	CustomStateMachineUseCB);
3849	RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
3850	RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
3851	CustomStateMachineUseCB);
3852	RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
3853	CustomStateMachineUseCB);
3854	RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
3855	CustomStateMachineUseCB);
3856	}
3857
3858	// If we do not perform SPMDzation we do not need the virtual uses below.
3859	if (SPMDCompatibilityTracker.isAtFixpoint())
3860	return;
3861
3862	Attributor::VirtualUseCallbackTy HWThreadIdUseCB =
3863	[&](Attributor &A, const AbstractAttribute *QueryingAA) {
3864	// Whenever we perform SPMDzation we will insert
3865	// __kmpc_get_hardware_thread_id_in_block calls.
3866	if (!SPMDCompatibilityTracker.isValidState())
3867	return AddDependence(A, this, QueryingAA);
3868	return false;
3869	};
3870	RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
3871	HWThreadIdUseCB);
3872
3873	Attributor::VirtualUseCallbackTy SPMDBarrierUseCB =
3874	[&](Attributor &A, const AbstractAttribute *QueryingAA) {
3875	// Whenever we perform SPMDzation with guarding we will insert
3876	// __kmpc_simple_barrier_spmd calls. If SPMDzation failed, there is
3877	// nothing to guard, or there are no parallel regions, we don't need
3878	// the calls.
3879	if (!SPMDCompatibilityTracker.isValidState())
3880	return AddDependence(A, this, QueryingAA);
3881	if (SPMDCompatibilityTracker.empty())
3882	return AddDependence(A, this, QueryingAA);
3883	if (!mayContainParallelRegion())
3884	return AddDependence(A, this, QueryingAA);
3885	return false;
3886	};
3887	RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
3888	}
3889
3890	/// Sanitize the string \p S such that it is a suitable global symbol name.
static std::string sanitizeForGlobalName(std::string S) {
  // Characters allowed to stay: ASCII letters, ASCII digits, and '_'.
  auto IsKept = [](const char C) {
    return (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
           (C >= '0' && C <= '9') || C == '_';
  };

  // Every other character is replaced by '.'; the length is unchanged.
  for (char &C : S)
    if (!IsKept(C))
      C = '.';
  return S;
}
3901
3902	/// Modify the IR based on the KernelInfoState as the fixpoint iteration is
3903	/// finished now.
3904	ChangeStatus manifest(Attributor &A) override {
3905	// If we are not looking at a kernel with __kmpc_target_init and
3906	// __kmpc_target_deinit call we cannot actually manifest the information.
3907	if (!KernelInitCB \|\| !KernelDeinitCB)
3908	return ChangeStatus::UNCHANGED;
3909
3910	ChangeStatus Changed = ChangeStatus::UNCHANGED;
3911
3912	bool HasBuiltStateMachine = true;
3913	if (!changeToSPMDMode(A, Changed)) {
3914	if (!KernelInitCB->getCalledFunction()->isDeclaration())
3915	HasBuiltStateMachine = buildCustomStateMachine(A, Changed);
3916	else
3917	HasBuiltStateMachine = false;
3918	}
3919
3920	// We need to reset KernelEnvC if specific rewriting is not done.
3921	ConstantStruct *ExistingKernelEnvC =
3922	KernelInfo::getKernelEnvironementFromKernelInitCB(KernelInitCB);
3923	ConstantInt *OldUseGenericStateMachineVal =
3924	KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3925	KernelEnvC: ExistingKernelEnvC);
3926	if (!HasBuiltStateMachine)
3927	setUseGenericStateMachineOfKernelEnvironment(
3928	OldUseGenericStateMachineVal);
3929
3930	// At last, update the KernelEnvc
3931	GlobalVariable *KernelEnvGV =
3932	KernelInfo::getKernelEnvironementGVFromKernelInitCB(KernelInitCB);
3933	if (KernelEnvGV->getInitializer() != KernelEnvC) {
3934	KernelEnvGV->setInitializer(KernelEnvC);
3935	Changed = ChangeStatus::CHANGED;
3936	}
3937
3938	return Changed;
3939	}
3940
3941	void insertInstructionGuardsHelper(Attributor &A) {
3942	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3943
3944	auto CreateGuardedRegion = [&](Instruction *RegionStartI,
3945	Instruction *RegionEndI) {
3946	LoopInfo LI = nullptr*;
3947	DominatorTree DT = nullptr*;
3948	MemorySSAUpdater MSU = nullptr*;
3949	using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
3950
3951	BasicBlock *ParentBB = RegionStartI->getParent();
3952	Function *Fn = ParentBB->getParent();
3953	Module &M = *Fn->getParent();
3954
3955	// Create all the blocks and logic.
3956	// ParentBB:
3957	// goto RegionCheckTidBB
3958	// RegionCheckTidBB:
3959	// Tid = __kmpc_hardware_thread_id()
3960	// if (Tid != 0)
3961	// goto RegionBarrierBB
3962	// RegionStartBB:
3963	// <execute instructions guarded>
3964	// goto RegionEndBB
3965	// RegionEndBB:
3966	// <store escaping values to shared mem>
3967	// goto RegionBarrierBB
3968	// RegionBarrierBB:
3969	// __kmpc_simple_barrier_spmd()
3970	// // second barrier is omitted if lacking escaping values.
3971	// <load escaping values from shared mem>
3972	// __kmpc_simple_barrier_spmd()
3973	// goto RegionExitBB
3974	// RegionExitBB:
3975	// <execute rest of instructions>
3976
3977	BasicBlock *RegionEndBB = SplitBlock(Old: ParentBB, SplitPt: RegionEndI->getNextNode(),
3978	DT, LI, MSSAU: MSU, BBName: "region.guarded.end");
3979	BasicBlock *RegionBarrierBB =
3980	SplitBlock(Old: RegionEndBB, SplitPt: &*RegionEndBB->getFirstInsertionPt(), DT, LI,
3981	MSSAU: MSU, BBName: "region.barrier");
3982	BasicBlock *RegionExitBB =
3983	SplitBlock(Old: RegionBarrierBB, SplitPt: &*RegionBarrierBB->getFirstInsertionPt(),
3984	DT, LI, MSSAU: MSU, BBName: "region.exit");
3985	BasicBlock *RegionStartBB =
3986	SplitBlock(Old: ParentBB, SplitPt: RegionStartI, DT, LI, MSSAU: MSU, BBName: "region.guarded");
3987
3988	assert(ParentBB->getUniqueSuccessor() == RegionStartBB &&
3989	"Expected a different CFG");
3990
3991	BasicBlock *RegionCheckTidBB = SplitBlock(
3992	Old: ParentBB, SplitPt: ParentBB->getTerminator(), DT, LI, MSSAU: MSU, BBName: "region.check.tid");
3993
3994	// Register basic blocks with the Attributor.
3995	A.registerManifestAddedBasicBlock(BB&: *RegionEndBB);
3996	A.registerManifestAddedBasicBlock(BB&: *RegionBarrierBB);
3997	A.registerManifestAddedBasicBlock(BB&: *RegionExitBB);
3998	A.registerManifestAddedBasicBlock(BB&: *RegionStartBB);
3999	A.registerManifestAddedBasicBlock(BB&: *RegionCheckTidBB);
4000
4001	bool HasBroadcastValues = false;
4002	// Find escaping outputs from the guarded region to outside users and
4003	// broadcast their values to them.
4004	for (Instruction &I : *RegionStartBB) {
4005	SmallVector<Use *, `4`> OutsideUses;
4006	for (Use &U : I.uses()) {
4007	Instruction &UsrI = *cast<Instruction>(Val: U.getUser());
4008	if (UsrI.getParent() != RegionStartBB)
4009	OutsideUses.push_back(Elt: &U);
4010	}
4011
4012	if (OutsideUses.empty())
4013	continue;
4014
4015	HasBroadcastValues = true;
4016
4017	// Emit a global variable in shared memory to store the broadcasted
4018	// value.
4019	auto SharedMem = new* GlobalVariable (
4020	M, I.getType(), / IsConstant / false,
4021	GlobalValue::InternalLinkage, UndefValue::get(T: I.getType()),
4022	sanitizeForGlobalName(
4023	S: (I.getName() + ".guarded.output.alloc").str()),
4024	nullptr, GlobalValue::NotThreadLocal,
4025	static_cast<unsigned>(AddressSpace::Shared));
4026
4027	// Emit a store instruction to update the value.
4028	new StoreInst (&I, SharedMem,
4029	RegionEndBB->getTerminator()->getIterator());
4030
4031	LoadInst LoadI = new* LoadInst (
4032	I.getType(), SharedMem, I.getName() + ".guarded.output.load",
4033	RegionBarrierBB->getTerminator()->getIterator());
4034
4035	// Emit a load instruction and replace uses of the output value.
4036	for (Use *U : OutsideUses)
4037	A.changeUseAfterManifest(U&: U, NV&: LoadI);
4038	}
4039
4040	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4041
4042	// Go to tid check BB in ParentBB.
4043	const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
4044	ParentBB->getTerminator()->eraseFromParent();
4045	OpenMPIRBuilder::LocationDescription Loc(
4046	InsertPointTy (ParentBB, ParentBB->end()), DL);
4047	OMPInfoCache.OMPBuilder.updateToLocation(Loc);
4048	uint32_t SrcLocStrSize;
4049	auto *SrcLocStr =
4050	OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4051	Value *Ident =
4052	OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4053	BranchInst::Create(IfTrue: RegionCheckTidBB, InsertBefore: ParentBB)->setDebugLoc(DL);
4054
4055	// Add check for Tid in RegionCheckTidBB
4056	RegionCheckTidBB->getTerminator()->eraseFromParent();
4057	OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
4058	InsertPointTy (RegionCheckTidBB, RegionCheckTidBB->end()), DL);
4059	OMPInfoCache.OMPBuilder.updateToLocation(Loc: LocRegionCheckTid);
4060	FunctionCallee HardwareTidFn =
4061	OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4062	M, FnID: OMPRTL___kmpc_get_hardware_thread_id_in_block);
4063	CallInst *Tid =
4064	OMPInfoCache.OMPBuilder.Builder.CreateCall(Callee: HardwareTidFn, Args: {});
4065	Tid->setDebugLoc(DL);
4066	OMPInfoCache.setCallingConvention(Callee: HardwareTidFn, CI: Tid);
4067	Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Arg: Tid);
4068	OMPInfoCache.OMPBuilder.Builder
4069	.CreateCondBr(Cond: TidCheck, True: RegionStartBB, False: RegionBarrierBB)
4070	->setDebugLoc(DL);
4071
4072	// First barrier for synchronization, ensures main thread has updated
4073	// values.
4074	FunctionCallee BarrierFn =
4075	OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4076	M, FnID: OMPRTL___kmpc_barrier_simple_spmd);
4077	OMPInfoCache.OMPBuilder.updateToLocation(Loc: InsertPointTy (
4078	RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt()));
4079	CallInst *Barrier =
4080	OMPInfoCache.OMPBuilder.Builder.CreateCall(Callee: BarrierFn, Args: {Ident, Tid});
4081	Barrier->setDebugLoc(DL);
4082	OMPInfoCache.setCallingConvention(Callee: BarrierFn, CI: Barrier);
4083
4084	// Second barrier ensures workers have read broadcast values.
4085	if (HasBroadcastValues) {
4086	CallInst *Barrier =
4087	CallInst::Create(Func: BarrierFn, Args: {Ident, Tid}, NameStr: "",
4088	InsertBefore: RegionBarrierBB->getTerminator()->getIterator());
4089	Barrier->setDebugLoc(DL);
4090	OMPInfoCache.setCallingConvention(Callee: BarrierFn, CI: Barrier);
4091	}
4092	};
4093
4094	auto &AllocSharedRFI = OMPInfoCache.RFIs [OMPRTL___kmpc_alloc_shared];
4095	SmallPtrSet<BasicBlock *, `8`> Visited;
4096	for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4097	BasicBlock *BB = GuardedI->getParent();
4098	if (!Visited.insert(Ptr: BB).second)
4099	continue;
4100
4101	SmallVector<std::pair<Instruction , Instruction >> Reorders;
4102	Instruction LastEffect = nullptr*;
4103	BasicBlock::reverse_iterator IP = BB->rbegin(), IPEnd = BB->rend();
4104	while (++IP != IPEnd) {
4105	if (!IP ->mayHaveSideEffects() && !IP ->mayReadFromMemory())
4106	continue;
4107	Instruction I = &IP;
4108	if (OpenMPOpt::getCallIfRegularCall(V&: *I, RFI: &AllocSharedRFI))
4109	continue;
4110	if (!I->user_empty() \|\| !SPMDCompatibilityTracker.contains(Elem: I)) {
4111	LastEffect = nullptr;
4112	continue;
4113	}
4114	if (LastEffect)
4115	Reorders.push_back(Elt: {I, LastEffect});
4116	LastEffect = &*IP;
4117	}
4118	for (auto &Reorder : Reorders)
4119	Reorder.first->moveBefore(MovePos: Reorder.second);
4120	}
4121
4122	SmallVector<std::pair<Instruction , Instruction >, `4`> GuardedRegions;
4123
4124	for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4125	BasicBlock *BB = GuardedI->getParent();
4126	auto *CalleeAA = A.lookupAAFor<AAKernelInfo>(
4127	IRP: IRPosition::function(F: GuardedI->getFunction()), QueryingAA: nullptr*,
4128	DepClass: DepClassTy::NONE);
4129	assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");
4130	auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(Val: CalleeAA);
4131	// Continue if instruction is already guarded.
4132	if (CalleeAAFunction.getGuardedInstructions().contains(Ptr: GuardedI))
4133	continue;
4134
4135	Instruction GuardedRegionStart = nullptr, GuardedRegionEnd = nullptr;
4136	for (Instruction &I : *BB) {
4137	// If instruction I needs to be guarded update the guarded region
4138	// bounds.
4139	if (SPMDCompatibilityTracker.contains(Elem: &I)) {
4140	CalleeAAFunction.getGuardedInstructions().insert(Ptr: &I);
4141	if (GuardedRegionStart)
4142	GuardedRegionEnd = &I;
4143	else
4144	GuardedRegionStart = GuardedRegionEnd = &I;
4145
4146	continue;
4147	}
4148
4149	// Instruction I does not need guarding, store
4150	// any region found and reset bounds.
4151	if (GuardedRegionStart) {
4152	GuardedRegions.push_back(
4153	Elt: std::make_pair(x&: GuardedRegionStart, y&: GuardedRegionEnd));
4154	GuardedRegionStart = nullptr;
4155	GuardedRegionEnd = nullptr;
4156	}
4157	}
4158	}
4159
4160	for (auto &GR : GuardedRegions)
4161	CreateGuardedRegion(GR.first, GR.second);
4162	}
4163
4164	void forceSingleThreadPerWorkgroupHelper(Attributor &A) {
4165	// Only allow 1 thread per workgroup to continue executing the user code.
4166	//
4167	// InitCB = __kmpc_target_init(...)
4168	// ThreadIdInBlock = __kmpc_get_hardware_thread_id_in_block();
4169	// if (ThreadIdInBlock != 0) return;
4170	// UserCode:
4171	// // user code
4172	//
4173	auto &Ctx = getAnchorValue().getContext();
4174	Function *Kernel = getAssociatedFunction();
4175	assert(Kernel && "Expected an associated function!");
4176
4177	// Create block for user code to branch to from initial block.
4178	BasicBlock *InitBB = KernelInitCB->getParent();
4179	BasicBlock *UserCodeBB = InitBB->splitBasicBlock(
4180	I: KernelInitCB->getNextNode(), BBName: "main.thread.user_code");
4181	BasicBlock *ReturnBB =
4182	BasicBlock::Create(Context&: Ctx, Name: "exit.threads", Parent: Kernel, InsertBefore: UserCodeBB);
4183
4184	// Register blocks with attributor:
4185	A.registerManifestAddedBasicBlock(BB&: *InitBB);
4186	A.registerManifestAddedBasicBlock(BB&: *UserCodeBB);
4187	A.registerManifestAddedBasicBlock(BB&: *ReturnBB);
4188
4189	// Debug location:
4190	const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4191	ReturnInst::Create(C&: Ctx, InsertAtEnd: ReturnBB)->setDebugLoc(DLoc);
4192	InitBB->getTerminator()->eraseFromParent();
4193
4194	// Prepare call to OMPRTL___kmpc_get_hardware_thread_id_in_block.
4195	Module &M = *Kernel->getParent();
4196	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4197	FunctionCallee ThreadIdInBlockFn =
4198	OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4199	M, FnID: OMPRTL___kmpc_get_hardware_thread_id_in_block);
4200
4201	// Get thread ID in block.
4202	CallInst *ThreadIdInBlock =
4203	CallInst::Create(Func: ThreadIdInBlockFn, NameStr: "thread_id.in.block", InsertBefore: InitBB);
4204	OMPInfoCache.setCallingConvention(Callee: ThreadIdInBlockFn, CI: ThreadIdInBlock);
4205	ThreadIdInBlock->setDebugLoc(DLoc);
4206
4207	// Eliminate all threads in the block with ID not equal to 0:
4208	Instruction *IsMainThread =
4209	ICmpInst::Create(Op: ICmpInst::ICmp, Pred: CmpInst::ICMP_NE, S1: ThreadIdInBlock,
4210	S2: ConstantInt::get(Ty: ThreadIdInBlock->getType(), V: `0`),
4211	Name: "thread.is_main", InsertBefore: InitBB);
4212	IsMainThread->setDebugLoc(DLoc);
4213	BranchInst::Create(IfTrue: ReturnBB, IfFalse: UserCodeBB, Cond: IsMainThread, InsertBefore: InitBB);
4214	}
4215
4216	bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
4217	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4218
4219	// We cannot change to SPMD mode if the runtime functions aren't availible.
4220	if (!OMPInfoCache.runtimeFnsAvailable(
4221	Fns: {OMPRTL___kmpc_get_hardware_thread_id_in_block,
4222	OMPRTL___kmpc_barrier_simple_spmd}))
4223	return false;
4224
4225	if (!SPMDCompatibilityTracker.isAssumed()) {
4226	for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
4227	if (!NonCompatibleI)
4228	continue;
4229
4230	// Skip diagnostics on calls to known OpenMP runtime functions for now.
4231	if (auto *CB = dyn_cast<CallBase>(Val: NonCompatibleI))
4232	if (OMPInfoCache.RTLFunctions.contains(V: CB->getCalledFunction()))
4233	continue;
4234
4235	auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4236	ORA << "Value has potential side effects preventing SPMD-mode "
4237	"execution";
4238	if (isa<CallBase>(Val: NonCompatibleI)) {
4239	ORA << ". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
4240	"the called function to override";
4241	}
4242	return ORA << ".";
4243	};
4244	A.emitRemark<OptimizationRemarkAnalysis>(I: NonCompatibleI, RemarkName: "OMP121",
4245	RemarkCB&: Remark);
4246
4247	LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: "
4248	<< *NonCompatibleI << "\n");
4249	}
4250
4251	return false;
4252	}
4253
4254	// Get the actual kernel, could be the caller of the anchor scope if we have
4255	// a debug wrapper.
4256	Function *Kernel = getAnchorScope();
4257	if (Kernel->hasLocalLinkage()) {
4258	assert(Kernel->hasOneUse() && "Unexpected use of debug kernel wrapper.");
4259	auto *CB = cast<CallBase>(Val: Kernel->user_back());
4260	Kernel = CB->getCaller();
4261	}
4262	assert(omp::isOpenMPKernel(*Kernel) && "Expected kernel function!");
4263
4264	// Check if the kernel is already in SPMD mode, if so, return success.
4265	ConstantStruct *ExistingKernelEnvC =
4266	KernelInfo::getKernelEnvironementFromKernelInitCB(KernelInitCB);
4267	auto *ExecModeC =
4268	KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC: ExistingKernelEnvC);
4269	const int8_t ExecModeVal = ExecModeC->getSExtValue();
4270	if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
4271	return true;
4272
4273	// We will now unconditionally modify the IR, indicate a change.
4274	Changed = ChangeStatus::CHANGED;
4275
4276	// Do not use instruction guards when no parallel is present inside
4277	// the target region.
4278	if (mayContainParallelRegion())
4279	insertInstructionGuardsHelper(A);
4280	else
4281	forceSingleThreadPerWorkgroupHelper(A);
4282
4283	// Adjust the global exec mode flag that tells the runtime what mode this
4284	// kernel is executed in.
4285	assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
4286	"Initially non-SPMD kernel has SPMD exec mode!");
4287	setExecModeOfKernelEnvironment(
4288	ConstantInt::get(Ty: ExecModeC->getIntegerType(),
4289	V: ExecModeVal \| OMP_TGT_EXEC_MODE_GENERIC_SPMD));
4290
4291	++NumOpenMPTargetRegionKernelsSPMD;
4292
4293	auto Remark = [&](OptimizationRemark OR) {
4294	return OR << "Transformed generic-mode kernel to SPMD-mode.";
4295	};
4296	A.emitRemark<OptimizationRemark>(I: KernelInitCB, RemarkName: "OMP120", RemarkCB&: Remark);
4297	return true;
4298	};
4299
4300	bool buildCustomStateMachine(Attributor &A, ChangeStatus &Changed) {
4301	// If we have disabled state machine rewrites, don't make a custom one
4302	if (DisableOpenMPOptStateMachineRewrite)
4303	return false;
4304
4305	// Don't rewrite the state machine if we are not in a valid state.
4306	if (!ReachedKnownParallelRegions.isValidState())
4307	return false;
4308
4309	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4310	if (!OMPInfoCache.runtimeFnsAvailable(
4311	Fns: {OMPRTL___kmpc_get_hardware_num_threads_in_block,
4312	OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
4313	OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
4314	return false;
4315
4316	ConstantStruct *ExistingKernelEnvC =
4317	KernelInfo::getKernelEnvironementFromKernelInitCB(KernelInitCB);
4318
4319	// Check if the current configuration is non-SPMD and generic state machine.
4320	// If we already have SPMD mode or a custom state machine we do not need to
4321	// go any further. If it is anything but a constant something is weird and
4322	// we give up.
4323	ConstantInt *UseStateMachineC =
4324	KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4325	KernelEnvC: ExistingKernelEnvC);
4326	ConstantInt *ModeC =
4327	KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC: ExistingKernelEnvC);
4328
4329	// If we are stuck with generic mode, try to create a custom device (=GPU)
4330	// state machine which is specialized for the parallel regions that are
4331	// reachable by the kernel.
4332	if (UseStateMachineC->isZero() \|\|
4333	(ModeC->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
4334	return false;
4335
4336	Changed = ChangeStatus::CHANGED;
4337
4338	// If not SPMD mode, indicate we use a custom state machine now.
4339	setUseGenericStateMachineOfKernelEnvironment(
4340	ConstantInt::get(Ty: UseStateMachineC->getIntegerType(), V: false));
4341
4342	// If we don't actually need a state machine we are done here. This can
4343	// happen if there simply are no parallel regions. In the resulting kernel
4344	// all worker threads will simply exit right away, leaving the main thread
4345	// to do the work alone.
4346	if (!mayContainParallelRegion()) {
4347	++NumOpenMPTargetRegionKernelsWithoutStateMachine;
4348
4349	auto Remark = [&](OptimizationRemark OR) {
4350	return OR << "Removing unused state machine from generic-mode kernel.";
4351	};
4352	A.emitRemark<OptimizationRemark>(I: KernelInitCB, RemarkName: "OMP130", RemarkCB&: Remark);
4353
4354	return true;
4355	}
4356
4357	// Keep track in the statistics of our new shiny custom state machine.
4358	if (ReachedUnknownParallelRegions.empty()) {
4359	++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
4360
4361	auto Remark = [&](OptimizationRemark OR) {
4362	return OR << "Rewriting generic-mode kernel with a customized state "
4363	"machine.";
4364	};
4365	A.emitRemark<OptimizationRemark>(I: KernelInitCB, RemarkName: "OMP131", RemarkCB&: Remark);
4366	} else {
4367	++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
4368
4369	auto Remark = [&](OptimizationRemarkAnalysis OR) {
4370	return OR << "Generic-mode kernel is executed with a customized state "
4371	"machine that requires a fallback.";
4372	};
4373	A.emitRemark<OptimizationRemarkAnalysis>(I: KernelInitCB, RemarkName: "OMP132", RemarkCB&: Remark);
4374
4375	// Tell the user why we ended up with a fallback.
4376	for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
4377	if (!UnknownParallelRegionCB)
4378	continue;
4379	auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4380	return ORA << "Call may contain unknown parallel regions. Use "
4381	<< "`[[omp::assume(\"omp_no_parallelism\")]]` to "
4382	"override.";
4383	};
4384	A.emitRemark<OptimizationRemarkAnalysis>(I: UnknownParallelRegionCB,
4385	RemarkName: "OMP133", RemarkCB&: Remark);
4386	}
4387	}
4388
4389	// Create all the blocks:
4390	//
4391	// InitCB = __kmpc_target_init(...)
4392	// BlockHwSize =
4393	// __kmpc_get_hardware_num_threads_in_block();
4394	// WarpSize = __kmpc_get_warp_size();
4395	// BlockSize = BlockHwSize - WarpSize;
4396	// IsWorkerCheckBB: bool IsWorker = InitCB != -1;
4397	// if (IsWorker) {
4398	// if (InitCB >= BlockSize) return;
4399	// SMBeginBB: __kmpc_barrier_simple_generic(...);
4400	// void WorkFn;*
4401	// bool Active = __kmpc_kernel_parallel(&WorkFn);
4402	// if (!WorkFn) return;
4403	// SMIsActiveCheckBB: if (Active) {
4404	// SMIfCascadeCurrentBB: if (WorkFn == <ParFn0>)
4405	// ParFn0(...);
4406	// SMIfCascadeCurrentBB: else if (WorkFn == <ParFn1>)
4407	// ParFn1(...);
4408	// ...
4409	// SMIfCascadeCurrentBB: else
4410	// ((WorkFnTy)WorkFn)(...);*
4411	// SMEndParallelBB: __kmpc_kernel_end_parallel(...);
4412	// }
4413	// SMDoneBB: __kmpc_barrier_simple_generic(...);
4414	// goto SMBeginBB;
4415	// }
4416	// UserCodeEntryBB: // user code
4417	// __kmpc_target_deinit(...)
4418	//
4419	auto &Ctx = getAnchorValue().getContext();
4420	Function *Kernel = getAssociatedFunction();
4421	assert(Kernel && "Expected an associated function!");
4422
4423	BasicBlock *InitBB = KernelInitCB->getParent();
4424	BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(
4425	I: KernelInitCB->getNextNode(), BBName: "thread.user_code.check");
4426	BasicBlock *IsWorkerCheckBB =
4427	BasicBlock::Create(Context&: Ctx, Name: "is_worker_check", Parent: Kernel, InsertBefore: UserCodeEntryBB);
4428	BasicBlock *StateMachineBeginBB = BasicBlock::Create(
4429	Context&: Ctx, Name: "worker_state_machine.begin", Parent: Kernel, InsertBefore: UserCodeEntryBB);
4430	BasicBlock *StateMachineFinishedBB = BasicBlock::Create(
4431	Context&: Ctx, Name: "worker_state_machine.finished", Parent: Kernel, InsertBefore: UserCodeEntryBB);
4432	BasicBlock *StateMachineIsActiveCheckBB = BasicBlock::Create(
4433	Context&: Ctx, Name: "worker_state_machine.is_active.check", Parent: Kernel, InsertBefore: UserCodeEntryBB);
4434	BasicBlock *StateMachineIfCascadeCurrentBB =
4435	BasicBlock::Create(Context&: Ctx, Name: "worker_state_machine.parallel_region.check",
4436	Parent: Kernel, InsertBefore: UserCodeEntryBB);
4437	BasicBlock *StateMachineEndParallelBB =
4438	BasicBlock::Create(Context&: Ctx, Name: "worker_state_machine.parallel_region.end",
4439	Parent: Kernel, InsertBefore: UserCodeEntryBB);
4440	BasicBlock *StateMachineDoneBarrierBB = BasicBlock::Create(
4441	Context&: Ctx, Name: "worker_state_machine.done.barrier", Parent: Kernel, InsertBefore: UserCodeEntryBB);
4442	A.registerManifestAddedBasicBlock(BB&: *InitBB);
4443	A.registerManifestAddedBasicBlock(BB&: *UserCodeEntryBB);
4444	A.registerManifestAddedBasicBlock(BB&: *IsWorkerCheckBB);
4445	A.registerManifestAddedBasicBlock(BB&: *StateMachineBeginBB);
4446	A.registerManifestAddedBasicBlock(BB&: *StateMachineFinishedBB);
4447	A.registerManifestAddedBasicBlock(BB&: *StateMachineIsActiveCheckBB);
4448	A.registerManifestAddedBasicBlock(BB&: *StateMachineIfCascadeCurrentBB);
4449	A.registerManifestAddedBasicBlock(BB&: *StateMachineEndParallelBB);
4450	A.registerManifestAddedBasicBlock(BB&: *StateMachineDoneBarrierBB);
4451
4452	const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4453	ReturnInst::Create(C&: Ctx, InsertAtEnd: StateMachineFinishedBB)->setDebugLoc(DLoc);
4454	InitBB->getTerminator()->eraseFromParent();
4455
4456	Instruction *IsWorker =
4457	ICmpInst::Create(Op: ICmpInst::ICmp, Pred: llvm::CmpInst::ICMP_NE, S1: KernelInitCB,
4458	S2: ConstantInt::get(Ty: KernelInitCB->getType(), V: -`1`),
4459	Name: "thread.is_worker", InsertBefore: InitBB);
4460	IsWorker->setDebugLoc(DLoc);
4461	BranchInst::Create(IfTrue: IsWorkerCheckBB, IfFalse: UserCodeEntryBB, Cond: IsWorker, InsertBefore: InitBB);
4462
4463	Module &M = *Kernel->getParent();
4464	FunctionCallee BlockHwSizeFn =
4465	OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4466	M, FnID: OMPRTL___kmpc_get_hardware_num_threads_in_block);
4467	FunctionCallee WarpSizeFn =
4468	OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4469	M, FnID: OMPRTL___kmpc_get_warp_size);
4470	CallInst *BlockHwSize =
4471	CallInst::Create(Func: BlockHwSizeFn, NameStr: "block.hw_size", InsertBefore: IsWorkerCheckBB);
4472	OMPInfoCache.setCallingConvention(Callee: BlockHwSizeFn, CI: BlockHwSize);
4473	BlockHwSize->setDebugLoc(DLoc);
4474	CallInst *WarpSize =
4475	CallInst::Create(Func: WarpSizeFn, NameStr: "warp.size", InsertBefore: IsWorkerCheckBB);
4476	OMPInfoCache.setCallingConvention(Callee: WarpSizeFn, CI: WarpSize);
4477	WarpSize->setDebugLoc(DLoc);
4478	Instruction *BlockSize = BinaryOperator::CreateSub(
4479	V1: BlockHwSize, V2: WarpSize, Name: "block.size", BB: IsWorkerCheckBB);
4480	BlockSize->setDebugLoc(DLoc);
4481	Instruction *IsMainOrWorker = ICmpInst::Create(
4482	Op: ICmpInst::ICmp, Pred: llvm::CmpInst::ICMP_SLT, S1: KernelInitCB, S2: BlockSize,
4483	Name: "thread.is_main_or_worker", InsertBefore: IsWorkerCheckBB);
4484	IsMainOrWorker->setDebugLoc(DLoc);
4485	BranchInst::Create(IfTrue: StateMachineBeginBB, IfFalse: StateMachineFinishedBB,
4486	Cond: IsMainOrWorker, InsertBefore: IsWorkerCheckBB);
4487
4488	// Create local storage for the work function pointer.
4489	const DataLayout &DL = M.getDataLayout();
4490	Type *VoidPtrTy = PointerType::getUnqual(C&: Ctx);
4491	Instruction *WorkFnAI =
4492	new AllocaInst (VoidPtrTy, DL.getAllocaAddrSpace(), nullptr,
4493	"worker.work_fn.addr", Kernel->getEntryBlock().begin());
4494	WorkFnAI->setDebugLoc(DLoc);
4495
4496	OMPInfoCache.OMPBuilder.updateToLocation(
4497	Loc: OpenMPIRBuilder::LocationDescription (
4498	IRBuilder<>::InsertPoint (StateMachineBeginBB,
4499	StateMachineBeginBB->end()),
4500	DLoc));
4501
4502	Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);
4503	Value *GTid = KernelInitCB;
4504
4505	FunctionCallee BarrierFn =
4506	OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4507	M, FnID: OMPRTL___kmpc_barrier_simple_generic);
4508	CallInst *Barrier =
4509	CallInst::Create(Func: BarrierFn, Args: {Ident, GTid}, NameStr: "", InsertBefore: StateMachineBeginBB);
4510	OMPInfoCache.setCallingConvention(Callee: BarrierFn, CI: Barrier);
4511	Barrier->setDebugLoc(DLoc);
4512
4513	if (WorkFnAI->getType()->getPointerAddressSpace() !=
4514	(unsigned int)AddressSpace::Generic) {
4515	WorkFnAI = new AddrSpaceCastInst (
4516	WorkFnAI, PointerType::get(C&: Ctx, AddressSpace: (unsigned int)AddressSpace::Generic),
4517	WorkFnAI->getName() + ".generic", StateMachineBeginBB);
4518	WorkFnAI->setDebugLoc(DLoc);
4519	}
4520
4521	FunctionCallee KernelParallelFn =
4522	OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4523	M, FnID: OMPRTL___kmpc_kernel_parallel);
4524	CallInst *IsActiveWorker = CallInst::Create(
4525	Func: KernelParallelFn, Args: {WorkFnAI}, NameStr: "worker.is_active", InsertBefore: StateMachineBeginBB);
4526	OMPInfoCache.setCallingConvention(Callee: KernelParallelFn, CI: IsActiveWorker);
4527	IsActiveWorker->setDebugLoc(DLoc);
4528	Instruction WorkFn = new* LoadInst (VoidPtrTy, WorkFnAI, "worker.work_fn",
4529	StateMachineBeginBB);
4530	WorkFn->setDebugLoc(DLoc);
4531
4532	FunctionType *ParallelRegionFnTy = FunctionType::get(
4533	Result: Type::getVoidTy(C&: Ctx), Params: {Type::getInt16Ty(C&: Ctx), Type::getInt32Ty(C&: Ctx)},
4534	isVarArg: false);
4535
4536	Instruction *IsDone =
4537	ICmpInst::Create(Op: ICmpInst::ICmp, Pred: llvm::CmpInst::ICMP_EQ, S1: WorkFn,
4538	S2: Constant::getNullValue(Ty: VoidPtrTy), Name: "worker.is_done",
4539	InsertBefore: StateMachineBeginBB);
4540	IsDone->setDebugLoc(DLoc);
4541	BranchInst::Create(IfTrue: StateMachineFinishedBB, IfFalse: StateMachineIsActiveCheckBB,
4542	Cond: IsDone, InsertBefore: StateMachineBeginBB)
4543	->setDebugLoc(DLoc);
4544
4545	BranchInst::Create(IfTrue: StateMachineIfCascadeCurrentBB,
4546	IfFalse: StateMachineDoneBarrierBB, Cond: IsActiveWorker,
4547	InsertBefore: StateMachineIsActiveCheckBB)
4548	->setDebugLoc(DLoc);
4549
4550	Value *ZeroArg =
4551	Constant::getNullValue(Ty: ParallelRegionFnTy->getParamType(i: `0`));
4552
4553	const unsigned int WrapperFunctionArgNo = `6`;
4554
4555	// Now that we have most of the CFG skeleton it is time for the if-cascade
4556	// that checks the function pointer we got from the runtime against the
4557	// parallel regions we expect, if there are any.
4558	for (int I = `0`, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
4559	auto *CB = ReachedKnownParallelRegions [I];
4560	auto *ParallelRegion = dyn_cast<Function>(
4561	Val: CB->getArgOperand(i: WrapperFunctionArgNo)->stripPointerCasts());
4562	BasicBlock *PRExecuteBB = BasicBlock::Create(
4563	Context&: Ctx, Name: "worker_state_machine.parallel_region.execute", Parent: Kernel,
4564	InsertBefore: StateMachineEndParallelBB);
4565	CallInst::Create(Func: ParallelRegion, Args: {ZeroArg, GTid}, NameStr: "", InsertBefore: PRExecuteBB)
4566	->setDebugLoc(DLoc);
4567	BranchInst::Create(IfTrue: StateMachineEndParallelBB, InsertBefore: PRExecuteBB)
4568	->setDebugLoc(DLoc);
4569
4570	BasicBlock *PRNextBB =
4571	BasicBlock::Create(Context&: Ctx, Name: "worker_state_machine.parallel_region.check",
4572	Parent: Kernel, InsertBefore: StateMachineEndParallelBB);
4573	A.registerManifestAddedBasicBlock(BB&: *PRExecuteBB);
4574	A.registerManifestAddedBasicBlock(BB&: *PRNextBB);
4575
4576	// Check if we need to compare the pointer at all or if we can just
4577	// call the parallel region function.
4578	Value *IsPR;
4579	if (I + `1` < E \|\| !ReachedUnknownParallelRegions.empty()) {
4580	Instruction *CmpI = ICmpInst::Create(
4581	Op: ICmpInst::ICmp, Pred: llvm::CmpInst::ICMP_EQ, S1: WorkFn, S2: ParallelRegion,
4582	Name: "worker.check_parallel_region", InsertBefore: StateMachineIfCascadeCurrentBB);
4583	CmpI->setDebugLoc(DLoc);
4584	IsPR = CmpI;
4585	} else {
4586	IsPR = ConstantInt::getTrue(Context&: Ctx);
4587	}
4588
4589	BranchInst::Create(IfTrue: PRExecuteBB, IfFalse: PRNextBB, Cond: IsPR,
4590	InsertBefore: StateMachineIfCascadeCurrentBB)
4591	->setDebugLoc(DLoc);
4592	StateMachineIfCascadeCurrentBB = PRNextBB;
4593	}
4594
4595	// At the end of the if-cascade we place the indirect function pointer call
4596	// in case we might need it, that is if there can be parallel regions we
4597	// have not handled in the if-cascade above.
4598	if (!ReachedUnknownParallelRegions.empty()) {
4599	StateMachineIfCascadeCurrentBB->setName(
4600	"worker_state_machine.parallel_region.fallback.execute");
4601	CallInst::Create(Ty: ParallelRegionFnTy, Func: WorkFn, Args: {ZeroArg, GTid}, NameStr: "",
4602	InsertBefore: StateMachineIfCascadeCurrentBB)
4603	->setDebugLoc(DLoc);
4604	}
4605	BranchInst::Create(IfTrue: StateMachineEndParallelBB,
4606	InsertBefore: StateMachineIfCascadeCurrentBB)
4607	->setDebugLoc(DLoc);
4608
4609	FunctionCallee EndParallelFn =
4610	OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4611	M, FnID: OMPRTL___kmpc_kernel_end_parallel);
4612	CallInst *EndParallel =
4613	CallInst::Create(Func: EndParallelFn, Args: {}, NameStr: "", InsertBefore: StateMachineEndParallelBB);
4614	OMPInfoCache.setCallingConvention(Callee: EndParallelFn, CI: EndParallel);
4615	EndParallel->setDebugLoc(DLoc);
4616	BranchInst::Create(IfTrue: StateMachineDoneBarrierBB, InsertBefore: StateMachineEndParallelBB)
4617	->setDebugLoc(DLoc);
4618
4619	CallInst::Create(Func: BarrierFn, Args: {Ident, GTid}, NameStr: "", InsertBefore: StateMachineDoneBarrierBB)
4620	->setDebugLoc(DLoc);
4621	BranchInst::Create(IfTrue: StateMachineBeginBB, InsertBefore: StateMachineDoneBarrierBB)
4622	->setDebugLoc(DLoc);
4623
4624	return true;
4625	}
4626
4627	/// Fixpoint iteration update function. Will be called every time a dependence
4628	/// changed its state (and in the beginning).
4629	ChangeStatus updateImpl(Attributor &A) override {
4630	KernelInfoState StateBefore = getState();
4631
4632	// When we leave this function this RAII will make sure the member
4633	// KernelEnvC is updated properly depending on the state. That member is
4634	// used for simplification of values and needs to be up to date at all
4635	// times.
4636	struct UpdateKernelEnvCRAII {
4637	AAKernelInfoFunction &AA;
4638
4639	UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
4640
4641	~UpdateKernelEnvCRAII() {
4642	if (!AA.KernelEnvC)
4643	return;
4644
4645	ConstantStruct *ExistingKernelEnvC =
4646	KernelInfo::getKernelEnvironementFromKernelInitCB(KernelInitCB: AA.KernelInitCB);
4647
4648	if (!AA.isValidState()) {
4649	AA.KernelEnvC = ExistingKernelEnvC;
4650	return;
4651	}
4652
4653	if (!AA.ReachedKnownParallelRegions.isValidState())
4654	AA.setUseGenericStateMachineOfKernelEnvironment(
4655	KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4656	KernelEnvC: ExistingKernelEnvC));
4657
4658	if (!AA.SPMDCompatibilityTracker.isValidState())
4659	AA.setExecModeOfKernelEnvironment(
4660	KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC: ExistingKernelEnvC));
4661
4662	ConstantInt *MayUseNestedParallelismC =
4663	KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(
4664	KernelEnvC: AA.KernelEnvC);
4665	ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
4666	Ty: MayUseNestedParallelismC->getIntegerType(), V: AA.NestedParallelism);
4667	AA.setMayUseNestedParallelismOfKernelEnvironment(
4668	NewMayUseNestedParallelismC);
4669	}
4670	} RAII(*this);
4671
4672	// Callback to check a read/write instruction.
4673	auto CheckRWInst = [&](Instruction &I) {
4674	// We handle calls later.
4675	if (isa<CallBase>(Val: I))
4676	return true;
4677	// We only care about write effects.
4678	if (!I.mayWriteToMemory())
4679	return true;
4680	if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
4681	const auto *UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>(
4682	QueryingAA: *this, IRP: IRPosition::value(V: *SI->getPointerOperand()),
4683	DepClass: DepClassTy::OPTIONAL);
4684	auto *HS = A.getAAFor<AAHeapToStack>(
4685	QueryingAA: *this, IRP: IRPosition::function(F: *I.getFunction()),
4686	DepClass: DepClassTy::OPTIONAL);
4687	if (UnderlyingObjsAA &&
4688	UnderlyingObjsAA->forallUnderlyingObjects(Pred: [&](Value &Obj) {
4689	if (AA::isAssumedThreadLocalObject(A, Obj, QueryingAA: *this))
4690	return true;
4691	// Check for AAHeapToStack moved objects which must not be
4692	// guarded.
4693	auto *CB = dyn_cast<CallBase>(Val: &Obj);
4694	return CB && HS && HS->isAssumedHeapToStack(CB: *CB);
4695	}))
4696	return true;
4697	}
4698
4699	// Insert instruction that needs guarding.
4700	SPMDCompatibilityTracker.insert(Elem: &I);
4701	return true;
4702	};
4703
4704	bool UsedAssumedInformationInCheckRWInst = false;
4705	if (!SPMDCompatibilityTracker.isAtFixpoint())
4706	if (!A.checkForAllReadWriteInstructions(
4707	Pred: CheckRWInst, QueryingAA&: *this, UsedAssumedInformation&: UsedAssumedInformationInCheckRWInst))
4708	SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4709
4710	bool UsedAssumedInformationFromReachingKernels = false;
4711	if (!IsKernelEntry) {
4712	updateParallelLevels(A);
4713
4714	bool AllReachingKernelsKnown = true;
4715	updateReachingKernelEntries(A, AllReachingKernelsKnown);
4716	UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
4717
4718	if (!SPMDCompatibilityTracker.empty()) {
4719	if (!ParallelLevels.isValidState())
4720	SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4721	else if (!ReachingKernelEntries.isValidState())
4722	SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4723	else {
4724	// Check if all reaching kernels agree on the mode as we can otherwise
4725	// not guard instructions. We might not be sure about the mode so we
4726	// we cannot fix the internal spmd-zation state either.
4727	int SPMD = `0`, Generic = `0`;
4728	for (auto *Kernel : ReachingKernelEntries) {
4729	auto *CBAA = A.getAAFor<AAKernelInfo>(
4730	QueryingAA: *this, IRP: IRPosition::function(F: *Kernel), DepClass: DepClassTy::OPTIONAL);
4731	if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
4732	CBAA->SPMDCompatibilityTracker.isAssumed())
4733	++SPMD;
4734	else
4735	++Generic;
4736	if (!CBAA \|\| !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
4737	UsedAssumedInformationFromReachingKernels = true;
4738	}
4739	if (SPMD != `0` && Generic != `0`)
4740	SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4741	}
4742	}
4743	}
4744
4745	// Callback to check a call instruction.
4746	bool AllParallelRegionStatesWereFixed = true;
4747	bool AllSPMDStatesWereFixed = true;
4748	auto CheckCallInst = [&](Instruction &I) {
4749	auto &CB = cast<CallBase>(Val&: I);
4750	auto *CBAA = A.getAAFor<AAKernelInfo>(
4751	QueryingAA: *this, IRP: IRPosition::callsite_function(CB), DepClass: DepClassTy::OPTIONAL);
4752	if (!CBAA)
4753	return false;
4754	getState() ^= CBAA->getState();
4755	AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
4756	AllParallelRegionStatesWereFixed &=
4757	CBAA->ReachedKnownParallelRegions.isAtFixpoint();
4758	AllParallelRegionStatesWereFixed &=
4759	CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
4760	return true;
4761	};
4762
4763	bool UsedAssumedInformationInCheckCallInst = false;
4764	if (!A.checkForAllCallLikeInstructions(
4765	Pred: CheckCallInst, QueryingAA: *this, UsedAssumedInformation&: UsedAssumedInformationInCheckCallInst)) {
4766	LLVM_DEBUG(dbgs() << TAG
4767	<< "Failed to visit all call-like instructions!\n";);
4768	return indicatePessimisticFixpoint();
4769	}
4770
4771	// If we haven't used any assumed information for the reached parallel
4772	// region states we can fix it.
4773	if (!UsedAssumedInformationInCheckCallInst &&
4774	AllParallelRegionStatesWereFixed) {
4775	ReachedKnownParallelRegions.indicateOptimisticFixpoint();
4776	ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
4777	}
4778
4779	// If we haven't used any assumed information for the SPMD state we can fix
4780	// it.
4781	if (!UsedAssumedInformationInCheckRWInst &&
4782	!UsedAssumedInformationInCheckCallInst &&
4783	!UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
4784	SPMDCompatibilityTracker.indicateOptimisticFixpoint();
4785
4786	return StateBefore == getState() ? ChangeStatus::UNCHANGED
4787	: ChangeStatus::CHANGED;
4788	}
4789
4790	private:
4791	/// Update info regarding reaching kernels.
4792	void updateReachingKernelEntries(Attributor &A,
4793	bool &AllReachingKernelsKnown) {
4794	auto PredCallSite = [&](AbstractCallSite ACS) {
4795	Function *Caller = ACS.getInstruction()->getFunction();
4796
4797	assert(Caller && "Caller is nullptr");
4798
4799	auto *CAA = A.getOrCreateAAFor<AAKernelInfo>(
4800	IRP: IRPosition::function(F: Caller), QueryingAA: this*, DepClass: DepClassTy::REQUIRED);
4801	if (CAA && CAA->ReachingKernelEntries.isValidState()) {
4802	ReachingKernelEntries ^= CAA->ReachingKernelEntries;
4803	return true;
4804	}
4805
4806	// We lost track of the caller of the associated function, any kernel
4807	// could reach now.
4808	ReachingKernelEntries.indicatePessimisticFixpoint();
4809
4810	return true;
4811	};
4812
4813	if (!A.checkForAllCallSites(Pred: PredCallSite, QueryingAA: *this,
4814	RequireAllCallSites: true / RequireAllCallSites /,
4815	UsedAssumedInformation&: AllReachingKernelsKnown))
4816	ReachingKernelEntries.indicatePessimisticFixpoint();
4817	}
4818
4819	/// Update info regarding parallel levels.
4820	void updateParallelLevels(Attributor &A) {
4821	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4822	OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
4823	OMPInfoCache.RFIs [OMPRTL___kmpc_parallel_51];
4824
4825	auto PredCallSite = [&](AbstractCallSite ACS) {
4826	Function *Caller = ACS.getInstruction()->getFunction();
4827
4828	assert(Caller && "Caller is nullptr");
4829
4830	auto *CAA =
4831	A.getOrCreateAAFor<AAKernelInfo>(IRP: IRPosition::function(F: *Caller));
4832	if (CAA && CAA->ParallelLevels.isValidState()) {
4833	// Any function that is called by `__kmpc_parallel_51` will not be
4834	// folded as the parallel level in the function is updated. In order to
4835	// get it right, all the analysis would depend on the implentation. That
4836	// said, if in the future any change to the implementation, the analysis
4837	// could be wrong. As a consequence, we are just conservative here.
4838	if (Caller == Parallel51RFI.Declaration) {
4839	ParallelLevels.indicatePessimisticFixpoint();
4840	return true;
4841	}
4842
4843	ParallelLevels ^= CAA->ParallelLevels;
4844
4845	return true;
4846	}
4847
4848	// We lost track of the caller of the associated function, any kernel
4849	// could reach now.
4850	ParallelLevels.indicatePessimisticFixpoint();
4851
4852	return true;
4853	};
4854
4855	bool AllCallSitesKnown = true;
4856	if (!A.checkForAllCallSites(Pred: PredCallSite, QueryingAA: *this,
4857	RequireAllCallSites: true / RequireAllCallSites /,
4858	UsedAssumedInformation&: AllCallSitesKnown))
4859	ParallelLevels.indicatePessimisticFixpoint();
4860	}
4861	};
4862
4863	/// The call site kernel info abstract attribute, basically, what can we say
4864	/// about a call site with regards to the KernelInfoState. For now this simply
4865	/// forwards the information from the callee.
4866	struct AAKernelInfoCallSite : AAKernelInfo {
4867	AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)
4868	: AAKernelInfo (IRP, A) {}
4869
4870	/// See AbstractAttribute::initialize(...).
4871	void initialize(Attributor &A) override {
4872	AAKernelInfo::initialize(A);
4873
4874	CallBase &CB = cast<CallBase>(Val&: getAssociatedValue());
4875	auto *AssumptionAA = A.getAAFor<AAAssumptionInfo>(
4876	QueryingAA: *this, IRP: IRPosition::callsite_function(CB), DepClass: DepClassTy::OPTIONAL);
4877
4878	// Check for SPMD-mode assumptions.
4879	if (AssumptionAA && AssumptionAA->hasAssumption(Assumption: "ompx_spmd_amenable")) {
4880	indicateOptimisticFixpoint();
4881	return;
4882	}
4883
4884	// First weed out calls we do not care about, that is readonly/readnone
4885	// calls, intrinsics, and "no_openmp" calls. Neither of these can reach a
4886	// parallel region or anything else we are looking for.
4887	if (!CB.mayWriteToMemory() \|\| isa<IntrinsicInst>(Val: CB)) {
4888	indicateOptimisticFixpoint();
4889	return;
4890	}
4891
4892	// Next we check if we know the callee. If it is a known OpenMP function
4893	// we will handle them explicitly in the switch below. If it is not, we
4894	// will use an AAKernelInfo object on the callee to gather information and
4895	// merge that into the current state. The latter happens in the updateImpl.
4896	auto CheckCallee = [&](Function Callee, unsigned* NumCallees) {
4897	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4898	const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Val: Callee);
4899	if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4900	// Unknown caller or declarations are not analyzable, we give up.
4901	if (!Callee \|\| !A.isFunctionIPOAmendable(F: *Callee)) {
4902
4903	// Unknown callees might contain parallel regions, except if they have
4904	// an appropriate assumption attached.
4905	if (!AssumptionAA \|\|
4906	!(AssumptionAA->hasAssumption(Assumption: "omp_no_openmp") \|\|
4907	AssumptionAA->hasAssumption(Assumption: "omp_no_parallelism")))
4908	ReachedUnknownParallelRegions.insert(Elem: &CB);
4909
4910	// If SPMDCompatibilityTracker is not fixed, we need to give up on the
4911	// idea we can run something unknown in SPMD-mode.
4912	if (!SPMDCompatibilityTracker.isAtFixpoint()) {
4913	SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4914	SPMDCompatibilityTracker.insert(Elem: &CB);
4915	}
4916
4917	// We have updated the state for this unknown call properly, there
4918	// won't be any change so we indicate a fixpoint.
4919	indicateOptimisticFixpoint();
4920	}
4921	// If the callee is known and can be used in IPO, we will update the
4922	// state based on the callee state in updateImpl.
4923	return;
4924	}
4925	if (NumCallees > `1`) {
4926	indicatePessimisticFixpoint();
4927	return;
4928	}
4929
4930	RuntimeFunction RF = It ->getSecond();
4931	switch (RF) {
4932	// All the functions we know are compatible with SPMD mode.
4933	case OMPRTL___kmpc_is_spmd_exec_mode:
4934	case OMPRTL___kmpc_distribute_static_fini:
4935	case OMPRTL___kmpc_for_static_fini:
4936	case OMPRTL___kmpc_global_thread_num:
4937	case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4938	case OMPRTL___kmpc_get_hardware_num_blocks:
4939	case OMPRTL___kmpc_single:
4940	case OMPRTL___kmpc_end_single:
4941	case OMPRTL___kmpc_master:
4942	case OMPRTL___kmpc_end_master:
4943	case OMPRTL___kmpc_barrier:
4944	case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
4945	case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
4946	case OMPRTL___kmpc_error:
4947	case OMPRTL___kmpc_flush:
4948	case OMPRTL___kmpc_get_hardware_thread_id_in_block:
4949	case OMPRTL___kmpc_get_warp_size:
4950	case OMPRTL_omp_get_thread_num:
4951	case OMPRTL_omp_get_num_threads:
4952	case OMPRTL_omp_get_max_threads:
4953	case OMPRTL_omp_in_parallel:
4954	case OMPRTL_omp_get_dynamic:
4955	case OMPRTL_omp_get_cancellation:
4956	case OMPRTL_omp_get_nested:
4957	case OMPRTL_omp_get_schedule:
4958	case OMPRTL_omp_get_thread_limit:
4959	case OMPRTL_omp_get_supported_active_levels:
4960	case OMPRTL_omp_get_max_active_levels:
4961	case OMPRTL_omp_get_level:
4962	case OMPRTL_omp_get_ancestor_thread_num:
4963	case OMPRTL_omp_get_team_size:
4964	case OMPRTL_omp_get_active_level:
4965	case OMPRTL_omp_in_final:
4966	case OMPRTL_omp_get_proc_bind:
4967	case OMPRTL_omp_get_num_places:
4968	case OMPRTL_omp_get_num_procs:
4969	case OMPRTL_omp_get_place_proc_ids:
4970	case OMPRTL_omp_get_place_num:
4971	case OMPRTL_omp_get_partition_num_places:
4972	case OMPRTL_omp_get_partition_place_nums:
4973	case OMPRTL_omp_get_wtime:
4974	break;
4975	case OMPRTL___kmpc_distribute_static_init_4:
4976	case OMPRTL___kmpc_distribute_static_init_4u:
4977	case OMPRTL___kmpc_distribute_static_init_8:
4978	case OMPRTL___kmpc_distribute_static_init_8u:
4979	case OMPRTL___kmpc_for_static_init_4:
4980	case OMPRTL___kmpc_for_static_init_4u:
4981	case OMPRTL___kmpc_for_static_init_8:
4982	case OMPRTL___kmpc_for_static_init_8u: {
4983	// Check the schedule and allow static schedule in SPMD mode.
4984	unsigned ScheduleArgOpNo = `2`;
4985	auto *ScheduleTypeCI =
4986	dyn_cast<ConstantInt>(Val: CB.getArgOperand(i: ScheduleArgOpNo));
4987	unsigned ScheduleTypeVal =
4988	ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : `0`;
4989	switch (OMPScheduleType(ScheduleTypeVal)) {
4990	case OMPScheduleType::UnorderedStatic:
4991	case OMPScheduleType::UnorderedStaticChunked:
4992	case OMPScheduleType::OrderedDistribute:
4993	case OMPScheduleType::OrderedDistributeChunked:
4994	break;
4995	default:
4996	SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4997	SPMDCompatibilityTracker.insert(Elem: &CB);
4998	break;
4999	};
5000	} break;
5001	case OMPRTL___kmpc_target_init:
5002	KernelInitCB = &CB;
5003	break;
5004	case OMPRTL___kmpc_target_deinit:
5005	KernelDeinitCB = &CB;
5006	break;
5007	case OMPRTL___kmpc_parallel_51:
5008	if (!handleParallel51(A, CB))
5009	indicatePessimisticFixpoint();
5010	return;
5011	case OMPRTL___kmpc_omp_task:
5012	// We do not look into tasks right now, just give up.
5013	SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5014	SPMDCompatibilityTracker.insert(Elem: &CB);
5015	ReachedUnknownParallelRegions.insert(Elem: &CB);
5016	break;
5017	case OMPRTL___kmpc_alloc_shared:
5018	case OMPRTL___kmpc_free_shared:
5019	// Return without setting a fixpoint, to be resolved in updateImpl.
5020	return;
5021	default:
5022	// Unknown OpenMP runtime calls cannot be executed in SPMD-mode,
5023	// generally. However, they do not hide parallel regions.
5024	SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5025	SPMDCompatibilityTracker.insert(Elem: &CB);
5026	break;
5027	}
5028	// All other OpenMP runtime calls will not reach parallel regions so they
5029	// can be safely ignored for now. Since it is a known OpenMP runtime call
5030	// we have now modeled all effects and there is no need for any update.
5031	indicateOptimisticFixpoint();
5032	};
5033
5034	const auto *AACE =
5035	A.getAAFor<AACallEdges>(QueryingAA: *this, IRP: getIRPosition(), DepClass: DepClassTy::OPTIONAL);
5036	if (!AACE \|\| !AACE->getState().isValidState() \|\| AACE->hasUnknownCallee()) {
5037	CheckCallee(getAssociatedFunction(), `1`);
5038	return;
5039	}
5040	const auto &OptimisticEdges = AACE->getOptimisticEdges();
5041	for (auto *Callee : OptimisticEdges) {
5042	CheckCallee(Callee, OptimisticEdges.size());
5043	if (isAtFixpoint())
5044	break;
5045	}
5046	}
5047
5048	ChangeStatus updateImpl(Attributor &A) override {
5049	// TODO: Once we have call site specific value information we can provide
5050	// call site specific liveness information and then it makes
5051	// sense to specialize attributes for call sites arguments instead of
5052	// redirecting requests to the callee argument.
5053	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
5054	KernelInfoState StateBefore = getState();
5055
5056	auto CheckCallee = [&](Function F, int* NumCallees) {
5057	const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Val: F);
5058
5059	// If F is not a runtime function, propagate the AAKernelInfo of the
5060	// callee.
5061	if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
5062	const IRPosition &FnPos = IRPosition::function(F: *F);
5063	auto *FnAA =
5064	A.getAAFor<AAKernelInfo>(QueryingAA: *this, IRP: FnPos, DepClass: DepClassTy::REQUIRED);
5065	if (!FnAA)
5066	return indicatePessimisticFixpoint();
5067	if (getState() == FnAA->getState())
5068	return ChangeStatus::UNCHANGED;
5069	getState() = FnAA->getState();
5070	return ChangeStatus::CHANGED;
5071	}
5072	if (NumCallees > `1`)
5073	return indicatePessimisticFixpoint();
5074
5075	CallBase &CB = cast<CallBase>(Val&: getAssociatedValue());
5076	if (It ->getSecond() == OMPRTL___kmpc_parallel_51) {
5077	if (!handleParallel51(A, CB))
5078	return indicatePessimisticFixpoint();
5079	return StateBefore == getState() ? ChangeStatus::UNCHANGED
5080	: ChangeStatus::CHANGED;
5081	}
5082
5083	// F is a runtime function that allocates or frees memory, check
5084	// AAHeapToStack and AAHeapToShared.
5085	assert(
5086	(It->getSecond() == OMPRTL___kmpc_alloc_shared \|\|
5087	It->getSecond() == OMPRTL___kmpc_free_shared) &&
5088	"Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
5089
5090	auto *HeapToStackAA = A.getAAFor<AAHeapToStack>(
5091	QueryingAA: *this, IRP: IRPosition::function(F: *CB.getCaller()), DepClass: DepClassTy::OPTIONAL);
5092	auto *HeapToSharedAA = A.getAAFor<AAHeapToShared>(
5093	QueryingAA: *this, IRP: IRPosition::function(F: *CB.getCaller()), DepClass: DepClassTy::OPTIONAL);
5094
5095	RuntimeFunction RF = It ->getSecond();
5096
5097	switch (RF) {
5098	// If neither HeapToStack nor HeapToShared assume the call is removed,
5099	// assume SPMD incompatibility.
5100	case OMPRTL___kmpc_alloc_shared:
5101	if ((!HeapToStackAA \|\| !HeapToStackAA->isAssumedHeapToStack(CB)) &&
5102	(!HeapToSharedAA \|\| !HeapToSharedAA->isAssumedHeapToShared(CB)))
5103	SPMDCompatibilityTracker.insert(Elem: &CB);
5104	break;
5105	case OMPRTL___kmpc_free_shared:
5106	if ((!HeapToStackAA \|\|
5107	!HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
5108	(!HeapToSharedAA \|\|
5109	!HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
5110	SPMDCompatibilityTracker.insert(Elem: &CB);
5111	break;
5112	default:
5113	SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5114	SPMDCompatibilityTracker.insert(Elem: &CB);
5115	}
5116	return ChangeStatus::CHANGED;
5117	};
5118
5119	const auto *AACE =
5120	A.getAAFor<AACallEdges>(QueryingAA: *this, IRP: getIRPosition(), DepClass: DepClassTy::OPTIONAL);
5121	if (!AACE \|\| !AACE->getState().isValidState() \|\| AACE->hasUnknownCallee()) {
5122	if (Function *F = getAssociatedFunction())
5123	CheckCallee(F, /NumCallees=/`1`);
5124	} else {
5125	const auto &OptimisticEdges = AACE->getOptimisticEdges();
5126	for (auto *Callee : OptimisticEdges) {
5127	CheckCallee(Callee, OptimisticEdges.size());
5128	if (isAtFixpoint())
5129	break;
5130	}
5131	}
5132
5133	return StateBefore == getState() ? ChangeStatus::UNCHANGED
5134	: ChangeStatus::CHANGED;
5135	}
5136
5137	/// Deal with a __kmpc_parallel_51 call (\p CB). Returns true if the call was
5138	/// handled, if a problem occurred, false is returned.
5139	bool handleParallel51(Attributor &A, CallBase &CB) {
5140	const unsigned int NonWrapperFunctionArgNo = `5`;
5141	const unsigned int WrapperFunctionArgNo = `6`;
5142	auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
5143	? NonWrapperFunctionArgNo
5144	: WrapperFunctionArgNo;
5145
5146	auto *ParallelRegion = dyn_cast<Function>(
5147	Val: CB.getArgOperand(i: ParallelRegionOpArgNo)->stripPointerCasts());
5148	if (!ParallelRegion)
5149	return false;
5150
5151	ReachedKnownParallelRegions.insert(Elem: &CB);
5152	/// Check nested parallelism
5153	auto *FnAA = A.getAAFor<AAKernelInfo>(
5154	QueryingAA: *this, IRP: IRPosition::function(F: *ParallelRegion), DepClass: DepClassTy::OPTIONAL);
5155	NestedParallelism \|= !FnAA \|\| !FnAA->getState().isValidState() \|\|
5156	!FnAA->ReachedKnownParallelRegions.empty() \|\|
5157	!FnAA->ReachedKnownParallelRegions.isValidState() \|\|
5158	!FnAA->ReachedUnknownParallelRegions.isValidState() \|\|
5159	!FnAA->ReachedUnknownParallelRegions.empty();
5160	return true;
5161	}
5162	};
5163
5164	struct AAFoldRuntimeCall
5165	: public StateWrapper<BooleanState, AbstractAttribute> {
5166	using Base = StateWrapper<BooleanState, AbstractAttribute>;
5167
5168	AAFoldRuntimeCall(const IRPosition &IRP, Attributor &A) : Base (IRP) {}
5169
5170	/// Statistics are tracked as part of manifest for now.
5171	void trackStatistics() const override {}
5172
5173	/// Create an abstract attribute biew for the position \p IRP.
5174	static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP,
5175	Attributor &A);
5176
5177	/// See AbstractAttribute::getName()
5178	const std::string getName() const override { return "AAFoldRuntimeCall"; }
5179
5180	/// See AbstractAttribute::getIdAddr()
5181	const char getIdAddr() const* override { return &ID; }
5182
5183	/// This function should return true if the type of the \p AA is
5184	/// AAFoldRuntimeCall
5185	static bool classof(const AbstractAttribute *AA) {
5186	return (AA->getIdAddr() == &ID);
5187	}
5188
5189	static const char ID;
5190	};
5191
5192	struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
5193	AAFoldRuntimeCallCallSiteReturned(const IRPosition &IRP, Attributor &A)
5194	: AAFoldRuntimeCall (IRP, A) {}
5195
5196	/// See AbstractAttribute::getAsStr()
5197	const std::string getAsStr(Attributor ) const* override {
5198	if (!isValidState())
5199	return "<invalid>";
5200
5201	std::string Str("simplified value: ");
5202
5203	if (!SimplifiedValue)
5204	return Str + std::string ("none");
5205
5206	if (!*SimplifiedValue)
5207	return Str + std::string ("nullptr");
5208
5209	if (ConstantInt CI = dyn_cast<ConstantInt>(Val: SimplifiedValue))
5210	return Str + std::to_string(val: CI->getSExtValue());
5211
5212	return Str + std::string ("unknown");
5213	}
5214
5215	void initialize(Attributor &A) override {
5216	if (DisableOpenMPOptFolding)
5217	indicatePessimisticFixpoint();
5218
5219	Function *Callee = getAssociatedFunction();
5220
5221	auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
5222	const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Val: Callee);
5223	assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
5224	"Expected a known OpenMP runtime function");
5225
5226	RFKind = It ->getSecond();
5227
5228	CallBase &CB = cast<CallBase>(Val&: getAssociatedValue());
5229	A.registerSimplificationCallback(
5230	IRP: IRPosition::callsite_returned(CB),
5231	CB: [&](const IRPosition &IRP, const AbstractAttribute *AA,
5232	bool &UsedAssumedInformation) -> std::optional<Value *> {
5233	assert((isValidState() \|\|
5234	(SimplifiedValue && SimplifiedValue == nullptr*)) &&
5235	"Unexpected invalid state!");
5236
5237	if (!isAtFixpoint()) {
5238	UsedAssumedInformation = true;
5239	if (AA)
5240	A.recordDependence(FromAA: *this, ToAA: *AA, DepClass: DepClassTy::OPTIONAL);
5241	}
5242	return SimplifiedValue;
5243	});
5244	}
5245
5246	ChangeStatus updateImpl(Attributor &A) override {
5247	ChangeStatus Changed = ChangeStatus::UNCHANGED;
5248	switch (RFKind) {
5249	case OMPRTL___kmpc_is_spmd_exec_mode:
5250	Changed \|= foldIsSPMDExecMode(A);
5251	break;
5252	case OMPRTL___kmpc_parallel_level:
5253	Changed \|= foldParallelLevel(A);
5254	break;
5255	case OMPRTL___kmpc_get_hardware_num_threads_in_block:
5256	Changed = Changed \| foldKernelFnAttribute(A, Attr: "omp_target_thread_limit");
5257	break;
5258	case OMPRTL___kmpc_get_hardware_num_blocks:
5259	Changed = Changed \| foldKernelFnAttribute(A, Attr: "omp_target_num_teams");
5260	break;
5261	default:
5262	llvm_unreachable("Unhandled OpenMP runtime function!");
5263	}
5264
5265	return Changed;
5266	}
5267
5268	ChangeStatus manifest(Attributor &A) override {
5269	ChangeStatus Changed = ChangeStatus::UNCHANGED;
5270
5271	if (SimplifiedValue && *SimplifiedValue) {
5272	Instruction &I = *getCtxI();
5273	A.changeAfterManifest(IRP: IRPosition::inst(I), NV&: **SimplifiedValue);
5274	A.deleteAfterManifest(I);
5275
5276	CallBase *CB = dyn_cast<CallBase>(Val: &I);
5277	auto Remark = [&](OptimizationRemark OR) {
5278	if (auto C = dyn_cast<ConstantInt>(Val: SimplifiedValue))
5279	return OR << "Replacing OpenMP runtime call "
5280	<< CB->getCalledFunction()->getName() << " with "
5281	<< ore::NV ("FoldedValue", C->getZExtValue()) << ".";
5282	return OR << "Replacing OpenMP runtime call "
5283	<< CB->getCalledFunction()->getName() << ".";
5284	};
5285
5286	if (CB && EnableVerboseRemarks)
5287	A.emitRemark<OptimizationRemark>(I: CB, RemarkName: "OMP180", RemarkCB&: Remark);
5288
5289	LLVM_DEBUG(dbgs() << TAG << "Replacing runtime call: " << I << " with "
5290	<< **SimplifiedValue << "\n");
5291
5292	Changed = ChangeStatus::CHANGED;
5293	}
5294
5295	return Changed;
5296	}
5297
5298	ChangeStatus indicatePessimisticFixpoint() override {
5299	SimplifiedValue = nullptr;
5300	return AAFoldRuntimeCall::indicatePessimisticFixpoint();
5301	}
5302
5303	private:
5304	/// Fold __kmpc_is_spmd_exec_mode into a constant if possible.
5305	ChangeStatus foldIsSPMDExecMode(Attributor &A) {
5306	std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5307
5308	unsigned AssumedSPMDCount = `0`, KnownSPMDCount = `0`;
5309	unsigned AssumedNonSPMDCount = `0`, KnownNonSPMDCount = `0`;
5310	auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5311	QueryingAA: *this, IRP: IRPosition::function(F: *getAnchorScope()), DepClass: DepClassTy::REQUIRED);
5312
5313	if (!CallerKernelInfoAA \|\|
5314	!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5315	return indicatePessimisticFixpoint();
5316
5317	for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5318	auto AA = A.getAAFor<AAKernelInfo>(QueryingAA: this, IRP: IRPosition::function(F: *K),
5319	DepClass: DepClassTy::REQUIRED);
5320
5321	if (!AA \|\| !AA->isValidState()) {
5322	SimplifiedValue = nullptr;
5323	return indicatePessimisticFixpoint();
5324	}
5325
5326	if (AA->SPMDCompatibilityTracker.isAssumed()) {
5327	if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5328	++KnownSPMDCount;
5329	else
5330	++AssumedSPMDCount;
5331	} else {
5332	if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5333	++KnownNonSPMDCount;
5334	else
5335	++AssumedNonSPMDCount;
5336	}
5337	}
5338
5339	if ((AssumedSPMDCount + KnownSPMDCount) &&
5340	(AssumedNonSPMDCount + KnownNonSPMDCount))
5341	return indicatePessimisticFixpoint();
5342
5343	auto &Ctx = getAnchorValue().getContext();
5344	if (KnownSPMDCount \|\| AssumedSPMDCount) {
5345	assert(KnownNonSPMDCount == `0` && AssumedNonSPMDCount == `0` &&
5346	"Expected only SPMD kernels!");
5347	// All reaching kernels are in SPMD mode. Update all function calls to
5348	// __kmpc_is_spmd_exec_mode to 1.
5349	SimplifiedValue = ConstantInt::get(Ty: Type::getInt8Ty(C&: Ctx), V: true);
5350	} else if (KnownNonSPMDCount \|\| AssumedNonSPMDCount) {
5351	assert(KnownSPMDCount == `0` && AssumedSPMDCount == `0` &&
5352	"Expected only non-SPMD kernels!");
5353	// All reaching kernels are in non-SPMD mode. Update all function
5354	// calls to __kmpc_is_spmd_exec_mode to 0.
5355	SimplifiedValue = ConstantInt::get(Ty: Type::getInt8Ty(C&: Ctx), V: false);
5356	} else {
5357	// We have empty reaching kernels, therefore we cannot tell if the
5358	// associated call site can be folded. At this moment, SimplifiedValue
5359	// must be none.
5360	assert(!SimplifiedValue && "SimplifiedValue should be none");
5361	}
5362
5363	return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5364	: ChangeStatus::CHANGED;
5365	}
5366
5367	/// Fold __kmpc_parallel_level into a constant if possible.
5368	ChangeStatus foldParallelLevel(Attributor &A) {
5369	std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5370
5371	auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5372	QueryingAA: *this, IRP: IRPosition::function(F: *getAnchorScope()), DepClass: DepClassTy::REQUIRED);
5373
5374	if (!CallerKernelInfoAA \|\|
5375	!CallerKernelInfoAA->ParallelLevels.isValidState())
5376	return indicatePessimisticFixpoint();
5377
5378	if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5379	return indicatePessimisticFixpoint();
5380
5381	if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
5382	assert(!SimplifiedValue &&
5383	"SimplifiedValue should keep none at this point");
5384	return ChangeStatus::UNCHANGED;
5385	}
5386
5387	unsigned AssumedSPMDCount = `0`, KnownSPMDCount = `0`;
5388	unsigned AssumedNonSPMDCount = `0`, KnownNonSPMDCount = `0`;
5389	for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5390	auto AA = A.getAAFor<AAKernelInfo>(QueryingAA: this, IRP: IRPosition::function(F: *K),
5391	DepClass: DepClassTy::REQUIRED);
5392	if (!AA \|\| !AA->SPMDCompatibilityTracker.isValidState())
5393	return indicatePessimisticFixpoint();
5394
5395	if (AA->SPMDCompatibilityTracker.isAssumed()) {
5396	if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5397	++KnownSPMDCount;
5398	else
5399	++AssumedSPMDCount;
5400	} else {
5401	if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5402	++KnownNonSPMDCount;
5403	else
5404	++AssumedNonSPMDCount;
5405	}
5406	}
5407
5408	if ((AssumedSPMDCount + KnownSPMDCount) &&
5409	(AssumedNonSPMDCount + KnownNonSPMDCount))
5410	return indicatePessimisticFixpoint();
5411
5412	auto &Ctx = getAnchorValue().getContext();
5413	// If the caller can only be reached by SPMD kernel entries, the parallel
5414	// level is 1. Similarly, if the caller can only be reached by non-SPMD
5415	// kernel entries, it is 0.
5416	if (AssumedSPMDCount \|\| KnownSPMDCount) {
5417	assert(KnownNonSPMDCount == `0` && AssumedNonSPMDCount == `0` &&
5418	"Expected only SPMD kernels!");
5419	SimplifiedValue = ConstantInt::get(Ty: Type::getInt8Ty(C&: Ctx), V: `1`);
5420	} else {
5421	assert(KnownSPMDCount == `0` && AssumedSPMDCount == `0` &&
5422	"Expected only non-SPMD kernels!");
5423	SimplifiedValue = ConstantInt::get(Ty: Type::getInt8Ty(C&: Ctx), V: `0`);
5424	}
5425	return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5426	: ChangeStatus::CHANGED;
5427	}
5428
5429	ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {
5430	// Specialize only if all the calls agree with the attribute constant value
5431	int32_t CurrentAttrValue = -`1`;
5432	std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5433
5434	auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5435	QueryingAA: *this, IRP: IRPosition::function(F: *getAnchorScope()), DepClass: DepClassTy::REQUIRED);
5436
5437	if (!CallerKernelInfoAA \|\|
5438	!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5439	return indicatePessimisticFixpoint();
5440
5441	// Iterate over the kernels that reach this function
5442	for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5443	int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Kind: Attr, Default: -`1`);
5444
5445	if (NextAttrVal == -`1` \|\|
5446	(CurrentAttrValue != -`1` && CurrentAttrValue != NextAttrVal))
5447	return indicatePessimisticFixpoint();
5448	CurrentAttrValue = NextAttrVal;
5449	}
5450
5451	if (CurrentAttrValue != -`1`) {
5452	auto &Ctx = getAnchorValue().getContext();
5453	SimplifiedValue =
5454	ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: CurrentAttrValue);
5455	}
5456	return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5457	: ChangeStatus::CHANGED;
5458	}
5459
5460	/// An optional value the associated value is assumed to fold to. That is, we
5461	/// assume the associated value (which is a call) can be replaced by this
5462	/// simplified value.
5463	std::optional<Value *> SimplifiedValue;
5464
5465	/// The runtime function kind of the callee of the associated call site.
5466	RuntimeFunction RFKind;
5467	};
5468
5469	} // namespace
5470
5471	/// Register folding callsite
5472	void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) {
5473	auto &RFI = OMPInfoCache.RFIs [RF];
5474	RFI.foreachUse(SCC, CB: [&](Use &U, Function &F) {
5475	CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, RFI: &RFI);
5476	if (!CI)
5477	return false;
5478	A.getOrCreateAAFor<AAFoldRuntimeCall>(
5479	IRP: IRPosition::callsite_returned(CB: CI), /* QueryingAA / nullptr,
5480	DepClass: DepClassTy::NONE, / ForceUpdate / false,
5481	/ UpdateAfterInit / false);
5482	return false;
5483	});
5484	}
5485
5486	void OpenMPOpt::registerAAs(bool IsModulePass) {
5487	if (SCC.empty())
5488	return;
5489
5490	if (IsModulePass) {
5491	// Ensure we create the AAKernelInfo AAs first and without triggering an
5492	// update. This will make sure we register all value simplification
5493	// callbacks before any other AA has the chance to create an AAValueSimplify
5494	// or similar.
5495	auto CreateKernelInfoCB = [&](Use &, Function &Kernel) {
5496	A.getOrCreateAAFor<AAKernelInfo>(
5497	IRP: IRPosition::function(F: Kernel), / QueryingAA / nullptr,
5498	DepClass: DepClassTy::NONE, / ForceUpdate / false,
5499	/ UpdateAfterInit / false);
5500	return false;
5501	};
5502	OMPInformationCache::RuntimeFunctionInfo &InitRFI =
5503	OMPInfoCache.RFIs [OMPRTL___kmpc_target_init];
5504	InitRFI.foreachUse(SCC, CB: CreateKernelInfoCB);
5505
5506	registerFoldRuntimeCall(RF: OMPRTL___kmpc_is_spmd_exec_mode);
5507	registerFoldRuntimeCall(RF: OMPRTL___kmpc_parallel_level);
5508	registerFoldRuntimeCall(RF: OMPRTL___kmpc_get_hardware_num_threads_in_block);
5509	registerFoldRuntimeCall(RF: OMPRTL___kmpc_get_hardware_num_blocks);
5510	}
5511
5512	// Create CallSite AA for all Getters.
5513	if (DeduceICVValues) {
5514	for (int Idx = `0`; Idx < OMPInfoCache.ICVs.size() - `1`; ++Idx) {
5515	auto ICVInfo = OMPInfoCache.ICVs [static_cast<InternalControlVar>(Idx)];
5516
5517	auto &GetterRFI = OMPInfoCache.RFIs [ICVInfo.Getter];
5518
5519	auto CreateAA = [&](Use &U, Function &Caller) {
5520	CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, RFI: &GetterRFI);
5521	if (!CI)
5522	return false;
5523
5524	auto &CB = cast<CallBase>(Val&: *CI);
5525
5526	IRPosition CBPos = IRPosition::callsite_function(CB);
5527	A.getOrCreateAAFor<AAICVTracker>(IRP: CBPos);
5528	return false;
5529	};
5530
5531	GetterRFI.foreachUse(SCC, CB: CreateAA);
5532	}
5533	}
5534
5535	// Create an ExecutionDomain AA for every function and a HeapToStack AA for
5536	// every function if there is a device kernel.
5537	if (!isOpenMPDevice(M))
5538	return;
5539
5540	for (auto *F : SCC) {
5541	if (F->isDeclaration())
5542	continue;
5543
5544	// We look at internal functions only on-demand but if any use is not a
5545	// direct call or outside the current set of analyzed functions, we have
5546	// to do it eagerly.
5547	if (F->hasLocalLinkage()) {
5548	if (llvm::all_of(Range: F->uses(), P: [this](const Use &U) {
5549	const auto *CB = dyn_cast<CallBase>(Val: U.getUser());
5550	return CB && CB->isCallee(U: &U) &&
5551	A.isRunOn(Fn: const_cast<Function *>(CB->getCaller()));
5552	}))
5553	continue;
5554	}
5555	registerAAsForFunction(A, F: *F);
5556	}
5557	}
5558
5559	void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {
5560	if (!DisableOpenMPOptDeglobalization)
5561	A.getOrCreateAAFor<AAHeapToShared>(IRP: IRPosition::function(F));
5562	A.getOrCreateAAFor<AAExecutionDomain>(IRP: IRPosition::function(F));
5563	if (!DisableOpenMPOptDeglobalization)
5564	A.getOrCreateAAFor<AAHeapToStack>(IRP: IRPosition::function(F));
5565	if (F.hasFnAttribute(Kind: Attribute::Convergent))
5566	A.getOrCreateAAFor<AANonConvergent>(IRP: IRPosition::function(F));
5567
5568	for (auto &I : instructions(F)) {
5569	if (auto *LI = dyn_cast<LoadInst>(Val: &I)) {
5570	bool UsedAssumedInformation = false;
5571	A.getAssumedSimplified(V: IRPosition::value(V: LI), /* AA / nullptr,
5572	UsedAssumedInformation, S: AA::Interprocedural);
5573	continue;
5574	}
5575	if (auto *CI = dyn_cast<CallBase>(Val: &I)) {
5576	if (CI->isIndirectCall())
5577	A.getOrCreateAAFor<AAIndirectCallInfo>(
5578	IRP: IRPosition::callsite_function(CB: *CI));
5579	}
5580	if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
5581	A.getOrCreateAAFor<AAIsDead>(IRP: IRPosition::value(V: *SI));
5582	continue;
5583	}
5584	if (auto *FI = dyn_cast<FenceInst>(Val: &I)) {
5585	A.getOrCreateAAFor<AAIsDead>(IRP: IRPosition::value(V: *FI));
5586	continue;
5587	}
5588	if (auto *II = dyn_cast<IntrinsicInst>(Val: &I)) {
5589	if (II->getIntrinsicID() == Intrinsic::assume) {
5590	A.getOrCreateAAFor<AAPotentialValues>(
5591	IRP: IRPosition::value(V: *II->getArgOperand(i: `0`)));
5592	continue;
5593	}
5594	}
5595	}
5596	}
5597
5598	const char AAICVTracker::ID = `0`;
5599	const char AAKernelInfo::ID = `0`;
5600	const char AAExecutionDomain::ID = `0`;
5601	const char AAHeapToShared::ID = `0`;
5602	const char AAFoldRuntimeCall::ID = `0`;
5603
5604	AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
5605	Attributor &A) {
5606	AAICVTracker AA = nullptr*;
5607	switch (IRP.getPositionKind()) {
5608	case IRPosition::IRP_INVALID:
5609	case IRPosition::IRP_FLOAT:
5610	case IRPosition::IRP_ARGUMENT:
5611	case IRPosition::IRP_CALL_SITE_ARGUMENT:
5612	llvm_unreachable("ICVTracker can only be created for function position!");
5613	case IRPosition::IRP_RETURNED:
5614	AA = new (A.Allocator) AAICVTrackerFunctionReturned (IRP, A);
5615	break;
5616	case IRPosition::IRP_CALL_SITE_RETURNED:
5617	AA = new (A.Allocator) AAICVTrackerCallSiteReturned (IRP, A);
5618	break;
5619	case IRPosition::IRP_CALL_SITE:
5620	AA = new (A.Allocator) AAICVTrackerCallSite (IRP, A);
5621	break;
5622	case IRPosition::IRP_FUNCTION:
5623	AA = new (A.Allocator) AAICVTrackerFunction (IRP, A);
5624	break;
5625	}
5626
5627	return *AA;
5628	}
5629
5630	AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,
5631	Attributor &A) {
5632	AAExecutionDomainFunction AA = nullptr*;
5633	switch (IRP.getPositionKind()) {
5634	case IRPosition::IRP_INVALID:
5635	case IRPosition::IRP_FLOAT:
5636	case IRPosition::IRP_ARGUMENT:
5637	case IRPosition::IRP_CALL_SITE_ARGUMENT:
5638	case IRPosition::IRP_RETURNED:
5639	case IRPosition::IRP_CALL_SITE_RETURNED:
5640	case IRPosition::IRP_CALL_SITE:
5641	llvm_unreachable(
5642	"AAExecutionDomain can only be created for function position!");
5643	case IRPosition::IRP_FUNCTION:
5644	AA = new (A.Allocator) AAExecutionDomainFunction (IRP, A);
5645	break;
5646	}
5647
5648	return *AA;
5649	}
5650
5651	AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
5652	Attributor &A) {
5653	AAHeapToSharedFunction AA = nullptr*;
5654	switch (IRP.getPositionKind()) {
5655	case IRPosition::IRP_INVALID:
5656	case IRPosition::IRP_FLOAT:
5657	case IRPosition::IRP_ARGUMENT:
5658	case IRPosition::IRP_CALL_SITE_ARGUMENT:
5659	case IRPosition::IRP_RETURNED:
5660	case IRPosition::IRP_CALL_SITE_RETURNED:
5661	case IRPosition::IRP_CALL_SITE:
5662	llvm_unreachable(
5663	"AAHeapToShared can only be created for function position!");
5664	case IRPosition::IRP_FUNCTION:
5665	AA = new (A.Allocator) AAHeapToSharedFunction (IRP, A);
5666	break;
5667	}
5668
5669	return *AA;
5670	}
5671
5672	AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
5673	Attributor &A) {
5674	AAKernelInfo AA = nullptr*;
5675	switch (IRP.getPositionKind()) {
5676	case IRPosition::IRP_INVALID:
5677	case IRPosition::IRP_FLOAT:
5678	case IRPosition::IRP_ARGUMENT:
5679	case IRPosition::IRP_RETURNED:
5680	case IRPosition::IRP_CALL_SITE_RETURNED:
5681	case IRPosition::IRP_CALL_SITE_ARGUMENT:
5682	llvm_unreachable("KernelInfo can only be created for function position!");
5683	case IRPosition::IRP_CALL_SITE:
5684	AA = new (A.Allocator) AAKernelInfoCallSite (IRP, A);
5685	break;
5686	case IRPosition::IRP_FUNCTION:
5687	AA = new (A.Allocator) AAKernelInfoFunction (IRP, A);
5688	break;
5689	}
5690
5691	return *AA;
5692	}
5693
5694	AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
5695	Attributor &A) {
5696	AAFoldRuntimeCall AA = nullptr*;
5697	switch (IRP.getPositionKind()) {
5698	case IRPosition::IRP_INVALID:
5699	case IRPosition::IRP_FLOAT:
5700	case IRPosition::IRP_ARGUMENT:
5701	case IRPosition::IRP_RETURNED:
5702	case IRPosition::IRP_FUNCTION:
5703	case IRPosition::IRP_CALL_SITE:
5704	case IRPosition::IRP_CALL_SITE_ARGUMENT:
5705	llvm_unreachable("KernelInfo can only be created for call site position!");
5706	case IRPosition::IRP_CALL_SITE_RETURNED:
5707	AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned (IRP, A);
5708	break;
5709	}
5710
5711	return *AA;
5712	}
5713
5714	PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
5715	if (!containsOpenMP(M))
5716	return PreservedAnalyses::all();
5717	if (DisableOpenMPOptimizations)
5718	return PreservedAnalyses::all();
5719
5720	FunctionAnalysisManager &FAM =
5721	AM.getResult<FunctionAnalysisManagerModuleProxy>(IR&: M).getManager();
5722	KernelSet Kernels = getDeviceKernels(M);
5723
5724	if (PrintModuleBeforeOptimizations)
5725	LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt Module Pass:\n" << M);
5726
5727	auto IsCalled = [&](Function &F) {
5728	if (Kernels.contains(key: &F))
5729	return true;
5730	for (const User *U : F.users())
5731	if (!isa<BlockAddress>(Val: U))
5732	return true;
5733	return false;
5734	};
5735
5736	auto EmitRemark = [&](Function &F) {
5737	auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
5738	ORE.emit(RemarkBuilder: [&]() {
5739	OptimizationRemarkAnalysis ORA(DEBUG_TYPE, "OMP140", &F);
5740	return ORA << "Could not internalize function. "
5741	<< "Some optimizations may not be possible. [OMP140]";
5742	});
5743	};
5744
5745	bool Changed = false;
5746
5747	// Create internal copies of each function if this is a kernel Module. This
5748	// allows iterprocedural passes to see every call edge.
5749	DenseMap<Function , Function > InternalizedMap;
5750	if (isOpenMPDevice(M)) {
5751	SmallPtrSet<Function *, `16`> InternalizeFns;
5752	for (Function &F : M)
5753	if (!F.isDeclaration() && !Kernels.contains(key: &F) && IsCalled (F) &&
5754	!DisableInternalization) {
5755	if (Attributor::isInternalizable(F)) {
5756	InternalizeFns.insert(Ptr: &F);
5757	} else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Kind: Attribute::Cold)) {
5758	EmitRemark (F);
5759	}
5760	}
5761
5762	Changed \|=
5763	Attributor::internalizeFunctions(FnSet&: InternalizeFns, FnMap&: InternalizedMap);
5764	}
5765
5766	// Look at every function in the Module unless it was internalized.
5767	SetVector<Function *> Functions;
5768	SmallVector<Function *, `16`> SCC;
5769	for (Function &F : M)
5770	if (!F.isDeclaration() && !InternalizedMap.lookup(Val: &F)) {
5771	SCC.push_back(Elt: &F);
5772	Functions.insert(X: &F);
5773	}
5774
5775	if (SCC.empty())
5776	return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
5777
5778	AnalysisGetter AG(FAM);
5779
5780	auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
5781	return FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: *F);
5782	};
5783
5784	BumpPtrAllocator Allocator;
5785	CallGraphUpdater CGUpdater;
5786
5787	bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink \|\|
5788	LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
5789	OMPInformationCache InfoCache(M, AG, Allocator, /CGSCC/ nullptr, PostLink);
5790
5791	unsigned MaxFixpointIterations =
5792	(isOpenMPDevice(M)) ? SetFixpointIterations : `32`;
5793
5794	AttributorConfig AC(CGUpdater);
5795	AC.DefaultInitializeLiveInternals = false;
5796	AC.IsModulePass = true;
5797	AC.RewriteSignatures = false;
5798	AC.MaxFixpointIterations = MaxFixpointIterations;
5799	AC.OREGetter = OREGetter;
5800	AC.PassName = DEBUG_TYPE;
5801	AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;
5802	AC.IPOAmendableCB = [](const Function &F) {
5803	return F.hasFnAttribute(Kind: "kernel");
5804	};
5805
5806	Attributor A(Functions, InfoCache, AC);
5807
5808	OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
5809	Changed \|= OMPOpt.run(IsModulePass: true);
5810
5811	// Optionally inline device functions for potentially better performance.
5812	if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M))
5813	for (Function &F : M)
5814	if (!F.isDeclaration() && !Kernels.contains(key: &F) &&
5815	!F.hasFnAttribute(Kind: Attribute::NoInline))
5816	F.addFnAttr(Kind: Attribute::AlwaysInline);
5817
5818	if (PrintModuleAfterOptimizations)
5819	LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n" << M);
5820
5821	if (Changed)
5822	return PreservedAnalyses::none();
5823
5824	return PreservedAnalyses::all();
5825	}
5826
5827	PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
5828	CGSCCAnalysisManager &AM,
5829	LazyCallGraph &CG,
5830	CGSCCUpdateResult &UR) {
5831	if (!containsOpenMP(M&: *C.begin()->getFunction().getParent()))
5832	return PreservedAnalyses::all();
5833	if (DisableOpenMPOptimizations)
5834	return PreservedAnalyses::all();
5835
5836	SmallVector<Function *, `16`> SCC;
5837	// If there are kernels in the module, we have to run on all SCC's.
5838	for (LazyCallGraph::Node &N : C) {
5839	Function *Fn = &N.getFunction();
5840	SCC.push_back(Elt: Fn);
5841	}
5842
5843	if (SCC.empty())
5844	return PreservedAnalyses::all();
5845
5846	Module &M = *C.begin()->getFunction().getParent();
5847
5848	if (PrintModuleBeforeOptimizations)
5849	LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt CGSCC Pass:\n" << M);
5850
5851	KernelSet Kernels = getDeviceKernels(M);
5852
5853	FunctionAnalysisManager &FAM =
5854	AM.getResult<FunctionAnalysisManagerCGSCCProxy>(IR&: C, ExtraArgs&: CG).getManager();
5855
5856	AnalysisGetter AG(FAM);
5857
5858	auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
5859	return FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: *F);
5860	};
5861
5862	BumpPtrAllocator Allocator;
5863	CallGraphUpdater CGUpdater;
5864	CGUpdater.initialize(LCG&: CG, SCC&: C, AM, UR);
5865
5866	bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink \|\|
5867	LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
5868	SetVector<Function *> Functions(SCC.begin(), SCC.end());
5869	OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
5870	/CGSCC/ &Functions, PostLink);
5871
5872	unsigned MaxFixpointIterations =
5873	(isOpenMPDevice(M)) ? SetFixpointIterations : `32`;
5874
5875	AttributorConfig AC(CGUpdater);
5876	AC.DefaultInitializeLiveInternals = false;
5877	AC.IsModulePass = false;
5878	AC.RewriteSignatures = false;
5879	AC.MaxFixpointIterations = MaxFixpointIterations;
5880	AC.OREGetter = OREGetter;
5881	AC.PassName = DEBUG_TYPE;
5882	AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;
5883
5884	Attributor A(Functions, InfoCache, AC);
5885
5886	OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
5887	bool Changed = OMPOpt.run(IsModulePass: false);
5888
5889	if (PrintModuleAfterOptimizations)
5890	LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M);
5891
5892	if (Changed)
5893	return PreservedAnalyses::none();
5894
5895	return PreservedAnalyses::all();
5896	}
5897
5898	bool llvm::omp::isOpenMPKernel(Function &Fn) {
5899	return Fn.hasFnAttribute(Kind: "kernel");
5900	}
5901
5902	KernelSet llvm::omp::getDeviceKernels(Module &M) {
5903	// TODO: Create a more cross-platform way of determining device kernels.
5904	NamedMDNode *MD = M.getNamedMetadata(Name: "nvvm.annotations");
5905	KernelSet Kernels;
5906
5907	if (!MD)
5908	return Kernels;
5909
5910	for (auto *Op : MD->operands()) {
5911	if (Op->getNumOperands() < `2`)
5912	continue;
5913	MDString *KindID = dyn_cast<MDString>(Val: Op->getOperand(I: `1`));
5914	if (!KindID \|\| KindID->getString() != "kernel")
5915	continue;
5916
5917	Function *KernelFn =
5918	mdconst::dyn_extract_or_null<Function>(MD: Op->getOperand(I: `0`));
5919	if (!KernelFn)
5920	continue;
5921
5922	// We are only interested in OpenMP target regions. Others, such as kernels
5923	// generated by CUDA but linked together, are not interesting to this pass.
5924	if (isOpenMPKernel(Fn&: *KernelFn)) {
5925	++NumOpenMPTargetRegionKernels;
5926	Kernels.insert(X: KernelFn);
5927	} else
5928	++NumNonOpenMPTargetRegionKernels;
5929	}
5930
5931	return Kernels;
5932	}
5933
5934	bool llvm::omp::containsOpenMP(Module &M) {
5935	Metadata *MD = M.getModuleFlag(Key: "openmp");
5936	if (!MD)
5937	return false;
5938
5939	return true;
5940	}
5941
5942	bool llvm::omp::isOpenMPDevice(Module &M) {
5943	Metadata *MD = M.getModuleFlag(Key: "openmp-device");
5944	if (!MD)
5945	return false;
5946
5947	return true;
5948	}
5949

Browse the source code of llvm_projects/llvm/lib/Transforms/IPO/OpenMPOpt.cpp