//===-- NVPTXLowerArgs.cpp - Lower arguments ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
// Arguments to kernel and device functions are passed via param space,
// which imposes certain restrictions:
// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
//
// Kernel parameters are read-only and accessible only via ld.param
// instruction, directly or via a pointer.
//
// Device function parameters are directly accessible via
// ld.param/st.param, but taking the address of one returns a pointer
// to a copy created in local space which *can't* be used with
// ld.param/st.param.
//
// Copying a byval struct into local memory in IR allows us to enforce
// the param space restrictions, gives the rest of IR a pointer w/o
// param space restrictions, and gives us an opportunity to eliminate
// the copy.
//
// Pointer arguments to kernel functions need more work to be lowered:
//
// 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the
//    global address space. This allows later optimizations to emit
//    ld.global.*/st.global.* for accessing these pointer arguments. For
//    example,
//
//    define void @foo(float* %input) {
//      %v = load float, float* %input, align 4
//      ...
//    }
//
//    becomes
//
//    define void @foo(float* %input) {
//      %input2 = addrspacecast float* %input to float addrspace(1)*
//      %input3 = addrspacecast float addrspace(1)* %input2 to float*
//      %v = load float, float* %input3, align 4
//      ...
//    }
//
//    Later, NVPTXInferAddressSpaces will optimize it to
//
//    define void @foo(float* %input) {
//      %input2 = addrspacecast float* %input to float addrspace(1)*
//      %v = load float, float addrspace(1)* %input2, align 4
//      ...
//    }
//
// 2. Convert byval kernel parameters to pointers in the param address space
//    (so that NVPTX emits ld/st.param). Convert pointers *within* a byval
//    kernel parameter to pointers in the global address space. This allows
//    NVPTX to emit ld/st.global.
//
//    struct S {
//      int *x;
//      int *y;
//    };
//    __global__ void foo(S s) {
//      int *b = s.y;
//      // use b
//    }
//    "b" points to the global address space. At the IR level,
//
//    define void @foo(ptr byval %input) {
//      %b_ptr = getelementptr {ptr, ptr}, ptr %input, i64 0, i32 1
//      %b = load ptr, ptr %b_ptr
//      ; use %b
//    }
//
//    becomes
//
//    define void @foo(ptr byval %input) {
//      %b_param = addrspacecast ptr %input to ptr addrspace(101)
//      %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1
//      %b = load ptr, ptr addrspace(101) %b_ptr
//      %b_global = addrspacecast ptr %b to ptr addrspace(1)
//      ; use %b_global
//    }
//
// Create a local copy of kernel byval parameters that are used in a way that
// *might* mutate the parameter, by storing the argument in an alloca.
// Mutations to "grid_constant" parameters are undefined behaviour and do not
// require local copies.
//
//    define void @foo(ptr byval(%struct.s) align 4 %input) {
//      store i32 42, ptr %input
//      ret void
//    }
//
//    becomes
//
//    define void @foo(ptr byval(%struct.s) align 4 %input) #1 {
//      %input1 = alloca %struct.s, align 4
//      %input2 = addrspacecast ptr %input to ptr addrspace(101)
//      %input3 = load %struct.s, ptr addrspace(101) %input2, align 4
//      store %struct.s %input3, ptr %input1, align 4
//      store i32 42, ptr %input1, align 4
//      ret void
//    }
//
// If %input were passed to a device function, or written to memory,
// conservatively assume that %input gets mutated, and create a local copy.
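//
// For instance (an illustrative sketch, with a hypothetical callee @bar),
//
//    define void @foo(ptr byval(%struct.s) align 4 %input) {
//      %call = call i32 @bar(ptr %input)  ; %input may be mutated by @bar
//      ret void
//    }
//
// is rewritten in the same way: the parameter is loaded into a local alloca
// and all uses of %input, including the call operand, are redirected to the
// alloca.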
//
// Convert param-space pointers to grid_constant byval kernel parameters, when
// they are passed into calls (device functions, intrinsics, inline asm) or
// otherwise "escape" (into stores/ptrtoints), to the generic address space,
// using the `nvvm.ptr.param.to.gen` intrinsic, so that NVPTX emits cvta.param
// (available on sm_70+).
//
//    define void @foo(ptr byval(%struct.s) %input) {
//      ; %input is a grid_constant
//      %call = call i32 @escape(ptr %input)
//      ret void
//    }
//
//    becomes
//
//    define void @foo(ptr byval(%struct.s) %input) {
//      %input1 = addrspacecast ptr %input to ptr addrspace(101)
//      ; The following intrinsic converts the pointer to the generic address
//      ; space. We don't use an addrspacecast so that a generic -> param ->
//      ; generic round trip cannot be cancelled out.
//      %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
//      %call = call i32 @escape(ptr %input1.gen)
//      ret void
//    }
//
// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes
// don't cancel the addrspacecast pair this pass emits.
//===----------------------------------------------------------------------===//

#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include <numeric>
#include <queue>

#define DEBUG_TYPE "nvptx-lower-args"

using namespace llvm;

namespace llvm {
void initializeNVPTXLowerArgsPass(PassRegistry &);
}

namespace {
class NVPTXLowerArgs : public FunctionPass {
  bool runOnFunction(Function &F) override;

  bool runOnKernelFunction(const NVPTXTargetMachine &TM, Function &F);
  bool runOnDeviceFunction(const NVPTXTargetMachine &TM, Function &F);

  // handle byval parameters
  void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg);
  // Knowing Ptr must point to the global address space, this function
  // addrspacecasts Ptr to global and then back to generic. This allows
  // NVPTXInferAddressSpaces to fold the global-to-generic cast into
  // loads/stores that appear later.
  void markPointerAsGlobal(Value *Ptr);

public:
  static char ID; // Pass identification, replacement for typeid
  NVPTXLowerArgs() : FunctionPass(ID) {}
  StringRef getPassName() const override {
    return "Lower pointer arguments of CUDA kernels";
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
  }
};
} // namespace

char NVPTXLowerArgs::ID = 1;

INITIALIZE_PASS_BEGIN(NVPTXLowerArgs, "nvptx-lower-args",
                      "Lower arguments (NVPTX)", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
                    "Lower arguments (NVPTX)", false, false)
// =============================================================================
// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
// and we can't guarantee that the only accesses are loads,
// then add the following instructions to the first basic block:
//
// %temp = alloca %struct.x, align 8
// %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)*
// %tv = load %struct.x addrspace(101)* %tempd
// store %struct.x %tv, %struct.x* %temp, align 8
//
// The above code allocates some space in the stack and copies the incoming
// struct from param space to local space.
// Then replace all occurrences of %d by %temp.
//
// If we know that all users are GEPs or loads, replace them with equivalent
// operations on the pointer in parameter AS, so we can access the data using
// ld.param.
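//
// For example (an illustrative sketch; value names are hypothetical), a load
// through a GEP on %d
//
// %b_ptr = getelementptr %struct.x, ptr %d, i64 0, i32 1
// %b = load i32, ptr %b_ptr
//
// is rewritten to use the parameter address space directly:
//
// %d1 = addrspacecast ptr %d to ptr addrspace(101)
// %b_ptr = getelementptr %struct.x, ptr addrspace(101) %d1, i64 0, i32 1
// %b = load i32, ptr addrspace(101) %b_ptr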
// =============================================================================

// For Loads, replaces the \p OldUse of the pointer with a Use of the same
// pointer in parameter AS.
// For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse
// to generic using cvta.param.
static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
  Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
  assert(I && "OldUse must be in an instruction");
  struct IP {
    Use *OldUse;
    Instruction *OldInstruction;
    Value *NewParam;
  };
  SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
  SmallVector<Instruction *> InstructionsToDelete;

  auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * {
    if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
      LI->setOperand(0, I.NewParam);
      return LI;
    }
    if (auto *GEP = dyn_cast<GetElementPtrInst>(I.OldInstruction)) {
      SmallVector<Value *, 4> Indices(GEP->indices());
      auto *NewGEP = GetElementPtrInst::Create(
          GEP->getSourceElementType(), I.NewParam, Indices, GEP->getName(),
          GEP->getIterator());
      NewGEP->setIsInBounds(GEP->isInBounds());
      return NewGEP;
    }
    if (auto *BC = dyn_cast<BitCastInst>(I.OldInstruction)) {
      auto *NewBCType = PointerType::get(BC->getContext(), ADDRESS_SPACE_PARAM);
      return BitCastInst::Create(BC->getOpcode(), I.NewParam, NewBCType,
                                 BC->getName(), BC->getIterator());
    }
    if (auto *ASC = dyn_cast<AddrSpaceCastInst>(I.OldInstruction)) {
      assert(ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM);
      (void)ASC;
      // Just pass through the argument, the old ASC is no longer needed.
      return I.NewParam;
    }

    if (GridConstant) {
      auto GetParamAddrCastToGeneric =
          [](Value *Addr, Instruction *OriginalUser) -> Value * {
        PointerType *ReturnTy =
            PointerType::get(OriginalUser->getContext(), ADDRESS_SPACE_GENERIC);
        Function *CvtToGen = Intrinsic::getDeclaration(
            OriginalUser->getModule(), Intrinsic::nvvm_ptr_param_to_gen,
            {ReturnTy, PointerType::get(OriginalUser->getContext(),
                                        ADDRESS_SPACE_PARAM)});

        // Cast param address to generic address space
        Value *CvtToGenCall =
            CallInst::Create(CvtToGen, Addr, Addr->getName() + ".gen",
                             OriginalUser->getIterator());
        return CvtToGenCall;
      };

      if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
        I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI));
        return CI;
      }
      if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
        // byval address is being stored, cast it to generic
        if (SI->getValueOperand() == I.OldUse->get())
          SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI));
        return SI;
      }
      if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
        if (PI->getPointerOperand() == I.OldUse->get())
          PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI));
        return PI;
      }
      llvm_unreachable(
          "Instruction unsupported even for grid_constant argument");
    }

    llvm_unreachable("Unsupported instruction");
  };

  while (!ItemsToConvert.empty()) {
    IP I = ItemsToConvert.pop_back_val();
    Value *NewInst = CloneInstInParamAS(I);

    if (NewInst && NewInst != I.OldInstruction) {
      // We've created a new instruction. Queue users of the old instruction to
      // be converted and the instruction itself to be deleted. We can't delete
      // the old instruction yet, because it's still in use by a load somewhere.
      for (Use &U : I.OldInstruction->uses())
        ItemsToConvert.push_back({&U, cast<Instruction>(U.getUser()), NewInst});

      InstructionsToDelete.push_back(I.OldInstruction);
    }
  }

  // Now we know that all argument loads are using addresses in parameter space
  // and we can finally remove the old instructions in generic AS. Instructions
  // scheduled for removal should be processed in reverse order so the ones
  // closest to the load are deleted first. Otherwise they may still be in use.
  // E.g. if we have Value = Load(BitCast(GEP(arg))), InstructionsToDelete will
  // have {GEP, BitCast}. GEP can't be deleted first, because it's still used
  // by the BitCast.
  for (Instruction *I : llvm::reverse(InstructionsToDelete))
    I->eraseFromParent();
}

// Adjust the alignment of arguments passed byval in the .param address space.
// We can increase the alignment of such arguments in a way that ensures we can
// effectively vectorize their loads. We should also traverse all loads from
// the byval pointer and adjust their alignment if they use a known offset.
// Such alignment changes must be kept consistent with the parameter store and
// load in NVPTXTargetLowering::LowerCall.
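//
// For example (an illustrative sketch, assuming the optimized argument
// alignment is 16 and the GEP below resolves to a constant offset of 8):
//
//   %argp = addrspacecast ptr %arg to ptr addrspace(101)
//   %f = getelementptr %struct.x, ptr addrspace(101) %argp, i64 0, i32 2
//   %v0 = load i32, ptr addrspace(101) %argp, align 4 ; raised to gcd(16, 0) = 16
//   %v8 = load i32, ptr addrspace(101) %f, align 4    ; raised to gcd(16, 8) = 8
//
// Loads whose existing alignment already exceeds the computed value are left
// unchanged.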
static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
                                    const NVPTXTargetLowering *TLI) {
  Function *Func = Arg->getParent();
  Type *StructType = Arg->getParamByValType();
  const DataLayout DL(Func->getParent());

  uint64_t NewArgAlign =
      TLI->getFunctionParamOptimizedAlign(Func, StructType, DL).value();
  uint64_t CurArgAlign =
      Arg->getAttribute(Attribute::Alignment).getValueAsInt();

  if (CurArgAlign >= NewArgAlign)
    return;

  LLVM_DEBUG(dbgs() << "Try to use alignment " << NewArgAlign << " instead of "
                    << CurArgAlign << " for " << *Arg << '\n');

  auto NewAlignAttr =
      Attribute::get(Func->getContext(), Attribute::Alignment, NewArgAlign);
  Arg->removeAttr(Attribute::Alignment);
  Arg->addAttr(NewAlignAttr);

  struct Load {
    LoadInst *Inst;
    uint64_t Offset;
  };

  struct LoadContext {
    Value *InitialVal;
    uint64_t Offset;
  };

  SmallVector<Load> Loads;
  std::queue<LoadContext> Worklist;
  Worklist.push({ArgInParamAS, 0});
  bool IsGridConstant = isParamGridConstant(*Arg);

  while (!Worklist.empty()) {
    LoadContext Ctx = Worklist.front();
    Worklist.pop();

    for (User *CurUser : Ctx.InitialVal->users()) {
      if (auto *I = dyn_cast<LoadInst>(CurUser)) {
        Loads.push_back({I, Ctx.Offset});
        continue;
      }

      if (auto *I = dyn_cast<BitCastInst>(CurUser)) {
        Worklist.push({I, Ctx.Offset});
        continue;
      }

      if (auto *I = dyn_cast<GetElementPtrInst>(CurUser)) {
        APInt OffsetAccumulated =
            APInt::getZero(DL.getIndexSizeInBits(ADDRESS_SPACE_PARAM));

        if (!I->accumulateConstantOffset(DL, OffsetAccumulated))
          continue;

        uint64_t OffsetLimit = -1;
        uint64_t Offset = OffsetAccumulated.getLimitedValue(OffsetLimit);
        assert(Offset != OffsetLimit && "Expect Offset less than UINT64_MAX");

        Worklist.push({I, Ctx.Offset + Offset});
        continue;
      }

      // supported for grid_constant
      if (IsGridConstant &&
          (isa<CallInst>(CurUser) || isa<StoreInst>(CurUser) ||
           isa<PtrToIntInst>(CurUser)))
        continue;

      llvm_unreachable("All users must be one of: load, "
                       "bitcast, getelementptr, call, store, ptrtoint");
    }
  }

  for (Load &CurLoad : Loads) {
    Align NewLoadAlign(std::gcd(NewArgAlign, CurLoad.Offset));
    Align CurLoadAlign(CurLoad.Inst->getAlign());
    CurLoad.Inst->setAlignment(std::max(NewLoadAlign, CurLoadAlign));
  }
}

void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
                                      Argument *Arg) {
  bool IsGridConstant = isParamGridConstant(*Arg);
  Function *Func = Arg->getParent();
  BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
  Type *StructType = Arg->getParamByValType();
  assert(StructType && "Missing byval type");

  auto AreSupportedUsers = [&](Value *Start) {
    SmallVector<Value *, 16> ValuesToCheck = {Start};
    auto IsSupportedUse = [IsGridConstant](Value *V) -> bool {
      if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V) || isa<LoadInst>(V))
        return true;
      // ASC to param space are OK, too -- we'll just strip them.
      if (auto *ASC = dyn_cast<AddrSpaceCastInst>(V)) {
        if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
          return true;
      }
      // Simple calls and stores are supported for grid_constants;
      // writes to these pointers are undefined behaviour.
      if (IsGridConstant &&
          (isa<CallInst>(V) || isa<StoreInst>(V) || isa<PtrToIntInst>(V)))
        return true;
      return false;
    };

    while (!ValuesToCheck.empty()) {
      Value *V = ValuesToCheck.pop_back_val();
      if (!IsSupportedUse(V)) {
        LLVM_DEBUG(dbgs() << "Need a "
                          << (isParamGridConstant(*Arg) ? "cast " : "copy ")
                          << "of " << *Arg << " because of " << *V << "\n");
        (void)Arg;
        return false;
      }
      if (!isa<LoadInst>(V) && !isa<CallInst>(V) && !isa<StoreInst>(V) &&
          !isa<PtrToIntInst>(V))
        llvm::append_range(ValuesToCheck, V->users());
    }
    return true;
  };

  if (llvm::all_of(Arg->users(), AreSupportedUsers)) {
    // Convert all loads and intermediate operations to use parameter AS and
    // skip creation of a local copy of the argument.
    SmallVector<Use *, 16> UsesToUpdate;
    for (Use &U : Arg->uses())
      UsesToUpdate.push_back(&U);

    Value *ArgInParamAS = new AddrSpaceCastInst(
        Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
        FirstInst);
    for (Use *U : UsesToUpdate)
      convertToParamAS(U, ArgInParamAS, IsGridConstant);
    LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");

    const auto *TLI =
        cast<NVPTXTargetLowering>(TM.getSubtargetImpl()->getTargetLowering());

    adjustByValArgAlignment(Arg, ArgInParamAS, TLI);

    return;
  }

  const DataLayout &DL = Func->getDataLayout();
  unsigned AS = DL.getAllocaAddrSpace();
  if (isParamGridConstant(*Arg)) {
    // Writes to a grid constant are undefined behaviour. We do not need a
    // temporary copy. When a pointer might have escaped, conservatively replace
    // all of its uses (which might include a device function call) with a cast
    // to the generic address space.
    IRBuilder<> IRB(&Func->getEntryBlock().front());

    // Cast argument to param address space
    auto *CastToParam = cast<AddrSpaceCastInst>(IRB.CreateAddrSpaceCast(
        Arg, IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getName() + ".param"));

    // Cast param address to generic address space. We do not use an
    // addrspacecast to generic here because LLVM considers `Arg` to be in the
    // generic address space, and a `generic -> param` cast followed by a
    // `param -> generic` cast will be folded away. The `param -> generic`
    // intrinsic will be correctly lowered to `cvta.param`.
    Value *CvtToGenCall = IRB.CreateIntrinsic(
        IRB.getPtrTy(ADDRESS_SPACE_GENERIC), Intrinsic::nvvm_ptr_param_to_gen,
        CastToParam, nullptr, CastToParam->getName() + ".gen");

    Arg->replaceAllUsesWith(CvtToGenCall);

    // Do not replace Arg in the cast to param space
    CastToParam->setOperand(0, Arg);
  } else {
    // Otherwise we have to create a temporary copy.
    AllocaInst *AllocA =
        new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
    // Set the alignment to the alignment of the byval parameter. This is
    // because later loads/stores assume that alignment, and we are going to
    // replace the use of the byval parameter with this alloca instruction.
    AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo())
                             .value_or(DL.getPrefTypeAlign(StructType)));
    Arg->replaceAllUsesWith(AllocA);

    Value *ArgInParam = new AddrSpaceCastInst(
        Arg, PointerType::get(Arg->getContext(), ADDRESS_SPACE_PARAM),
        Arg->getName(), FirstInst);
    // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
    // addrspacecast preserves alignment. Since params are constant, this load
    // is definitely not volatile.
    LoadInst *LI =
        new LoadInst(StructType, ArgInParam, Arg->getName(),
                     /*isVolatile=*/false, AllocA->getAlign(), FirstInst);
    new StoreInst(LI, AllocA, FirstInst);
  }
}

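// Emit a global -> generic round trip for a generic pointer that is known to
// point into the global address space. An illustrative sketch of the rewrite
// (value names are hypothetical):
//
//   %p.global = addrspacecast ptr %p to ptr addrspace(1)
//   %p.generic = addrspacecast ptr addrspace(1) %p.global to ptr
//   ; all other uses of %p are redirected to %p.generic, so that
//   ; NVPTXInferAddressSpaces can later fold the cast into ld/st.global.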
void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
  if (Ptr->getType()->getPointerAddressSpace() != ADDRESS_SPACE_GENERIC)
    return;

  // Deciding where to emit the addrspacecast pair.
  BasicBlock::iterator InsertPt;
  if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
    // Insert at the function entry if Ptr is an argument.
    InsertPt = Arg->getParent()->getEntryBlock().begin();
  } else {
    // Insert right after Ptr if Ptr is an instruction.
    InsertPt = ++cast<Instruction>(Ptr)->getIterator();
    assert(InsertPt != InsertPt->getParent()->end() &&
           "We don't call this function with Ptr being a terminator.");
  }

  Instruction *PtrInGlobal = new AddrSpaceCastInst(
      Ptr, PointerType::get(Ptr->getContext(), ADDRESS_SPACE_GLOBAL),
      Ptr->getName(), InsertPt);
  Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),
                                              Ptr->getName(), InsertPt);
  // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal.
  Ptr->replaceAllUsesWith(PtrInGeneric);
  PtrInGlobal->setOperand(0, Ptr);
}

// =============================================================================
// Main function for this pass.
// =============================================================================
bool NVPTXLowerArgs::runOnKernelFunction(const NVPTXTargetMachine &TM,
                                         Function &F) {
  // Copying of byval aggregates + SROA may result in pointers being loaded as
  // integers, followed by inttoptr. We may want to mark those as global, too,
  // but only if the loaded integer is used exclusively for conversion to a
  // pointer with inttoptr.
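  //
  // An illustrative sketch (hypothetical IR): after SROA, a pointer stored in
  // a byval kernel parameter may be read as
  //
  //   %i = load i64, ptr %input.field
  //   %p = inttoptr i64 %i to ptr
  //
  // in which case each inttoptr user (%p here) is marked as global below.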
  auto HandleIntToPtr = [this](Value &V) {
    if (llvm::all_of(V.users(), [](User *U) { return isa<IntToPtrInst>(U); })) {
      SmallVector<User *, 16> UsersToUpdate(V.users());
      for (User *U : UsersToUpdate)
        markPointerAsGlobal(U);
    }
  };
  if (TM.getDrvInterface() == NVPTX::CUDA) {
    // Mark pointers in byval structs as global.
    for (auto &B : F) {
      for (auto &I : B) {
        if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
          if (LI->getType()->isPointerTy() || LI->getType()->isIntegerTy()) {
            Value *UO = getUnderlyingObject(LI->getPointerOperand());
            if (Argument *Arg = dyn_cast<Argument>(UO)) {
              if (Arg->hasByValAttr()) {
                // LI is a load from a pointer within a byval kernel parameter.
                if (LI->getType()->isPointerTy())
                  markPointerAsGlobal(LI);
                else
                  HandleIntToPtr(*LI);
              }
            }
          }
        }
      }
    }
  }

  LLVM_DEBUG(dbgs() << "Lowering kernel args of " << F.getName() << "\n");
  for (Argument &Arg : F.args()) {
    if (Arg.getType()->isPointerTy()) {
      if (Arg.hasByValAttr())
        handleByValParam(TM, &Arg);
      else if (TM.getDrvInterface() == NVPTX::CUDA)
        markPointerAsGlobal(&Arg);
    } else if (Arg.getType()->isIntegerTy() &&
               TM.getDrvInterface() == NVPTX::CUDA) {
      HandleIntToPtr(Arg);
    }
  }
  return true;
}

// Device functions only need to copy byval args into local memory.
bool NVPTXLowerArgs::runOnDeviceFunction(const NVPTXTargetMachine &TM,
                                         Function &F) {
  LLVM_DEBUG(dbgs() << "Lowering function args of " << F.getName() << "\n");
  for (Argument &Arg : F.args())
    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
      handleByValParam(TM, &Arg);
  return true;
}

bool NVPTXLowerArgs::runOnFunction(Function &F) {
  auto &TM = getAnalysis<TargetPassConfig>().getTM<NVPTXTargetMachine>();

  return isKernelFunction(F) ? runOnKernelFunction(TM, F)
                             : runOnDeviceFunction(TM, F);
}

FunctionPass *llvm::createNVPTXLowerArgsPass() { return new NVPTXLowerArgs(); }