1//===----- HipStdPar.cpp - HIP C++ Standard Parallelism Support Passes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// This file implements two passes that enable HIP C++ Standard Parallelism
9// Support:
10//
11// 1. AcceleratorCodeSelection (required): Given that only algorithms are
12// accelerated, and that the accelerated implementation exists in the form of
13// a compute kernel, we assume that only the kernel, and all functions
14// reachable from it, constitute code that the user expects the accelerator
15// to execute. Thus, we identify the set of all functions reachable from
16// kernels, and then remove all unreachable ones. This last part is necessary
17// because it is possible for code that the user did not expect to execute on
18// an accelerator to contain constructs that cannot be handled by the target
19// BE, which cannot be provably demonstrated to be dead code in general, and
20// thus can lead to mis-compilation. The degenerate case of this is when a
21// Module contains no kernels (the parent TU had no algorithm invocations fit
22// for acceleration), which we handle by completely emptying said module.
23// **NOTE**: The above does not handle indirectly reachable functions i.e.
24// it is possible to obtain a case where the target of an indirect
25// call is otherwise unreachable and thus is removed; this
26// restriction is aligned with the current `-hipstdpar` limitations
27// and will be relaxed in the future.
28//
29// 2. AllocationInterposition (required only when on-demand paging is
30// unsupported): Some accelerators or operating systems might not support
31// transparent on-demand paging. Thus, they would only be able to access
32// memory that is allocated by an accelerator-aware mechanism. For such cases
33// the user can opt into enabling allocation / deallocation interposition,
34// whereby we replace calls to known allocation / deallocation functions with
35// calls to runtime implemented equivalents that forward the requests to
36// accelerator-aware interfaces. We also support freeing system allocated
37// memory that ends up in one of the runtime equivalents, since this can
38// happen if e.g. a library that was compiled without interposition returns
39// an allocation that can be validly passed to `free`.
40//===----------------------------------------------------------------------===//
41
42#include "llvm/Transforms/HipStdPar/HipStdPar.h"
43
44#include "llvm/ADT/STLExtras.h"
45#include "llvm/ADT/SmallPtrSet.h"
46#include "llvm/ADT/SmallVector.h"
47#include "llvm/Analysis/CallGraph.h"
48#include "llvm/Analysis/OptimizationRemarkEmitter.h"
49#include "llvm/IR/Constants.h"
50#include "llvm/IR/Function.h"
51#include "llvm/IR/Module.h"
52#include "llvm/Transforms/Utils/ModuleUtils.h"
53
54#include <cassert>
55#include <string>
56#include <utility>
57
58using namespace llvm;
59
60template<typename T>
61static inline void eraseFromModule(T &ToErase) {
62 ToErase.replaceAllUsesWith(PoisonValue::get(T: ToErase.getType()));
63 ToErase.eraseFromParent();
64}
65
66static inline bool checkIfSupported(GlobalVariable &G) {
67 if (!G.isThreadLocal())
68 return true;
69
70 G.dropDroppableUses();
71
72 if (!G.isConstantUsed())
73 return true;
74
75 std::string W;
76 raw_string_ostream OS(W);
77
78 OS << "Accelerator does not support the thread_local variable "
79 << G.getName();
80
81 Instruction *I = nullptr;
82 SmallVector<User *> Tmp(G.users());
83 SmallPtrSet<User *, 5> Visited;
84 do {
85 auto U = std::move(Tmp.back());
86 Tmp.pop_back();
87
88 if (!Visited.insert(Ptr: U).second)
89 continue;
90
91 if (isa<Instruction>(Val: U))
92 I = cast<Instruction>(Val: U);
93 else
94 Tmp.insert(I: Tmp.end(), From: U->user_begin(), To: U->user_end());
95 } while (!I && !Tmp.empty());
96
97 assert(I && "thread_local global should have at least one non-constant use.");
98
99 G.getContext().diagnose(
100 DI: DiagnosticInfoUnsupported(*I->getParent()->getParent(), W,
101 I->getDebugLoc(), DS_Error));
102
103 return false;
104}
105
106static inline void clearModule(Module &M) { // TODO: simplify.
107 while (!M.functions().empty())
108 eraseFromModule(ToErase&: *M.begin());
109 while (!M.globals().empty())
110 eraseFromModule(ToErase&: *M.globals().begin());
111 while (!M.aliases().empty())
112 eraseFromModule(ToErase&: *M.aliases().begin());
113 while (!M.ifuncs().empty())
114 eraseFromModule(ToErase&: *M.ifuncs().begin());
115}
116
117static inline void maybeHandleGlobals(Module &M) {
118 unsigned GlobAS = M.getDataLayout().getDefaultGlobalsAddressSpace();
119 for (auto &&G : M.globals()) { // TODO: should we handle these in the FE?
120 if (!checkIfSupported(G))
121 return clearModule(M);
122
123 if (G.isThreadLocal())
124 continue;
125 if (G.isConstant())
126 continue;
127 if (G.getAddressSpace() != GlobAS)
128 continue;
129 if (G.getLinkage() != GlobalVariable::ExternalLinkage)
130 continue;
131
132 G.setLinkage(GlobalVariable::ExternalWeakLinkage);
133 G.setInitializer(nullptr);
134 G.setExternallyInitialized(true);
135 }
136}
137
138template<unsigned N>
139static inline void removeUnreachableFunctions(
140 const SmallPtrSet<const Function *, N>& Reachable, Module &M) {
141 removeFromUsedLists(M, [&](Constant *C) {
142 if (auto F = dyn_cast<Function>(Val: C))
143 return !Reachable.contains(F);
144
145 return false;
146 });
147
148 SmallVector<std::reference_wrapper<Function>> ToRemove;
149 copy_if(M, std::back_inserter(x&: ToRemove), [&](auto &&F) {
150 return !F.isIntrinsic() && !Reachable.contains(&F);
151 });
152
153 for_each(Range&: ToRemove, F: eraseFromModule<Function>);
154}
155
156static inline bool isAcceleratorExecutionRoot(const Function *F) {
157 if (!F)
158 return false;
159
160 return F->getCallingConv() == CallingConv::AMDGPU_KERNEL;
161}
162
163static inline bool checkIfSupported(const Function *F, const CallBase *CB) {
164 const auto Dx = F->getName().rfind(Str: "__hipstdpar_unsupported");
165
166 if (Dx == StringRef::npos)
167 return true;
168
169 const auto N = F->getName().substr(Start: 0, N: Dx);
170
171 std::string W;
172 raw_string_ostream OS(W);
173
174 if (N == "__ASM")
175 OS << "Accelerator does not support the ASM block:\n"
176 << cast<ConstantDataArray>(Val: CB->getArgOperand(i: 0))->getAsCString();
177 else
178 OS << "Accelerator does not support the " << N << " function.";
179
180 auto Caller = CB->getParent()->getParent();
181
182 Caller->getContext().diagnose(
183 DI: DiagnosticInfoUnsupported(*Caller, W, CB->getDebugLoc(), DS_Error));
184
185 return false;
186}
187
188PreservedAnalyses
189 HipStdParAcceleratorCodeSelectionPass::run(Module &M,
190 ModuleAnalysisManager &MAM) {
191 auto &CGA = MAM.getResult<CallGraphAnalysis>(IR&: M);
192
193 SmallPtrSet<const Function *, 32> Reachable;
194 for (auto &&CGN : CGA) {
195 if (!isAcceleratorExecutionRoot(F: CGN.first))
196 continue;
197
198 Reachable.insert(Ptr: CGN.first);
199
200 SmallVector<const Function *> Tmp({CGN.first});
201 do {
202 auto F = std::move(Tmp.back());
203 Tmp.pop_back();
204
205 for (auto &&N : *CGA[F]) {
206 if (!N.second)
207 continue;
208 if (!N.second->getFunction())
209 continue;
210 if (Reachable.contains(Ptr: N.second->getFunction()))
211 continue;
212
213 if (!checkIfSupported(F: N.second->getFunction(),
214 CB: dyn_cast<CallBase>(Val&: *N.first)))
215 return PreservedAnalyses::none();
216
217 Reachable.insert(Ptr: N.second->getFunction());
218 Tmp.push_back(Elt: N.second->getFunction());
219 }
220 } while (!std::empty(cont: Tmp));
221 }
222
223 if (std::empty(cont: Reachable))
224 clearModule(M);
225 else
226 removeUnreachableFunctions(Reachable, M);
227
228 maybeHandleGlobals(M);
229
230 return PreservedAnalyses::none();
231}
232
// Table of {function name, replacement name} pairs consumed by the allocation
// interposition pass: all uses of a function named by the first element are
// redirected to the function named by the second one. Several names may map
// to the same replacement.
static constexpr std::pair<StringLiteral, StringLiteral> ReplaceMap[]{
  // C allocation / deallocation and memory mapping functions.
  {"aligned_alloc",             "__hipstdpar_aligned_alloc"},
  {"calloc",                    "__hipstdpar_calloc"},
  {"free",                      "__hipstdpar_free"},
  {"malloc",                    "__hipstdpar_malloc"},
  {"memalign",                  "__hipstdpar_aligned_alloc"},
  {"mmap",                      "__hipstdpar_mmap"},
  {"munmap",                    "__hipstdpar_munmap"},
  {"posix_memalign",            "__hipstdpar_posix_aligned_alloc"},
  {"realloc",                   "__hipstdpar_realloc"},
  {"reallocarray",              "__hipstdpar_realloc_array"},
  // Itanium-mangled operator delete[] (_Zda*) overloads.
  {"_ZdaPv",                    "__hipstdpar_operator_delete"},
  {"_ZdaPvm",                   "__hipstdpar_operator_delete_sized"},
  {"_ZdaPvSt11align_val_t",     "__hipstdpar_operator_delete_aligned"},
  {"_ZdaPvmSt11align_val_t",    "__hipstdpar_operator_delete_aligned_sized"},
  // Itanium-mangled operator delete (_Zdl*) overloads.
  {"_ZdlPv",                    "__hipstdpar_operator_delete"},
  {"_ZdlPvm",                   "__hipstdpar_operator_delete_sized"},
  {"_ZdlPvSt11align_val_t",     "__hipstdpar_operator_delete_aligned"},
  {"_ZdlPvmSt11align_val_t",    "__hipstdpar_operator_delete_aligned_sized"},
  // Itanium-mangled operator new[] (_Zna*) overloads.
  {"_Znam",                     "__hipstdpar_operator_new"},
  {"_ZnamRKSt9nothrow_t",       "__hipstdpar_operator_new_nothrow"},
  {"_ZnamSt11align_val_t",      "__hipstdpar_operator_new_aligned"},
  {"_ZnamSt11align_val_tRKSt9nothrow_t",
                                "__hipstdpar_operator_new_aligned_nothrow"},

  // Itanium-mangled operator new (_Znw*) overloads.
  {"_Znwm",                     "__hipstdpar_operator_new"},
  {"_ZnwmRKSt9nothrow_t",       "__hipstdpar_operator_new_nothrow"},
  {"_ZnwmSt11align_val_t",      "__hipstdpar_operator_new_aligned"},
  {"_ZnwmSt11align_val_tRKSt9nothrow_t",
                                "__hipstdpar_operator_new_aligned_nothrow"},
  // __builtin_-prefixed spellings.
  {"__builtin_calloc",          "__hipstdpar_calloc"},
  {"__builtin_free",            "__hipstdpar_free"},
  {"__builtin_malloc",          "__hipstdpar_malloc"},
  {"__builtin_operator_delete", "__hipstdpar_operator_delete"},
  {"__builtin_operator_new",    "__hipstdpar_operator_new"},
  {"__builtin_realloc",         "__hipstdpar_realloc"},
  // __libc_-prefixed spellings.
  {"__libc_calloc",             "__hipstdpar_calloc"},
  {"__libc_free",               "__hipstdpar_free"},
  {"__libc_malloc",             "__hipstdpar_malloc"},
  {"__libc_memalign",           "__hipstdpar_aligned_alloc"},
  {"__libc_realloc",            "__hipstdpar_realloc"}};
274
// Table of {hidden name, target name} pairs consumed by the allocation
// interposition pass: a function with the hidden name, if present in the
// module, has its uses redirected to a function with the target name (declared
// on demand) and is then erased.
static constexpr std::pair<StringLiteral, StringLiteral> HiddenMap[]{
  // hidden_malloc and hidden_free are only kept for backwards compatibility /
  // legacy purposes, and we should remove them in the future
  {"__hipstdpar_hidden_malloc",   "__libc_malloc"},
  {"__hipstdpar_hidden_free",     "__libc_free"},
  {"__hipstdpar_hidden_memalign", "__libc_memalign"},
  {"__hipstdpar_hidden_mmap",     "mmap"},
  {"__hipstdpar_hidden_munmap",   "munmap"}};
283
284PreservedAnalyses
285HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) {
286 SmallDenseMap<StringRef, StringRef> AllocReplacements(std::cbegin(cont: ReplaceMap),
287 std::cend(cont: ReplaceMap));
288
289 for (auto &&F : M) {
290 if (!F.hasName())
291 continue;
292 auto It = AllocReplacements.find(Val: F.getName());
293 if (It == AllocReplacements.end())
294 continue;
295
296 if (auto R = M.getFunction(Name: It->second)) {
297 F.replaceAllUsesWith(V: R);
298 } else {
299 std::string W;
300 raw_string_ostream OS(W);
301
302 OS << "cannot be interposed, missing: " << AllocReplacements[F.getName()]
303 << ". Tried to run the allocation interposition pass without the "
304 << "replacement functions available.";
305
306 F.getContext().diagnose(DI: DiagnosticInfoUnsupported(F, W,
307 F.getSubprogram(),
308 DS_Warning));
309 }
310 }
311
312 for (auto &&HR : HiddenMap) {
313 if (auto F = M.getFunction(Name: HR.first)) {
314 auto R = M.getOrInsertFunction(Name: HR.second, T: F->getFunctionType(),
315 AttributeList: F->getAttributes());
316 F->replaceAllUsesWith(V: R.getCallee());
317
318 eraseFromModule(ToErase&: *F);
319 }
320 }
321
322 return PreservedAnalyses::none();
323}
324