1//===- InstrProfilingPlatformROCmHSA.cpp - ROCm HSA device drain ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Supplemental HSA-introspection drain (Linux only).
10//
11// The host-shadow drain in InstrProfilingPlatformROCm.cpp only sees device code
12// objects with a host-side shadow (__hipRegisterVar) or an intercepted
13// hipModuleLoad*. Device-linked code with no host shadow (e.g. RCCL) is
14// invisible to it. This pass walks every GPU agent's loaded executables via
15// HSA, finds each __llvm_profile_sections table on the device, and drains the
16// ones the host-shadow pass missed (deduped by the section-bounds tuple). It
17// reuses processDeviceOffloadPrf() so the profraw layout is identical.
18//
19//===----------------------------------------------------------------------===//
20
21#if defined(__linux__)
22
23extern "C" {
24#include "InstrProfiling.h"
25#include "InstrProfilingPort.h"
26}
27
28#include "InstrProfilingPlatformROCmInternal.h"
29#include "interception/interception.h"
30// C (not C++) headers: clang_rt.profile is built -nostdinc++.
31#include <stddef.h>
32#include <stdint.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36
37using namespace __prof_rocm;
38
39// Mirrored HSA declarations the drain needs (dlopen'd, not linked). See the
40// header for the rationale; the values are HSA's stable C ABI.
41#include "InstrProfilingPlatformROCmHSADefs.h"
42
43#ifdef PROFILE_VERIFY_HSA_ABI
44// When the real ROCm headers are available at build time (developer installs
45// and the downstream GPU CI), check that the mirror above still matches them.
46#include <hsa/hsa.h>
47#include <hsa/hsa_ven_amd_loader.h>
48
49static_assert(PROF_HSA_STATUS_SUCCESS == HSA_STATUS_SUCCESS, "HSA ABI drift");
50static_assert(PROF_HSA_STATUS_INFO_BREAK == HSA_STATUS_INFO_BREAK,
51 "HSA ABI drift");
52static_assert(PROF_HSA_AGENT_INFO_NAME == HSA_AGENT_INFO_NAME, "HSA ABI drift");
53static_assert(PROF_HSA_AGENT_INFO_DEVICE == HSA_AGENT_INFO_DEVICE,
54 "HSA ABI drift");
55static_assert(PROF_HSA_DEVICE_TYPE_GPU == HSA_DEVICE_TYPE_GPU, "HSA ABI drift");
56static_assert(PROF_HSA_SYMBOL_KIND_VARIABLE == HSA_SYMBOL_KIND_VARIABLE,
57 "HSA ABI drift");
58static_assert(PROF_HSA_EXECUTABLE_SYMBOL_INFO_TYPE ==
59 HSA_EXECUTABLE_SYMBOL_INFO_TYPE,
60 "HSA ABI drift");
61static_assert(PROF_HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH ==
62 HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH,
63 "HSA ABI drift");
64static_assert(PROF_HSA_EXECUTABLE_SYMBOL_INFO_NAME ==
65 HSA_EXECUTABLE_SYMBOL_INFO_NAME,
66 "HSA ABI drift");
67static_assert(PROF_HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS ==
68 HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
69 "HSA ABI drift");
70static_assert(PROF_HSA_EXTENSION_AMD_LOADER == HSA_EXTENSION_AMD_LOADER,
71 "HSA ABI drift");
72
73static_assert(sizeof(prof_hsa_agent_t) == sizeof(hsa_agent_t), "HSA ABI drift");
74static_assert(sizeof(prof_hsa_executable_t) == sizeof(hsa_executable_t),
75 "HSA ABI drift");
76static_assert(sizeof(prof_hsa_executable_symbol_t) ==
77 sizeof(hsa_executable_symbol_t),
78 "HSA ABI drift");
79
80static_assert(sizeof(prof_hsa_loader_segment_descriptor_t) ==
81 sizeof(hsa_ven_amd_loader_segment_descriptor_t),
82 "HSA ABI drift");
83static_assert(offsetof(prof_hsa_loader_segment_descriptor_t, agent) ==
84 offsetof(hsa_ven_amd_loader_segment_descriptor_t, agent),
85 "HSA ABI drift");
86static_assert(offsetof(prof_hsa_loader_segment_descriptor_t, executable) ==
87 offsetof(hsa_ven_amd_loader_segment_descriptor_t, executable),
88 "HSA ABI drift");
89static_assert(offsetof(prof_hsa_loader_segment_descriptor_t, segment_base) ==
90 offsetof(hsa_ven_amd_loader_segment_descriptor_t,
91 segment_base),
92 "HSA ABI drift");
93static_assert(offsetof(prof_hsa_loader_segment_descriptor_t, segment_size) ==
94 offsetof(hsa_ven_amd_loader_segment_descriptor_t,
95 segment_size),
96 "HSA ABI drift");
97
98// We fetch the loader pfn table by raw layout, so query_segment_descriptors
99// must sit at the same offset as in the real table.
100static_assert(offsetof(prof_hsa_loader_pfn_t, query_segment_descriptors) ==
101 offsetof(hsa_ven_amd_loader_1_00_pfn_t,
102 hsa_ven_amd_loader_query_segment_descriptors),
103 "HSA ABI drift");
104#endif // PROFILE_VERIFY_HSA_ABI
105
106static hsa_iterate_agents_ty pHsaIterateAgents = nullptr;
107static hsa_agent_get_info_ty pHsaAgentGetInfo = nullptr;
108static hsa_executable_iterate_agent_symbols_ty pHsaExecIterAgentSyms = nullptr;
109static hsa_executable_symbol_get_info_ty pHsaSymGetInfo = nullptr;
110static hsa_loader_query_segment_descriptors_ty pQuerySegDescs = nullptr;
111
112/* Status-check shorthands, in the spirit of the thin HIP wrappers in
113 * InstrProfilingPlatformROCm.cpp: every HSA entry point returns
114 * prof_hsa_status_t. hsaOkOrBreak() also accepts INFO_BREAK, which the
115 * iterate_* callbacks use to stop early and is not an error. */
116static inline bool hsaOk(prof_hsa_status_t St) {
117 return St == PROF_HSA_STATUS_SUCCESS;
118}
119static inline bool hsaOkOrBreak(prof_hsa_status_t St) {
120 return St == PROF_HSA_STATUS_SUCCESS || St == PROF_HSA_STATUS_INFO_BREAK;
121}
122
123/* 0 = not attempted, 1 = ready, -1 = unavailable. Acquire/release atomics: a
124 * thread observing HsaRuntimeState==1 also sees the published p* pointers. */
125static int HsaRuntimeState = 0;
126
127static int setHsaRuntimeState(int S) {
128 __atomic_store_n(&HsaRuntimeState, S, __ATOMIC_RELEASE);
129 return S > 0 ? 0 : -1;
130}
131
132/* Resolve HSA entry points and the AMD loader extension once, and confirm HIP's
133 * hipMemcpy is reachable for the device-to-host copies. */
134static int loadHsaRuntimePointers(void) {
135 int State = __atomic_load_n(&HsaRuntimeState, __ATOMIC_ACQUIRE);
136 if (State)
137 return State > 0 ? 0 : -1;
138
139 if (!__interception::DynamicLoaderAvailable()) {
140 if (isVerboseMode())
141 PROF_NOTE("%s", "Dynamic library loading not available - "
142 "HSA device profiling disabled\n");
143 return setHsaRuntimeState(-1);
144 }
145
146 void *Hsa = __interception::OpenLibrary(name: "libhsa-runtime64.so");
147 if (!Hsa)
148 Hsa = __interception::OpenLibrary(name: "libhsa-runtime64.so.1");
149 if (!Hsa) {
150 if (isVerboseMode())
151 PROF_NOTE("%s", "libhsa-runtime64.so not loadable - "
152 "HSA device profiling disabled\n");
153 return setHsaRuntimeState(-1);
154 }
155
156 hsa_init_ty pHsaInit =
157 (hsa_init_ty)__interception::LookupSymbol(handle: Hsa, symbol: "hsa_init");
158 hsa_system_get_major_extension_table_ty pGetExtTable =
159 (hsa_system_get_major_extension_table_ty)__interception::LookupSymbol(
160 handle: Hsa, symbol: "hsa_system_get_major_extension_table");
161 pHsaIterateAgents = (hsa_iterate_agents_ty)__interception::LookupSymbol(
162 handle: Hsa, symbol: "hsa_iterate_agents");
163 pHsaAgentGetInfo = (hsa_agent_get_info_ty)__interception::LookupSymbol(
164 handle: Hsa, symbol: "hsa_agent_get_info");
165 pHsaExecIterAgentSyms =
166 (hsa_executable_iterate_agent_symbols_ty)__interception::LookupSymbol(
167 handle: Hsa, symbol: "hsa_executable_iterate_agent_symbols");
168 pHsaSymGetInfo =
169 (hsa_executable_symbol_get_info_ty)__interception::LookupSymbol(
170 handle: Hsa, symbol: "hsa_executable_symbol_get_info");
171
172 if (!pHsaInit || !pGetExtTable || !pHsaIterateAgents || !pHsaAgentGetInfo ||
173 !pHsaExecIterAgentSyms || !pHsaSymGetInfo) {
174 PROF_WARN("%s",
175 "required HSA symbols missing - HSA device profiling disabled\n");
176 return setHsaRuntimeState(-1);
177 }
178
179 /* Bring HSA up lazily on the first drain (idempotent, refcounted), never from
180 * a library constructor -- see the fork-safety note at end of file. */
181 prof_hsa_status_t St = pHsaInit();
182 if (!hsaOkOrBreak(St)) {
183 if (isVerboseMode())
184 PROF_NOTE("hsa_init failed (0x%x) - HSA device profiling disabled\n", St);
185 return setHsaRuntimeState(-1);
186 }
187
188 prof_hsa_loader_pfn_t LoaderApi;
189 __builtin_memset(&LoaderApi, 0, sizeof(LoaderApi));
190 St = pGetExtTable(PROF_HSA_EXTENSION_AMD_LOADER, 1, sizeof(LoaderApi),
191 &LoaderApi);
192 if (!hsaOk(St) || !LoaderApi.query_segment_descriptors) {
193 PROF_WARN("AMD loader extension unavailable (0x%x) - "
194 "HSA device profiling disabled\n",
195 St);
196 return setHsaRuntimeState(-1);
197 }
198 pQuerySegDescs = LoaderApi.query_segment_descriptors;
199
200 /* The device-to-host copies go through the shared HIP loader. */
201 ensureHipLoaded();
202 if (!hipMemcpyAvailable()) {
203 PROF_WARN("%s", "hipMemcpy unavailable - HSA device profiling disabled\n");
204 return setHsaRuntimeState(-1);
205 }
206
207 if (isVerboseMode())
208 PROF_NOTE("%s", "HSA + HIP runtime resolved for device profiling\n");
209 return setHsaRuntimeState(1);
210}
211
212/* The canonical device bounds-table symbol from InstrProfilingPlatformGPU.c. */
213static const char ProfileSectionsSymbol[] = "__llvm_profile_sections";
214
215/* Dedup of drained section-bounds tuples, shared with the host-shadow path
216 * (processDeviceOffloadPrf records here on every successful drain) so each
217 * unique counter set is drained exactly once across both paths.
218 */
219static ProfBoundsSet SeenBounds;
220
221/* Has this bounds tuple already been drained? Pure check, no state mutation. */
222static int profBoundsAlreadyDrained(const void *D, const void *C,
223 const void *N) {
224 return SeenBounds.contains(D, C, N);
225}
226
227/* Record a drained bounds tuple. Idempotent; call only after a successful drain
228 * so a failed attempt stays retryable. */
229void __prof_rocm::profRecordDrainedBounds(const void *D, const void *C,
230 const void *N) {
231 SeenBounds.record(D, C, N);
232}
233
234#define PROF_MAX_GPU_AGENTS 64
235
236/* Buffer size for HSA agent names and symbol names we read back; both the
237 * device arch string and the __llvm_profile_sections symbol are far shorter. */
238#define PROF_HSA_NAME_MAX 64
239
240namespace {
241struct GpuAgent {
242 prof_hsa_agent_t agent;
243 char arch[PROF_HSA_NAME_MAX];
244};
245
246struct WalkState {
247 GpuAgent agents[PROF_MAX_GPU_AGENTS];
248 int num_agents;
249 int total_found;
250 int total_drained;
251};
252
253/* Per (agent, executable) symbol-iteration state. */
254struct SymbolState {
255 const char *arch;
256 int found;
257 int drained;
258};
259} // namespace
260
261/* HSA per-symbol callback: when it finds a __llvm_profile_sections variable,
262 * drain it via processDeviceOffloadPrf() unless the host-shadow path (or an
263 * earlier agent) already handled the same bounds. */
264static prof_hsa_status_t onSymbol(prof_hsa_executable_t, prof_hsa_agent_t,
265 prof_hsa_executable_symbol_t Sym,
266 void *Data) {
267 SymbolState *S = (SymbolState *)Data;
268
269 prof_hsa_symbol_kind_t Kind;
270 if (!hsaOk(
271 St: pHsaSymGetInfo(Sym, PROF_HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &Kind)) ||
272 Kind != PROF_HSA_SYMBOL_KIND_VARIABLE)
273 return PROF_HSA_STATUS_SUCCESS;
274
275 uint32_t NameLen = 0;
276 if (!hsaOk(St: pHsaSymGetInfo(Sym, PROF_HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH,
277 &NameLen)) ||
278 NameLen != sizeof(ProfileSectionsSymbol) - 1)
279 return PROF_HSA_STATUS_SUCCESS;
280
281 char NameBuf[PROF_HSA_NAME_MAX];
282 if (NameLen + 1 > sizeof(NameBuf))
283 return PROF_HSA_STATUS_SUCCESS;
284 if (!hsaOk(
285 St: pHsaSymGetInfo(Sym, PROF_HSA_EXECUTABLE_SYMBOL_INFO_NAME, NameBuf)))
286 return PROF_HSA_STATUS_SUCCESS;
287 NameBuf[NameLen] = '\0';
288
289 if (strcmp(s1: NameBuf, s2: ProfileSectionsSymbol) != 0)
290 return PROF_HSA_STATUS_SUCCESS;
291
292 uint64_t Addr = 0;
293 if (!hsaOk(St: pHsaSymGetInfo(
294 Sym, PROF_HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &Addr)) ||
295 Addr == 0) {
296 if (isVerboseMode())
297 PROF_NOTE("%s", "failed to read __llvm_profile_sections address\n");
298 return PROF_HSA_STATUS_SUCCESS;
299 }
300
301 S->found++;
302
303 // Read the bounds table first to dedup (and detect empty sections) before
304 // the full copy/relocate done by processDeviceOffloadPrf.
305 __llvm_profile_gpu_sections Sec;
306 if (memcpyDeviceToHost(Dst: &Sec, Src: (void *)(uintptr_t)Addr, Size: sizeof(Sec)) != 0) {
307 PROF_WARN("%s", "failed to copy device bounds table\n");
308 return PROF_HSA_STATUS_SUCCESS;
309 }
310 if (profBoundsAlreadyDrained(D: Sec.DataStart, C: Sec.CountersStart,
311 N: Sec.NamesStart)) {
312 if (isVerboseMode())
313 PROF_NOTE("%s", "device bounds already drained, skipping\n");
314 return PROF_HSA_STATUS_SUCCESS;
315 }
316
317 size_t DataBytes = (const char *)Sec.DataStop - (const char *)Sec.DataStart;
318 size_t CntsBytes =
319 (const char *)Sec.CountersStop - (const char *)Sec.CountersStart;
320 if (DataBytes == 0 || CntsBytes == 0) {
321 // Empty code object: nothing to write. Mark seen so we don't revisit it.
322 profRecordDrainedBounds(D: Sec.DataStart, C: Sec.CountersStart, N: Sec.NamesStart);
323 return PROF_HSA_STATUS_SUCCESS;
324 }
325
326 // Name HSA-drained objects in their own ".hsaN" suffix space so they never
327 // collide with the host-shadow path's "arch"/"arch.<i>" filenames. The drain
328 // latch (HsaDrainCompleted) already prevents re-draining an object, so a
329 // plain per-drain counter is enough for uniqueness.
330 static int DrainIndex = 0;
331 char Target[96];
332 snprintf(s: Target, maxlen: sizeof(Target), format: "%s.hsa%d", S->arch, DrainIndex);
333
334 // Record the bounds (and advance the index) only on a successful write so a
335 // transient error stays retryable on a later agent or collect call.
336 if (processDeviceOffloadPrf(DeviceOffloadPrf: (void *)(uintptr_t)Addr, Target, Sections: nullptr) == 0) {
337 S->drained++;
338 DrainIndex++;
339 profRecordDrainedBounds(D: Sec.DataStart, C: Sec.CountersStart, N: Sec.NamesStart);
340 }
341
342 return PROF_HSA_STATUS_SUCCESS;
343}
344
345static prof_hsa_status_t collectAgent(prof_hsa_agent_t Agent, void *Data) {
346 prof_hsa_device_type_t DevType;
347 if (!hsaOk(St: pHsaAgentGetInfo(Agent, PROF_HSA_AGENT_INFO_DEVICE, &DevType)) ||
348 DevType != PROF_HSA_DEVICE_TYPE_GPU)
349 return PROF_HSA_STATUS_SUCCESS;
350
351 WalkState *W = (WalkState *)Data;
352 if (W->num_agents >= PROF_MAX_GPU_AGENTS)
353 return PROF_HSA_STATUS_SUCCESS;
354
355 GpuAgent &GA = W->agents[W->num_agents++];
356 GA.agent = Agent;
357 char Name[PROF_HSA_NAME_MAX];
358 __builtin_memset(Name, 0, sizeof(Name));
359 pHsaAgentGetInfo(Agent, PROF_HSA_AGENT_INFO_NAME, Name);
360 size_t N = strnlen(string: Name, maxlen: sizeof(GA.arch) - 1);
361 __builtin_memcpy(GA.arch, Name, N);
362 GA.arch[N] = '\0';
363 if (!GA.arch[0])
364 strncpy(dest: GA.arch, src: "amdgpu", n: sizeof(GA.arch) - 1);
365
366 if (isVerboseMode())
367 PROF_NOTE("GPU agent %d: %s\n", W->num_agents - 1, GA.arch);
368 return PROF_HSA_STATUS_SUCCESS;
369}
370
371/* Reentrancy guard and "drained at least once" latch (both acquire/release). */
372static int HsaDrainInProgress = 0;
373static int HsaDrainCompleted = 0;
374
375int __prof_rocm::drainDevicesViaHsa(void) {
376 if (__atomic_load_n(&HsaDrainCompleted, __ATOMIC_ACQUIRE))
377 return 0;
378
379 int Expected = 0;
380 if (!__atomic_compare_exchange_n(&HsaDrainInProgress, &Expected, 1,
381 /*weak=*/0, __ATOMIC_ACQ_REL,
382 __ATOMIC_ACQUIRE))
383 return 0;
384
385 struct InProgressGuard {
386 ~InProgressGuard() {
387 __atomic_store_n(&HsaDrainInProgress, 0, __ATOMIC_RELEASE);
388 }
389 } _Guard;
390
391 if (loadHsaRuntimePointers() != 0)
392 return 0; /* Runtime unavailable: stay retryable. */
393
394 WalkState W;
395 __builtin_memset(&W, 0, sizeof(W));
396 prof_hsa_status_t St = pHsaIterateAgents(collectAgent, &W);
397 if (!hsaOkOrBreak(St)) {
398 PROF_WARN("hsa_iterate_agents failed (0x%x)\n", St);
399 return -1;
400 }
401 if (W.num_agents == 0) {
402 if (isVerboseMode())
403 PROF_NOTE("%s", "no GPU agents present; nothing to drain (will retry)\n");
404 return 0;
405 }
406
407 /* query_segment_descriptors ships in every loader-extension version, is more
408 * permissive than iterate_executables on ROCm, and yields the loaded
409 * (agent, executable) pairs directly. */
410 size_t NumSegs = 0;
411 St = pQuerySegDescs(nullptr, &NumSegs);
412 if (!hsaOk(St)) {
413 PROF_WARN("query_segment_descriptors(count) failed (0x%x)\n", St);
414 return -1;
415 }
416 if (NumSegs == 0) {
417 if (isVerboseMode())
418 PROF_NOTE("%s", "no loaded segments; nothing to drain (will retry)\n");
419 return 0;
420 }
421
422 prof_hsa_loader_segment_descriptor_t *Segs =
423 (prof_hsa_loader_segment_descriptor_t *)calloc(nmemb: NumSegs, size: sizeof(*Segs));
424 if (!Segs) {
425 PROF_ERR("%s\n", "failed to allocate segment descriptor array");
426 return -1;
427 }
428 UniqueFree SegsOwner(Segs);
429
430 St = pQuerySegDescs(Segs, &NumSegs);
431 if (!hsaOk(St)) {
432 PROF_WARN("query_segment_descriptors(fetch) failed (0x%x)\n", St);
433 return -1;
434 }
435
436 if (isVerboseMode())
437 PROF_NOTE("query_segment_descriptors: %zu segments\n", NumSegs);
438
439 // Walk each unique (agent, executable) pair once.
440 struct SeenPair {
441 uint64_t agent;
442 uint64_t exec;
443 };
444 enum { kSeenPairsInitCap = 64 };
445 SeenPair *Seen = nullptr;
446 int NumPairs = 0;
447 int CapPairs = 0;
448 int IterFailures = 0;
449
450 for (size_t i = 0; i < NumSegs; ++i) {
451 if (Segs[i].executable.handle == 0 || Segs[i].agent.handle == 0)
452 continue;
453
454 bool AlreadySeen = false;
455 for (int j = 0; j < NumPairs; ++j)
456 if (Seen[j].agent == Segs[i].agent.handle &&
457 Seen[j].exec == Segs[i].executable.handle) {
458 AlreadySeen = true;
459 break;
460 }
461 if (AlreadySeen)
462 continue;
463 if (growArray(Arr: (void **)&Seen, Cap: &CapPairs, MinCount: NumPairs + 1, InitCap: kSeenPairsInitCap,
464 ElemSize: sizeof(*Seen)) == 0) {
465 Seen[NumPairs].agent = Segs[i].agent.handle;
466 Seen[NumPairs].exec = Segs[i].executable.handle;
467 NumPairs++;
468 }
469
470 const char *Arch = nullptr;
471 for (int k = 0; k < W.num_agents; ++k)
472 if (W.agents[k].agent.handle == Segs[i].agent.handle) {
473 Arch = W.agents[k].arch;
474 break;
475 }
476 if (!Arch)
477 continue; /* not a GPU agent we collected */
478
479 SymbolState S;
480 __builtin_memset(&S, 0, sizeof(S));
481 S.arch = Arch;
482 if (isVerboseMode())
483 PROF_NOTE("walking executable 0x%llx on %s\n",
484 (unsigned long long)Segs[i].executable.handle, Arch);
485 prof_hsa_status_t IterSt =
486 pHsaExecIterAgentSyms(Segs[i].executable, Segs[i].agent, onSymbol, &S);
487 if (!hsaOkOrBreak(St: IterSt)) {
488 PROF_WARN("hsa_executable_iterate_agent_symbols on executable 0x%llx "
489 "failed (0x%x)\n",
490 (unsigned long long)Segs[i].executable.handle, IterSt);
491 IterFailures++;
492 }
493 W.total_found += S.found;
494 W.total_drained += S.drained;
495 }
496
497 if (isVerboseMode())
498 PROF_NOTE("HSA walk complete: agents=%d pairs=%d found=%d drained=%d "
499 "iter-failures=%d\n",
500 W.num_agents, NumPairs, W.total_found, W.total_drained,
501 IterFailures);
502
503 free(ptr: Seen);
504
505 /* Latch only when we actually drained data. A "found nothing new" walk is
506 * deliberately not latched: an early collect can precede any kernel launch,
507 * and latching it would suppress the real exit-time drain. No-op walks are
508 * cheap to repeat. */
509 if (W.total_drained > 0)
510 __atomic_store_n(&HsaDrainCompleted, 1, __ATOMIC_RELEASE);
511 return (IterFailures > 0) ? -1 : 0;
512}
513
514/* Fork-safety: deliberately no library constructor calling hsa_init(). */
515
516#endif /* defined(__linux__) && !defined(_WIN32) -- HSA drain */
517