1//===- InstrProfilingPlatformROCm.cpp - Profile data ROCm platform -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9extern "C" {
10#include "InstrProfiling.h"
11#include "InstrProfilingPort.h"
12}
13
14#include "interception/interception.h"
15// C library headers (not <cstdio> etc.): clang_rt.profile is built with
16// -nostdinc++ and avoids the C++ standard library (see profile/CMakeLists.txt).
17#include <stddef.h>
18#include <stdio.h>
19#include <stdlib.h>
20#include <string.h>
21#ifdef _WIN32
22#include <wchar.h>
23#endif
24
25#ifdef _WIN32
26#define WIN32_LEAN_AND_MEAN
27#include <windows.h>
28// windows.h needs to be included before tlhelp32.h.
29#include <tlhelp32.h>
30#else
31#include <dlfcn.h>
32#include <pthread.h>
33#endif
34
35#include "InstrProfilingPlatformROCmInternal.h"
36
37// shortcut to shared helper names
38using namespace __prof_rocm;
39
40/* Serialize one-time HIP loader resolution and DynamicModules mutations.
41 * Inline to avoid a sanitizer_common dependency. */
42#ifdef _WIN32
43static INIT_ONCE HipLoadedOnce = INIT_ONCE_STATIC_INIT;
44static CRITICAL_SECTION DynamicModulesLock;
45static INIT_ONCE DynamicModulesLockInit = INIT_ONCE_STATIC_INIT;
46static BOOL CALLBACK initDynamicModulesLockCb(PINIT_ONCE, PVOID, PVOID *) {
47 InitializeCriticalSection(&DynamicModulesLock);
48 return TRUE;
49}
50static void lockDynamicModules(void) {
51 InitOnceExecuteOnce(&DynamicModulesLockInit, initDynamicModulesLockCb, NULL,
52 NULL);
53 EnterCriticalSection(&DynamicModulesLock);
54}
55static void unlockDynamicModules(void) {
56 LeaveCriticalSection(&DynamicModulesLock);
57}
58#else
59static pthread_once_t HipLoadedOnce = PTHREAD_ONCE_INIT;
60static pthread_mutex_t DynamicModulesLock = PTHREAD_MUTEX_INITIALIZER;
61static void lockDynamicModules(void) {
62 pthread_mutex_lock(mutex: &DynamicModulesLock);
63}
64static void unlockDynamicModules(void) {
65 pthread_mutex_unlock(mutex: &DynamicModulesLock);
66}
67#endif
68
69int __prof_rocm::isVerboseMode() {
70 static int IsVerbose = -1;
71 if (IsVerbose == -1)
72 IsVerbose = getenv(name: "LLVM_PROFILE_VERBOSE") != nullptr;
73 return IsVerbose;
74}
75
76/* -------------------------------------------------------------------------- */
77/* Dynamic loading of HIP runtime symbols */
78/* -------------------------------------------------------------------------- */
79
80typedef int (*hipGetSymbolAddressTy)(void **, const void *);
81typedef int (*hipGetSymbolSizeTy)(size_t *, const void *);
82typedef int (*hipMemcpyTy)(void *, const void *, size_t, int);
83typedef int (*hipModuleGetGlobalTy)(void **, size_t *, void *, const char *);
84typedef int (*hipGetDeviceCountTy)(int *);
85typedef int (*hipGetDeviceTy)(int *);
86typedef int (*hipSetDeviceTy)(int);
87#if defined(__linux__) && !defined(_WIN32)
88typedef void *HipStream;
89typedef int (*hipStreamGetDeviceTy)(HipStream, int *);
90#endif
91
92/* Minimal hipDeviceProp_t (HIP 6.x R0600): only gcnArchName at offset 1160
93 * is read. Padded to 4096 to tolerate ABI growth. */
94typedef struct {
95 char padding[1160];
96 char gcnArchName[256];
97 char tail_padding[2680];
98} HipDevicePropMinimal;
99typedef int (*hipGetDevicePropertiesTy)(HipDevicePropMinimal *, int);
100
101static hipGetSymbolAddressTy pHipGetSymbolAddress = nullptr;
102static hipGetSymbolSizeTy pHipGetSymbolSize = nullptr;
103static hipMemcpyTy pHipMemcpy = nullptr;
104static hipModuleGetGlobalTy pHipModuleGetGlobal = nullptr;
105static hipGetDeviceCountTy pHipGetDeviceCount = nullptr;
106static hipGetDeviceTy pHipGetDevice = nullptr;
107static hipSetDeviceTy pHipSetDevice = nullptr;
108#if defined(__linux__) && !defined(_WIN32)
109static hipStreamGetDeviceTy pHipStreamGetDevice = nullptr;
110#endif
111static hipGetDevicePropertiesTy pHipGetDeviceProperties = nullptr;
112
113static int NumDevices = 0;
114/* 256 matches hipDeviceProp_t::gcnArchName, the source field width. */
115static char (*DeviceArchNames)[256] = nullptr;
116#if defined(__linux__) && !defined(_WIN32)
117static unsigned char *UsedDevices = nullptr;
118static int AnyDeviceUsed = 0;
119#endif
120
121#ifdef _WIN32
122static wchar_t toLowerAsciiW(wchar_t C) {
123 return C >= L'A' && C <= L'Z' ? C - L'A' + L'a' : C;
124}
125
126static int wcsEqualNoCase(const wchar_t *A, const wchar_t *B) {
127 while (*A && *B) {
128 if (toLowerAsciiW(*A) != toLowerAsciiW(*B))
129 return 0;
130 ++A;
131 ++B;
132 }
133 return *A == *B;
134}
135
136static int wcsStartsWithNoCase(const wchar_t *S, const wchar_t *Prefix) {
137 while (*Prefix) {
138 if (toLowerAsciiW(*S) != toLowerAsciiW(*Prefix))
139 return 0;
140 ++S;
141 ++Prefix;
142 }
143 return 1;
144}
145
146static int wcsEndsWithNoCase(const wchar_t *S, const wchar_t *Suffix) {
147 size_t SLen = wcslen(S);
148 size_t SuffixLen = wcslen(Suffix);
149 return SLen >= SuffixLen && wcsEqualNoCase(S + SLen - SuffixLen, Suffix);
150}
151
152static int isHipRuntimeModuleName(const wchar_t *Name) {
153 return wcsEqualNoCase(Name, L"amdhip64.dll") ||
154 (wcsStartsWithNoCase(Name, L"amdhip64_") &&
155 wcsEndsWithNoCase(Name, L".dll"));
156}
157
158static void *findLoadedHipRuntime(void) {
159 HMODULE Handle = GetModuleHandleW(L"amdhip64.dll");
160 if (Handle)
161 return (void *)Handle;
162
163 HANDLE Snapshot = CreateToolhelp32Snapshot(
164 TH32CS_SNAPMODULE | TH32CS_SNAPMODULE32, GetCurrentProcessId());
165 if (Snapshot == INVALID_HANDLE_VALUE)
166 return nullptr;
167
168 MODULEENTRY32W Entry;
169 Entry.dwSize = sizeof(Entry);
170 if (Module32FirstW(Snapshot, &Entry)) {
171 do {
172 if (isHipRuntimeModuleName(Entry.szModule)) {
173 Handle = Entry.hModule;
174 break;
175 }
176 } while (Module32NextW(Snapshot, &Entry));
177 }
178
179 CloseHandle(Snapshot);
180 return (void *)Handle;
181}
182#endif
183
184/* -------------------------------------------------------------------------- */
185/* Device-to-host copies */
186/* Keep HIP-only to avoid an HSA dependency. */
187/* -------------------------------------------------------------------------- */
188
189static void doEnsureHipLoaded(void) {
190 if (!__interception::DynamicLoaderAvailable()) {
191 if (isVerboseMode())
192 PROF_NOTE("%s", "Dynamic library loading not available - "
193 "HIP profiling disabled\n");
194 return;
195 }
196
197#ifdef _WIN32
198 /* Use the app's loaded HIP runtime to avoid binding another ROCm version. */
199 void *Handle = findLoadedHipRuntime();
200#else
201 const char *HipLibName = "libamdhip64.so";
202 void *Handle = __interception::OpenLibrary(name: HipLibName);
203#endif
204 if (!Handle)
205 return;
206
207 pHipGetSymbolAddress = (hipGetSymbolAddressTy)__interception::LookupSymbol(
208 handle: Handle, symbol: "hipGetSymbolAddress");
209 pHipGetSymbolSize = (hipGetSymbolSizeTy)__interception::LookupSymbol(
210 handle: Handle, symbol: "hipGetSymbolSize");
211 pHipMemcpy = (hipMemcpyTy)__interception::LookupSymbol(handle: Handle, symbol: "hipMemcpy");
212 pHipModuleGetGlobal = (hipModuleGetGlobalTy)__interception::LookupSymbol(
213 handle: Handle, symbol: "hipModuleGetGlobal");
214 pHipGetDeviceCount = (hipGetDeviceCountTy)__interception::LookupSymbol(
215 handle: Handle, symbol: "hipGetDeviceCount");
216 pHipGetDevice =
217 (hipGetDeviceTy)__interception::LookupSymbol(handle: Handle, symbol: "hipGetDevice");
218 pHipSetDevice =
219 (hipSetDeviceTy)__interception::LookupSymbol(handle: Handle, symbol: "hipSetDevice");
220#if defined(__linux__) && !defined(_WIN32)
221 pHipStreamGetDevice = (hipStreamGetDeviceTy)__interception::LookupSymbol(
222 handle: Handle, symbol: "hipStreamGetDevice");
223#endif
224 pHipGetDeviceProperties =
225 (hipGetDevicePropertiesTy)__interception::LookupSymbol(
226 handle: Handle, symbol: "hipGetDevicePropertiesR0600");
227 if (!pHipGetDeviceProperties)
228 pHipGetDeviceProperties =
229 (hipGetDevicePropertiesTy)__interception::LookupSymbol(
230 handle: Handle, symbol: "hipGetDeviceProperties");
231
232 if (pHipGetDeviceCount && pHipGetDeviceProperties) {
233 int Count = 0;
234 if (pHipGetDeviceCount(&Count) == 0 && Count > 0) {
235 DeviceArchNames = (char (*)[256])calloc(nmemb: Count, size: sizeof(*DeviceArchNames));
236 if (!DeviceArchNames) {
237 PROF_ERR("%s\n", "failed to allocate device arch name table");
238 return;
239 }
240#if defined(__linux__) && !defined(_WIN32)
241 UsedDevices = (unsigned char *)calloc(nmemb: Count, size: sizeof(*UsedDevices));
242 if (!UsedDevices && isVerboseMode())
243 PROF_NOTE("%s\n", "Device-use tracking disabled");
244#endif
245 HipDevicePropMinimal Prop;
246 for (int i = 0; i < Count; ++i) {
247 __builtin_memset(&Prop, 0, sizeof(Prop));
248 if (pHipGetDeviceProperties(&Prop, i) == 0) {
249 strncpy(dest: DeviceArchNames[i], src: Prop.gcnArchName,
250 n: sizeof(DeviceArchNames[i]) - 1);
251 DeviceArchNames[i][sizeof(DeviceArchNames[i]) - 1] = '\0';
252 if (isVerboseMode())
253 PROF_NOTE("Device %d arch: %s\n", i, DeviceArchNames[i]);
254 }
255 }
256 NumDevices = Count;
257 }
258 }
259}
260
261#ifdef _WIN32
262static BOOL CALLBACK ensureHipLoadedCb(PINIT_ONCE, PVOID, PVOID *) {
263 doEnsureHipLoaded();
264 return TRUE;
265}
266#endif
267
268void __prof_rocm::ensureHipLoaded(void) {
269#ifdef _WIN32
270 InitOnceExecuteOnce(&HipLoadedOnce, ensureHipLoadedCb, NULL, NULL);
271#else
272 pthread_once(once_control: &HipLoadedOnce, init_routine: doEnsureHipLoaded);
273#endif
274}
275
276// Accessor for the HSA drain: true once the loaded HIP runtime exposes
277// hipMemcpy. Kept here so pHipMemcpy stays file-private to this TU.
278int __prof_rocm::hipMemcpyAvailable() { return pHipMemcpy != nullptr; }
279
280/* -------------------------------------------------------------------------- */
281/* Public wrappers that forward to the loaded HIP symbols */
282/* -------------------------------------------------------------------------- */
283
284static int hipGetSymbolAddress(void **devPtr, const void *symbol) {
285 ensureHipLoaded();
286 return pHipGetSymbolAddress ? pHipGetSymbolAddress(devPtr, symbol) : -1;
287}
288
289static int hipGetSymbolSize(size_t *size, const void *symbol) {
290 ensureHipLoaded();
291 return pHipGetSymbolSize ? pHipGetSymbolSize(size, symbol) : -1;
292}
293
294static int hipMemcpy(void *dest, const void *src, size_t len,
295 int kind /*2=DToH*/) {
296 ensureHipLoaded();
297 return pHipMemcpy ? pHipMemcpy(dest, src, len, kind) : -1;
298}
299
300/* Device section symbols must be registered with CLR first; otherwise
301 * hipMemcpy may take a CPU path and crash. */
302int __prof_rocm::memcpyDeviceToHost(void *Dst, const void *Src, size_t Size) {
303 return hipMemcpy(dest: Dst, src: Src, len: Size, kind: 2 /* DToH */);
304}
305
306static int hipModuleGetGlobal(void **DevPtr, size_t *Bytes, void *Module,
307 const char *Name) {
308 ensureHipLoaded();
309 return pHipModuleGetGlobal ? pHipModuleGetGlobal(DevPtr, Bytes, Module, Name)
310 : -1;
311}
312
313static int hipGetDevice(int *DeviceId) {
314 ensureHipLoaded();
315 return pHipGetDevice ? pHipGetDevice(DeviceId) : -1;
316}
317
318static int hipSetDevice(int DeviceId) {
319 ensureHipLoaded();
320 return pHipSetDevice ? pHipSetDevice(DeviceId) : -1;
321}
322
323#if defined(__linux__) && !defined(_WIN32)
324static int hipStreamGetDevice(HipStream Stream, int *DeviceId) {
325 ensureHipLoaded();
326 return pHipStreamGetDevice ? pHipStreamGetDevice(Stream, DeviceId) : -1;
327}
328
329static void markDeviceUsed(int DeviceId) {
330 if (DeviceId < 0 || DeviceId >= NumDevices || !UsedDevices)
331 return;
332 __atomic_store_n(&UsedDevices[DeviceId], 1, __ATOMIC_RELAXED);
333 __atomic_store_n(&AnyDeviceUsed, 1, __ATOMIC_RELEASE);
334}
335
336static void markCurrentDeviceUsed(void) {
337 int DeviceId = -1;
338 if (hipGetDevice(DeviceId: &DeviceId) == 0)
339 markDeviceUsed(DeviceId);
340}
341
342static void markLaunchStreamDeviceUsed(HipStream Stream) {
343 int DeviceId = -1;
344 if (Stream && hipStreamGetDevice(Stream, DeviceId: &DeviceId) == 0) {
345 markDeviceUsed(DeviceId);
346 return;
347 }
348 markCurrentDeviceUsed();
349}
350
351static int shouldCollectDevice(int DeviceId) {
352 if (UsedDevices && __atomic_load_n(&AnyDeviceUsed, __ATOMIC_ACQUIRE) &&
353 !__atomic_load_n(&UsedDevices[DeviceId], __ATOMIC_RELAXED))
354 return 0;
355 return 1;
356}
357#else
358static int shouldCollectDevice(int) { return 1; }
359#endif
360
361static const char *getDeviceArchName(int DeviceId) {
362 if (DeviceId < 0 || DeviceId >= NumDevices || !DeviceArchNames[DeviceId][0])
363 return "amdgpu";
364 return DeviceArchNames[DeviceId];
365}
366
367/* -------------------------------------------------------------------------- */
368/* Dynamic module tracking */
369/* -------------------------------------------------------------------------- */
370
371/* Per-TU profile entry inside a dynamic module.
372 * A single dynamic module may contain multiple TUs (e.g. -fgpu-rdc). */
373typedef struct {
374 void *DeviceVar; /* device address of __llvm_profile_sections_<CUID> */
375 int Processed; /* 0 = not yet collected, 1 = data already copied */
376} OffloadDynamicTUInfo;
377
378/* One entry per hipModuleLoad call. */
379typedef struct {
380 void *ModulePtr; /* hipModule_t handle */
381 OffloadDynamicTUInfo *TUs; /* array of per-TU entries */
382 int NumTUs;
383 int CapTUs;
384} OffloadDynamicModuleInfo;
385
386static OffloadDynamicModuleInfo *DynamicModules = nullptr;
387static int NumDynamicModules = 0;
388static int CapDynamicModules = 0;
389
390/* -------------------------------------------------------------------------- */
391/* ELF symbol enumeration (manual parse: compiler-rt cannot link LLVM Support)
392 */
393/* -------------------------------------------------------------------------- */
394
395#if __has_include(<elf.h>)
396#include <elf.h>
397
398/* Callback invoked for every matching symbol name found in the ELF image.
399 * Return 0 to continue iteration, non-zero to stop. */
400typedef int (*SymbolCallback)(const char *Name, void *UserData);
401
402/* If Image is a clang offload bundle, return a pointer to the first embedded
403 * ELF. Returns Image if not a bundle, nullptr if a bundle holds no ELF. */
404static const void *unwrapOffloadBundle(const void *Image) {
405 static const char BundleMagic[] = "__CLANG_OFFLOAD_BUNDLE__";
406 if (memcmp(s1: Image, s2: BundleMagic, n: sizeof(BundleMagic) - 1) != 0)
407 return Image; /* Not a bundle, return as-is. */
408
409 const char *Buf = (const char *)Image;
410 uint64_t NumEntries;
411 __builtin_memcpy(&NumEntries, Buf + sizeof(BundleMagic) - 1,
412 sizeof(uint64_t));
413
414 /* Walk the entry table (starts at offset 32). */
415 const char *Cursor = Buf + 32;
416 for (uint64_t I = 0; I < NumEntries; ++I) {
417 uint64_t EntryOffset, EntrySize, IDSize;
418 __builtin_memcpy(&EntryOffset, Cursor, sizeof(EntryOffset));
419 Cursor += sizeof(EntryOffset);
420 __builtin_memcpy(&EntrySize, Cursor, sizeof(EntrySize));
421 Cursor += sizeof(EntrySize);
422 __builtin_memcpy(&IDSize, Cursor, sizeof(IDSize));
423 Cursor += sizeof(IDSize);
424 Cursor += IDSize; /* skip entry ID */
425
426 if (EntrySize >= sizeof(Elf64_Ehdr)) {
427 const Elf64_Ehdr *E = (const Elf64_Ehdr *)(Buf + EntryOffset);
428 if (E->e_ident[EI_MAG0] == ELFMAG0 && E->e_ident[EI_MAG1] == ELFMAG1 &&
429 E->e_ident[EI_MAG2] == ELFMAG2 && E->e_ident[EI_MAG3] == ELFMAG3) {
430 return (const void *)(Buf + EntryOffset);
431 }
432 }
433 }
434
435 PROF_WARN("%s", "offload bundle contains no valid ELF entries\n");
436 return nullptr;
437}
438
439/* Invoke CB for every global symbol in Image (an AMDGPU ELF or offload bundle)
440 * whose name starts with PREFIX. Image may be null. */
441static void enumerateElfSymbols(const void *Image, const char *Prefix,
442 SymbolCallback CB, void *UserData) {
443 if (!Image)
444 return;
445
446 Image = unwrapOffloadBundle(Image);
447 if (!Image)
448 return;
449
450 const Elf64_Ehdr *Ehdr = (const Elf64_Ehdr *)Image;
451 if (Ehdr->e_ident[EI_MAG0] != ELFMAG0 || Ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
452 Ehdr->e_ident[EI_MAG2] != ELFMAG2 || Ehdr->e_ident[EI_MAG3] != ELFMAG3) {
453 if (isVerboseMode())
454 PROF_NOTE("%s", "Image is not a valid ELF, skipping enumeration\n");
455 return;
456 }
457
458 size_t PrefixLen = strlen(s: Prefix);
459 const char *Base = (const char *)Image;
460 const Elf64_Shdr *Shdrs = (const Elf64_Shdr *)(Base + Ehdr->e_shoff);
461
462 for (int i = 0; i < Ehdr->e_shnum; ++i) {
463 if (Shdrs[i].sh_type != SHT_SYMTAB)
464 continue;
465
466 const Elf64_Sym *Syms = (const Elf64_Sym *)(Base + Shdrs[i].sh_offset);
467 int NumSyms = Shdrs[i].sh_size / sizeof(Elf64_Sym);
468 /* String table is the section referenced by sh_link. */
469 const char *StrTab = Base + Shdrs[Shdrs[i].sh_link].sh_offset;
470
471 for (int j = 0; j < NumSyms; ++j) {
472 if (Syms[j].st_name == 0)
473 continue;
474 const char *Name = StrTab + Syms[j].st_name;
475 if (strncmp(s1: Name, s2: Prefix, n: PrefixLen) == 0) {
476 if (CB(Name, UserData))
477 return;
478 }
479 }
480 }
481}
482
483/* State passed through the enumeration callback. */
484typedef struct {
485 void *Module; /* hipModule_t */
486 OffloadDynamicModuleInfo *ModInfo;
487} EnumState;
488
489/* Register one __llvm_profile_sections_<CUID> symbol on the module entry.
490 * hipModuleGetGlobal also registers the device address with CLR so hipMemcpy
491 * can copy from it later. */
492static int registerPrfSymbol(const char *Name, void *UserData) {
493 EnumState *S = (EnumState *)UserData;
494 OffloadDynamicModuleInfo *MI = S->ModInfo;
495
496 /* The symbol is the per-TU sections struct itself, not a pointer
497 * indirection, so this address is the hipMemcpy source. */
498 void *DeviceVar = nullptr;
499 size_t Bytes = 0;
500 if (hipModuleGetGlobal(DevPtr: &DeviceVar, Bytes: &Bytes, Module: S->Module, Name) != 0) {
501 PROF_WARN("failed to get symbol %s for module %p\n", Name, S->Module);
502 return 0; /* continue */
503 }
504
505 if (growArray(Arr: (void **)&MI->TUs, Cap: &MI->CapTUs, MinCount: MI->NumTUs + 1, InitCap: 4,
506 ElemSize: sizeof(*MI->TUs))) {
507 PROF_ERR("%s\n", "failed to grow TU array");
508 return 0;
509 }
510 OffloadDynamicTUInfo *TU = &MI->TUs[MI->NumTUs++];
511 TU->DeviceVar = DeviceVar;
512 TU->Processed = 0;
513
514 (void)Name;
515 return 0; /* continue enumeration */
516}
517
518#endif /* __has_include(<elf.h>) */
519
520/* -------------------------------------------------------------------------- */
521/* Registration / un-registration helpers */
522/* -------------------------------------------------------------------------- */
523
524extern "C" void
525__llvm_profile_offload_register_dynamic_module(int ModuleLoadRc, void **Ptr,
526 const void *Image) {
527 if (ModuleLoadRc)
528 return;
529
530 lockDynamicModules();
531
532 if (isVerboseMode())
533 PROF_NOTE("Registering loaded module %d: rc=%d, module=%p, image=%p\n",
534 NumDynamicModules, ModuleLoadRc, *Ptr, Image);
535
536 if (growArray(Arr: (void **)&DynamicModules, Cap: &CapDynamicModules,
537 MinCount: NumDynamicModules + 1, InitCap: 64, ElemSize: sizeof(*DynamicModules))) {
538 unlockDynamicModules();
539 return;
540 }
541
542 OffloadDynamicModuleInfo *MI = &DynamicModules[NumDynamicModules++];
543 MI->ModulePtr = *Ptr;
544 MI->TUs = nullptr;
545 MI->NumTUs = 0;
546 MI->CapTUs = 0;
547
548 /* Dynamic-module profiling needs ELF parsing for symbol enumeration. */
549#if __has_include(<elf.h>)
550 EnumState State = {.Module: *Ptr, .ModInfo: MI};
551 enumerateElfSymbols(Image, Prefix: "__llvm_profile_sections_", CB: registerPrfSymbol,
552 UserData: &State);
553#else
554 (void)Image;
555 if (isVerboseMode())
556 PROF_NOTE("%s",
557 "Dynamic module profiling not supported on this platform\n");
558#endif
559
560 if (MI->NumTUs == 0) {
561 PROF_WARN("no __llvm_profile_sections_* symbols found in module %p\n",
562 *Ptr);
563 } else if (isVerboseMode()) {
564 PROF_NOTE("Module %p: registered %d TU(s)\n", *Ptr, MI->NumTUs);
565 }
566
567 unlockDynamicModules();
568}
569
570extern "C" void __llvm_profile_offload_unregister_dynamic_module(void *Ptr) {
571 lockDynamicModules();
572 for (int i = 0; i < NumDynamicModules; ++i) {
573 OffloadDynamicModuleInfo *MI = &DynamicModules[i];
574
575 /* HIP recycles hipModule_t addresses; drained slots are cleared so a
576 * recycled handle finds the new slot, not the dead one. */
577 if (MI->ModulePtr != Ptr)
578 continue;
579
580 if (isVerboseMode())
581 PROF_NOTE("Unregistering module %p (%d TUs)\n", MI->ModulePtr,
582 MI->NumTUs);
583
584 static int NextTUIndex = 0;
585 for (int t = 0; t < MI->NumTUs; ++t) {
586 OffloadDynamicTUInfo *TU = &MI->TUs[t];
587 if (TU->Processed) {
588 if (isVerboseMode())
589 PROF_NOTE("Module %p TU %d already processed, skipping\n", Ptr, t);
590 continue;
591 }
592 int TUIndex = __atomic_fetch_add(&NextTUIndex, 1, __ATOMIC_RELAXED);
593 if (TU->DeviceVar) {
594 int CurDev = 0;
595 hipGetDevice(DeviceId: &CurDev);
596 const char *ArchName = getDeviceArchName(DeviceId: CurDev);
597 /* Encode TUIndex in Target so each drain writes a distinct profraw;
598 * otherwise back-to-back drains overwrite the same file. */
599 char TargetWithTU[64];
600 snprintf(s: TargetWithTU, maxlen: sizeof(TargetWithTU), format: "%s.%d", ArchName,
601 TUIndex);
602 if (processDeviceOffloadPrf(DeviceOffloadPrf: TU->DeviceVar, Target: TargetWithTU, Sections: nullptr) == 0)
603 TU->Processed = 1;
604 else
605 PROF_WARN("failed to process profile data for module %p TU %d\n", Ptr,
606 t);
607 }
608 }
609 MI->ModulePtr = nullptr;
610 unlockDynamicModules();
611 return;
612 }
613
614 if (isVerboseMode())
615 PROF_WARN("unregister called for unknown module %p\n", Ptr);
616 unlockDynamicModules();
617}
618
619static void **OffloadShadowVariables = nullptr;
620static int NumShadowVariables = 0;
621static int CapShadowVariables = 0;
622
623struct OffloadSectionShadow {
624 void *Data;
625 void *Counters;
626 void *UniformCounters;
627 void *Names;
628};
629
630struct OffloadSectionShadowGroup {
631 OffloadSectionShadow *Shadows;
632 int NumShadows;
633 int CapShadows;
634 int NumSections;
635};
636
637static OffloadSectionShadowGroup *OffloadSectionShadowGroups = nullptr;
638static int CapSectionShadowGroups = 0;
639
640static int ensureSectionShadowGroupCapacity(void) {
641 return growArray(Arr: (void **)&OffloadSectionShadowGroups,
642 Cap: &CapSectionShadowGroups, MinCount: CapShadowVariables,
643 InitCap: CapShadowVariables, ElemSize: sizeof(*OffloadSectionShadowGroups));
644}
645
646static int ensureSectionShadowCapacity(OffloadSectionShadowGroup *Group,
647 int MinCapacity) {
648 return growArray(Arr: (void **)&Group->Shadows, Cap: &Group->CapShadows, MinCount: MinCapacity, InitCap: 4,
649 ElemSize: sizeof(*Group->Shadows));
650}
651
652extern "C" void __llvm_profile_offload_register_shadow_variable(void *ptr) {
653 if (growArray(Arr: (void **)&OffloadShadowVariables, Cap: &CapShadowVariables,
654 MinCount: NumShadowVariables + 1, InitCap: 64, ElemSize: sizeof(*OffloadShadowVariables)))
655 return;
656 if (ensureSectionShadowGroupCapacity())
657 return;
658 int Index = NumShadowVariables++;
659 OffloadShadowVariables[Index] = ptr;
660 __builtin_memset(&OffloadSectionShadowGroups[Index], 0,
661 sizeof(OffloadSectionShadowGroups[Index]));
662}
663
664extern "C" void
665__llvm_profile_offload_register_section_shadow_variable(void *ptr) {
666 if (NumShadowVariables == 0)
667 return;
668
669 /* Match CGCUDANV.cpp: data, counters, uniform counters, then names for each
670 * kernel. */
671 OffloadSectionShadowGroup *Group =
672 &OffloadSectionShadowGroups[NumShadowVariables - 1];
673 int ShadowIndex = Group->NumSections / 4;
674 if (ensureSectionShadowCapacity(Group, MinCapacity: ShadowIndex + 1))
675 return;
676 if (ShadowIndex >= Group->NumShadows)
677 Group->NumShadows = ShadowIndex + 1;
678
679 OffloadSectionShadow *Shadow = &Group->Shadows[ShadowIndex];
680 switch (Group->NumSections % 4) {
681 case 0:
682 Shadow->Data = ptr;
683 break;
684 case 1:
685 Shadow->Counters = ptr;
686 break;
687 case 2:
688 Shadow->UniformCounters = ptr;
689 break;
690 case 3:
691 Shadow->Names = ptr;
692 break;
693 }
694 ++Group->NumSections;
695}
696
697namespace {
698
699struct ProfileSectionCopy {
700 const char *Name;
701 const void *DevBegin;
702 size_t Size;
703 const void *&CachedDevBegin;
704 char *&CachedHost;
705 size_t &CachedSize;
706 UniqueFree Owner;
707 char *HostBegin = nullptr;
708 bool Reused = false;
709
710 ProfileSectionCopy(const char *Name, const void *DevBegin, size_t Size,
711 const void *&CachedDevBegin, char *&CachedHost,
712 size_t &CachedSize)
713 : Name(Name), DevBegin(DevBegin), Size(Size),
714 CachedDevBegin(CachedDevBegin), CachedHost(CachedHost),
715 CachedSize(CachedSize) {}
716
717 ProfileSectionCopy(const ProfileSectionCopy &) = delete;
718 ProfileSectionCopy &operator=(const ProfileSectionCopy &) = delete;
719
720 int prepare() {
721 if (Size == 0)
722 return 0;
723 if (DevBegin == CachedDevBegin && Size == CachedSize) {
724 HostBegin = CachedHost;
725 Reused = true;
726 if (isVerboseMode())
727 PROF_NOTE("Reusing cached %s section (%zu bytes)\n", Name, Size);
728 } else {
729 HostBegin = static_cast<char *>(malloc(size: Size));
730 Owner.reset(P: HostBegin);
731 }
732 return HostBegin ? 0 : -1;
733 }
734
735 int copy() {
736 if (Size == 0 || Reused)
737 return 0;
738 return memcpyDeviceToHost(Dst: HostBegin, Src: DevBegin, Size);
739 }
740
741 void commitCache() {
742 if (Reused || Size == 0)
743 return;
744 CachedDevBegin = DevBegin;
745 CachedHost = HostBegin;
746 CachedSize = Size;
747 Owner.release();
748 }
749};
750
751} // namespace
752
753static int getRegisteredSectionBounds(void *Shadow, void **DevicePtr,
754 size_t *Size) {
755 *DevicePtr = nullptr;
756 *Size = 0;
757 int AddrRc = hipGetSymbolAddress(devPtr: DevicePtr, symbol: Shadow);
758 int SizeRc = hipGetSymbolSize(size: Size, symbol: Shadow);
759 return AddrRc == 0 && SizeRc == 0 && *DevicePtr && *Size > 0 ? 0 : -1;
760}
761
762struct RegisteredSectionRange {
763 const void *Data;
764 const void *Counters;
765 const void *UniformCounters;
766 const void *Names;
767 size_t DataSize;
768 size_t CountersSize;
769 size_t UniformCountersSize;
770 size_t NamesSize;
771 size_t DataOffset;
772 size_t CountersOffset;
773 size_t UniformCountersOffset;
774 size_t NamesOffset;
775};
776
777static int
778hasCompleteSectionShadows(const OffloadSectionShadowGroup *Sections) {
779 if (!Sections || Sections->NumShadows == 0 || Sections->NumSections % 4 != 0)
780 return 0;
781 for (int I = 0; I < Sections->NumShadows; ++I) {
782 if (!Sections->Shadows[I].Data || !Sections->Shadows[I].Counters ||
783 !Sections->Shadows[I].UniformCounters || !Sections->Shadows[I].Names)
784 return 0;
785 }
786 return 1;
787}
788
789int __prof_rocm::processDeviceOffloadPrf(
790 void *DeviceOffloadPrf, const char *Target,
791 const OffloadSectionShadowGroup *Sections) {
792 __llvm_profile_gpu_sections HostSections;
793
794 if (hipMemcpy(dest: &HostSections, src: DeviceOffloadPrf, len: sizeof(HostSections),
795 kind: 2 /*DToH*/) != 0) {
796 PROF_ERR("%s\n", "failed to copy offload prf structure from device");
797 return -1;
798 }
799
800 const void *DevCntsBegin = HostSections.CountersStart;
801 const void *DevDataBegin = HostSections.DataStart;
802 const void *DevNamesBegin = HostSections.NamesStart;
803 const void *DevUniformCntsBegin = HostSections.UniformCountersStart;
804 const void *DevCntsEnd = HostSections.CountersStop;
805 const void *DevDataEnd = HostSections.DataStop;
806 const void *DevNamesEnd = HostSections.NamesStop;
807 const void *DevUniformCntsEnd = HostSections.UniformCountersStop;
808
809 size_t CountersSize = (const char *)DevCntsEnd - (const char *)DevCntsBegin;
810 size_t DataSize = (const char *)DevDataEnd - (const char *)DevDataBegin;
811 size_t NamesSize = (const char *)DevNamesEnd - (const char *)DevNamesBegin;
812 size_t UniformCountersSize =
813 (const char *)DevUniformCntsEnd - (const char *)DevUniformCntsBegin;
814
815 int UseRegisteredSections = hasCompleteSectionShadows(Sections);
816 RegisteredSectionRange *RegisteredRanges = nullptr;
817 int NumRegisteredRanges = 0;
818
819 if (isVerboseMode())
820 PROF_NOTE("Section pointers: Cnts=[%p,%p]=%zu Data=[%p,%p]=%zu "
821 "Names=[%p,%p]=%zu UCnts=[%p,%p]=%zu\n",
822 DevCntsBegin, DevCntsEnd, CountersSize, DevDataBegin, DevDataEnd,
823 DataSize, DevNamesBegin, DevNamesEnd, NamesSize,
824 DevUniformCntsBegin, DevUniformCntsEnd, UniformCountersSize);
825
826 if (CountersSize == 0 || DataSize == 0)
827 return 0;
828
829 int ret = -1;
830
831 /* Sections using linker-defined __start_/__stop_ bounds are shared across
832 TU structs in RDC mode. Deduplicate by caching the last copied range. */
833 static const void *CachedDevNamesBegin = nullptr;
834 static char *CachedHostNames = nullptr;
835 static size_t CachedNamesSize = 0;
836
837 static const void *CachedDevCntsBegin = nullptr;
838 static char *CachedHostCnts = nullptr;
839 static size_t CachedCntsSize = 0;
840
841 static const void *CachedDevDataBegin = nullptr;
842 static char *CachedHostData = nullptr;
843 static size_t CachedDataSize = 0;
844
845 static const void *CachedDevUCntsBegin = nullptr;
846 static char *CachedHostUCnts = nullptr;
847 static size_t CachedUCntsSize = 0;
848
849 ProfileSectionCopy Cnts("counters", DevCntsBegin, CountersSize,
850 CachedDevCntsBegin, CachedHostCnts, CachedCntsSize);
851 ProfileSectionCopy Data("data", DevDataBegin, DataSize, CachedDevDataBegin,
852 CachedHostData, CachedDataSize);
853 ProfileSectionCopy Names("names", DevNamesBegin, NamesSize,
854 CachedDevNamesBegin, CachedHostNames,
855 CachedNamesSize);
856 ProfileSectionCopy UCnts("ucnts", DevUniformCntsBegin, UniformCountersSize,
857 CachedDevUCntsBegin, CachedHostUCnts,
858 CachedUCntsSize);
859
860 UniqueFree RegisteredRangeOwner;
861
862 if (UseRegisteredSections) {
863 NumRegisteredRanges = Sections->NumShadows;
864 RegisteredRangeOwner.reset(
865 P: malloc(size: NumRegisteredRanges * sizeof(RegisteredSectionRange)));
866 RegisteredRanges = (RegisteredSectionRange *)RegisteredRangeOwner.get();
867 if (!RegisteredRanges) {
868 PROF_ERR("%s\n", "failed to allocate registered section table");
869 return -1;
870 }
871 __builtin_memset(RegisteredRanges, 0,
872 NumRegisteredRanges * sizeof(*RegisteredRanges));
873
874 size_t RegisteredDataSize = 0;
875 size_t RegisteredCountersSize = 0;
876 size_t RegisteredUniformCountersSize = 0;
877 size_t RegisteredNamesSize = 0;
878 for (int I = 0; I < NumRegisteredRanges; ++I) {
879 void *Data = nullptr;
880 void *Counters = nullptr;
881 void *UniformCounters = nullptr;
882 void *Names = nullptr;
883 size_t ThisDataSize = 0;
884 size_t ThisCountersSize = 0;
885 size_t ThisUniformCountersSize = 0;
886 size_t ThisNamesSize = 0;
887 OffloadSectionShadow *Shadow = &Sections->Shadows[I];
888 if (getRegisteredSectionBounds(Shadow: Shadow->Data, DevicePtr: &Data, Size: &ThisDataSize) != 0 ||
889 getRegisteredSectionBounds(Shadow: Shadow->Counters, DevicePtr: &Counters,
890 Size: &ThisCountersSize) != 0 ||
891 getRegisteredSectionBounds(Shadow: Shadow->UniformCounters, DevicePtr: &UniformCounters,
892 Size: &ThisUniformCountersSize) != 0 ||
893 getRegisteredSectionBounds(Shadow: Shadow->Names, DevicePtr: &Names, Size: &ThisNamesSize) !=
894 0) {
895 PROF_ERR("%s\n", "failed to get registered section bounds");
896 return -1;
897 }
898
899 RegisteredRanges[I].Data = Data;
900 RegisteredRanges[I].Counters = Counters;
901 RegisteredRanges[I].UniformCounters = UniformCounters;
902 RegisteredRanges[I].Names = Names;
903 RegisteredRanges[I].DataSize = ThisDataSize;
904 RegisteredRanges[I].CountersSize = ThisCountersSize;
905 RegisteredRanges[I].UniformCountersSize = ThisUniformCountersSize;
906 RegisteredRanges[I].NamesSize = ThisNamesSize;
907 RegisteredRanges[I].DataOffset = RegisteredDataSize;
908 RegisteredRanges[I].CountersOffset = RegisteredCountersSize;
909 RegisteredRanges[I].UniformCountersOffset = RegisteredUniformCountersSize;
910 RegisteredDataSize += ThisDataSize;
911 RegisteredCountersSize += ThisCountersSize;
912 RegisteredUniformCountersSize += ThisUniformCountersSize;
913
914 int ReuseNames = 0;
915 for (int J = 0; J < I; ++J) {
916 if (RegisteredRanges[J].Names == Names &&
917 RegisteredRanges[J].NamesSize == ThisNamesSize) {
918 RegisteredRanges[I].NamesOffset = RegisteredRanges[J].NamesOffset;
919 ReuseNames = 1;
920 break;
921 }
922 }
923 if (!ReuseNames) {
924 RegisteredRanges[I].NamesOffset = RegisteredNamesSize;
925 RegisteredNamesSize += ThisNamesSize;
926 }
927 }
928
929 DataSize = RegisteredDataSize;
930 CountersSize = RegisteredCountersSize;
931 UniformCountersSize = RegisteredUniformCountersSize;
932 NamesSize = RegisteredNamesSize;
933 Data.HostBegin = DataSize ? (char *)malloc(size: DataSize) : nullptr;
934 Cnts.HostBegin = CountersSize ? (char *)malloc(size: CountersSize) : nullptr;
935 UCnts.HostBegin =
936 UniformCountersSize ? (char *)malloc(size: UniformCountersSize) : nullptr;
937 Names.HostBegin = NamesSize ? (char *)malloc(size: NamesSize) : nullptr;
938 Data.Owner.reset(P: Data.HostBegin);
939 Cnts.Owner.reset(P: Cnts.HostBegin);
940 UCnts.Owner.reset(P: UCnts.HostBegin);
941 Names.Owner.reset(P: Names.HostBegin);
942 if ((DataSize > 0 && !Data.HostBegin) ||
943 (CountersSize > 0 && !Cnts.HostBegin) ||
944 (UniformCountersSize > 0 && !UCnts.HostBegin) ||
945 (NamesSize > 0 && !Names.HostBegin)) {
946 PROF_ERR("%s\n", "failed to allocate host memory for device sections");
947 return -1;
948 }
949
950 for (int I = 0; I < NumRegisteredRanges; ++I) {
951 RegisteredSectionRange *R = &RegisteredRanges[I];
952 if (memcpyDeviceToHost(Dst: Data.HostBegin + R->DataOffset, Src: R->Data,
953 Size: R->DataSize) != 0 ||
954 memcpyDeviceToHost(Dst: Cnts.HostBegin + R->CountersOffset, Src: R->Counters,
955 Size: R->CountersSize) != 0 ||
956 memcpyDeviceToHost(Dst: UCnts.HostBegin + R->UniformCountersOffset,
957 Src: R->UniformCounters, Size: R->UniformCountersSize) != 0) {
958 PROF_ERR("%s\n", "failed to copy profile sections from device");
959 return -1;
960 }
961
962 int CopyNames = 1;
963 for (int J = 0; J < I; ++J) {
964 if (RegisteredRanges[J].Names == R->Names &&
965 RegisteredRanges[J].NamesSize == R->NamesSize) {
966 CopyNames = 0;
967 break;
968 }
969 }
970 if (CopyNames && R->NamesSize > 0 &&
971 memcpyDeviceToHost(Dst: Names.HostBegin + R->NamesOffset, Src: R->Names,
972 Size: R->NamesSize) != 0) {
973 PROF_ERR("%s\n", "failed to copy profile sections from device");
974 return -1;
975 }
976 }
977 } else {
978 if (Cnts.prepare() != 0 || Data.prepare() != 0 || Names.prepare() != 0 ||
979 UCnts.prepare() != 0) {
980 PROF_ERR("%s\n", "failed to allocate host memory for device sections");
981 return -1;
982 }
983
984 if (Data.copy() != 0 || Cnts.copy() != 0 || Names.copy() != 0 ||
985 UCnts.copy() != 0) {
986 PROF_ERR("%s\n", "failed to copy profile sections from device");
987 return -1;
988 }
989
990 /* Cache buffers so RDC-mode multi-shadow drains can reuse them.
991 * release() prevents the scope guards from freeing what the cache owns. */
992 Cnts.commitCache();
993 Data.commitCache();
994 Names.commitCache();
995 UCnts.commitCache();
996 }
997
998 if (isVerboseMode())
999 PROF_NOTE("Copied device sections: Counters=%zu, Data=%zu, Names=%zu, "
1000 "UniformCounters=%zu\n",
1001 CountersSize, DataSize, NamesSize, UniformCountersSize);
1002
1003 // Arrange buffer as [Data][Padding][Counters][Names] to match the layout
1004 // expected by lprofWriteDataImpl (CountersDelta = CountersBegin - DataBegin).
1005 const uint64_t NumData = DataSize / sizeof(__llvm_profile_data);
1006 const uint64_t NumBitmapBytes = 0;
1007 const uint64_t NumUniformCounters = UniformCountersSize / sizeof(uint64_t);
1008 const uint64_t VTableSectionSize = 0;
1009 const uint64_t VNamesSize = 0;
1010 uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters,
1011 PaddingBytesAfterBitmapBytes, PaddingBytesAfterUniformCounters,
1012 PaddingBytesAfterNames, PaddingBytesAfterVTable, PaddingBytesAfterVNames;
1013
1014 if (__llvm_profile_get_padding_sizes_for_counters(
1015 DataSize, CountersSize, NumBitmapBytes, NumUniformCounters, NamesSize,
1016 VTableSize: VTableSectionSize, VNameSize: VNamesSize, PaddingBytesBeforeCounters: &PaddingBytesBeforeCounters,
1017 PaddingBytesAfterCounters: &PaddingBytesAfterCounters, PaddingBytesAfterBitmap: &PaddingBytesAfterBitmapBytes,
1018 PaddingBytesAfterUniformCounters: &PaddingBytesAfterUniformCounters, PaddingBytesAfterNames: &PaddingBytesAfterNames,
1019 PaddingBytesAfterVTable: &PaddingBytesAfterVTable, PaddingBytesAfterVNames: &PaddingBytesAfterVNames) != 0) {
1020 PROF_ERR("%s\n", "failed to get padding sizes");
1021 return -1;
1022 }
1023
1024 size_t ContiguousBufferSize =
1025 DataSize + PaddingBytesBeforeCounters + CountersSize + NamesSize;
1026 UniqueFree ContiguousBuf(malloc(size: ContiguousBufferSize));
1027 if (!ContiguousBuf.get()) {
1028 PROF_ERR("%s\n", "failed to allocate contiguous buffer");
1029 return -1;
1030 }
1031 char *ContiguousBuffer = ContiguousBuf.get();
1032 __builtin_memset(ContiguousBuffer, 0, ContiguousBufferSize);
1033
1034 char *BufDataBegin = ContiguousBuffer;
1035 char *BufCountersBegin =
1036 ContiguousBuffer + DataSize + PaddingBytesBeforeCounters;
1037 char *BufNamesBegin = BufCountersBegin + CountersSize;
1038
1039 __builtin_memcpy(BufDataBegin, Data.HostBegin, DataSize);
1040 __builtin_memcpy(BufCountersBegin, Cnts.HostBegin, CountersSize);
1041 __builtin_memcpy(BufNamesBegin, Names.HostBegin, NamesSize);
1042
1043 // CounterPtr and UniformCounterPtr are device-relative offsets; relocate
1044 // them for the file layout where the Data section precedes the Counters and
1045 // UniformCounters sections. Uniform counters are copied in linker (section)
1046 // order and located via their relative pointer, exactly like the regular
1047 // counters: llvm-profdata reads them through UniformCounterPtr (decrementing
1048 // UniformCountersDelta per record, just like CountersDelta) and does not
1049 // assume data-record order, so no reordering is needed.
1050 ptrdiff_t UCFileOffset = DataSize + PaddingBytesBeforeCounters +
1051 CountersSize + PaddingBytesAfterCounters +
1052 NumBitmapBytes + PaddingBytesAfterBitmapBytes;
1053 __llvm_profile_data *RelocatedData = (__llvm_profile_data *)BufDataBegin;
1054 for (uint64_t i = 0; i < NumData; ++i) {
1055 size_t DataRecordOffset = i * sizeof(__llvm_profile_data);
1056 const char *RangeDevDataBegin = (const char *)DevDataBegin;
1057 const char *RangeDevCountersBegin = (const char *)DevCntsBegin;
1058 const char *RangeDevUCntsBegin = (const char *)DevUniformCntsBegin;
1059 size_t RangeCountersOffset = 0;
1060 size_t RangeUCntsOffset = 0;
1061 if (UseRegisteredSections) {
1062 int FoundRange = 0;
1063 for (int R = 0; R < NumRegisteredRanges; ++R) {
1064 RegisteredSectionRange *Range = &RegisteredRanges[R];
1065 if (DataRecordOffset < Range->DataOffset ||
1066 DataRecordOffset >= Range->DataOffset + Range->DataSize)
1067 continue;
1068 RangeDevDataBegin = (const char *)Range->Data;
1069 RangeDevCountersBegin = (const char *)Range->Counters;
1070 RangeDevUCntsBegin = (const char *)Range->UniformCounters;
1071 RangeCountersOffset = Range->CountersOffset;
1072 RangeUCntsOffset = Range->UniformCountersOffset;
1073 DataRecordOffset -= Range->DataOffset;
1074 FoundRange = 1;
1075 break;
1076 }
1077 if (!FoundRange) {
1078 PROF_ERR("%s\n", "failed to locate profile data record range");
1079 return -1;
1080 }
1081 }
1082 const char *DeviceDataStructAddr = RangeDevDataBegin + DataRecordOffset;
1083 if (RelocatedData[i].CounterPtr) {
1084 const char *DeviceCountersAddr =
1085 DeviceDataStructAddr + (ptrdiff_t)RelocatedData[i].CounterPtr;
1086 ptrdiff_t OffsetIntoCountersSection =
1087 DeviceCountersAddr - RangeDevCountersBegin;
1088 ptrdiff_t NewRelativeOffset =
1089 DataSize + PaddingBytesBeforeCounters + RangeCountersOffset +
1090 OffsetIntoCountersSection - (i * sizeof(__llvm_profile_data));
1091 __builtin_memcpy((char *)RelocatedData + i * sizeof(__llvm_profile_data) +
1092 offsetof(__llvm_profile_data, CounterPtr),
1093 &NewRelativeOffset, sizeof(NewRelativeOffset));
1094 }
1095 if (UCnts.HostBegin && RelocatedData[i].UniformCounterPtr) {
1096 const char *DeviceUCAddr =
1097 DeviceDataStructAddr + (ptrdiff_t)RelocatedData[i].UniformCounterPtr;
1098 ptrdiff_t OffsetIntoUCSection = DeviceUCAddr - RangeDevUCntsBegin;
1099 ptrdiff_t NewUCRelativeOffset = UCFileOffset + RangeUCntsOffset +
1100 OffsetIntoUCSection -
1101 (i * sizeof(__llvm_profile_data));
1102 __builtin_memcpy((char *)RelocatedData + i * sizeof(__llvm_profile_data) +
1103 offsetof(__llvm_profile_data, UniformCounterPtr),
1104 &NewUCRelativeOffset, sizeof(NewUCRelativeOffset));
1105 } else {
1106 __builtin_memset((char *)RelocatedData + i * sizeof(__llvm_profile_data) +
1107 offsetof(__llvm_profile_data, UniformCounterPtr),
1108 0, sizeof(RelocatedData[i].UniformCounterPtr));
1109 }
1110 __builtin_memset((char *)RelocatedData + i * sizeof(__llvm_profile_data) +
1111 offsetof(__llvm_profile_data, BitmapPtr),
1112 0,
1113 sizeof(RelocatedData[i].BitmapPtr) +
1114 sizeof(RelocatedData[i].FunctionPointer) +
1115 sizeof(RelocatedData[i].Values));
1116 }
1117
1118 ret = __llvm_write_custom_profile(
1119 Target, DataBegin: (__llvm_profile_data *)BufDataBegin,
1120 DataEnd: (__llvm_profile_data *)(BufDataBegin + DataSize), CountersBegin: BufCountersBegin,
1121 CountersEnd: BufCountersBegin + CountersSize, UniformCountersBegin: UCnts.HostBegin,
1122 UniformCountersEnd: UCnts.HostBegin ? UCnts.HostBegin + UniformCountersSize : nullptr,
1123 NamesBegin: BufNamesBegin, NamesEnd: BufNamesBegin + NamesSize, VersionOverride: nullptr);
1124
1125 if (ret != 0) {
1126 PROF_ERR("%s\n", "failed to write device profile using shared API");
1127 } else {
1128#if defined(__linux__) && !defined(_WIN32)
1129 // Dedup against the supplemental HSA pass: this section is now drained, so
1130 // the HSA walk must not drain the same device code object again.
1131 profRecordDrainedBounds(Data: DevDataBegin, Counters: DevCntsBegin, Names: DevNamesBegin);
1132#endif
1133 if (isVerboseMode())
1134 PROF_NOTE("%s\n", "Successfully wrote device profile using shared API");
1135 }
1136
1137 return ret;
1138}
1139
1140static int processShadowVariable(int Index, const char *Target) {
1141 void *ShadowVar = OffloadShadowVariables[Index];
1142 void *DeviceSections = nullptr;
1143 if (hipGetSymbolAddress(devPtr: &DeviceSections, symbol: ShadowVar) != 0) {
1144 PROF_WARN("failed to get symbol address for shadow variable %p\n",
1145 ShadowVar);
1146 return -1;
1147 }
1148 /* DeviceSections points at the per-TU sections struct itself. */
1149 const OffloadSectionShadowGroup *Sections = nullptr;
1150 if (Index < CapSectionShadowGroups)
1151 Sections = &OffloadSectionShadowGroups[Index];
1152 if (!hasCompleteSectionShadows(Sections))
1153 return 0;
1154 return processDeviceOffloadPrf(DeviceOffloadPrf: DeviceSections, Target, Sections);
1155}
1156
1157static int isHipAvailable(void) {
1158 ensureHipLoaded();
1159 return pHipMemcpy != nullptr && pHipGetSymbolAddress != nullptr;
1160}
1161
1162/* -------------------------------------------------------------------------- */
1163/* Collect device-side profile data */
1164/* -------------------------------------------------------------------------- */
1165
1166/* Host-shadow drain: static-linked kernels (host __hipRegisterVar shadows) and
1167 * intercepted dynamic modules. The caller gates this on
1168 * (NumShadowVariables || NumDynamicModules) && isHipAvailable(); pure
1169 * device-linked programs (RCCL) are handled by the supplemental HSA pass. */
1170static int collectHostShadowData(void) {
1171 int Ret = 0;
1172
1173 /* Shadow variables (static-linked kernels): drain from every device. */
1174 if (NumShadowVariables > 0) {
1175 int OrigDevice = -1;
1176 hipGetDevice(DeviceId: &OrigDevice);
1177
1178 for (int Dev = 0; Dev < NumDevices; ++Dev) {
1179 if (!shouldCollectDevice(DeviceId: Dev)) {
1180 if (isVerboseMode())
1181 PROF_NOTE("Skipping unused device %d\n", Dev);
1182 continue;
1183 }
1184#if defined(__linux__) && !defined(_WIN32)
1185 /* When no kernel launch was tracked at all, shouldCollectDevice() falls
1186 * back to collect-all, which can fault/hang reading a non-resident
1187 * device's sections on a multi-GPU host. On Linux the supplemental HSA
1188 * drain covers those cases safely. */
1189 if (!__atomic_load_n(&AnyDeviceUsed, __ATOMIC_ACQUIRE)) {
1190 if (isVerboseMode())
1191 PROF_NOTE("No tracked launch; deferring device %d to HSA drain\n",
1192 Dev);
1193 continue;
1194 }
1195#endif
1196 if (hipSetDevice(DeviceId: Dev) != 0) {
1197 if (isVerboseMode())
1198 PROF_NOTE("Failed to set device %d, skipping\n", Dev);
1199 continue;
1200 }
1201 const char *ArchName = getDeviceArchName(DeviceId: Dev);
1202 if (isVerboseMode())
1203 PROF_NOTE("Collecting static profile data from device %d (%s)\n", Dev,
1204 ArchName);
1205 for (int i = 0; i < NumShadowVariables; ++i) {
1206 /* Stable name per shadow so a repeated drain (explicit collect plus the
1207 * atexit drain) overwrites its own profraw rather than emitting a
1208 * second one: bare arch for a single TU, arch.<i> for RDC multi-TU. */
1209 const char *Target = ArchName;
1210 char TargetWithIdx[64];
1211 if (NumShadowVariables > 1) {
1212 snprintf(s: TargetWithIdx, maxlen: sizeof(TargetWithIdx), format: "%s.%d", ArchName, i);
1213 Target = TargetWithIdx;
1214 }
1215 if (processShadowVariable(Index: i, Target) != 0)
1216 Ret = -1;
1217 }
1218 }
1219
1220 if (OrigDevice >= 0)
1221 hipSetDevice(DeviceId: OrigDevice);
1222 }
1223
1224 /* Warn about unprocessed TUs; skip cleared slots (already drained). */
1225 lockDynamicModules();
1226 for (int i = 0; i < NumDynamicModules; ++i) {
1227 OffloadDynamicModuleInfo *MI = &DynamicModules[i];
1228 if (!MI->ModulePtr)
1229 continue;
1230 for (int t = 0; t < MI->NumTUs; ++t) {
1231 if (!MI->TUs[t].Processed) {
1232 PROF_WARN("dynamic module %p TU %d was not processed before exit\n",
1233 MI->ModulePtr, t);
1234 Ret = -1;
1235 }
1236 }
1237 }
1238 unlockDynamicModules();
1239
1240 return Ret;
1241}
1242
1243extern "C" int __llvm_profile_hip_collect_device_data(void) {
1244 int Ret = 0;
1245
1246 if ((NumShadowVariables != 0 || NumDynamicModules != 0) && isHipAvailable() &&
1247 collectHostShadowData() != 0)
1248 Ret = -1;
1249
1250#if defined(__linux__) && !defined(_WIN32)
1251 /* Supplemental HSA-introspection drain */
1252 if (drainDevicesViaHsa() != 0)
1253 Ret = -1;
1254#endif
1255
1256 if (Ret != 0)
1257 PROF_WARN("%s\n", "failed to collect device profile data");
1258 return Ret;
1259}
1260
1261/* Linux HIP interceptors. */
1262
1263#if defined(__linux__) && !defined(_WIN32)
1264
1265typedef struct {
1266 unsigned int x;
1267 unsigned int y;
1268 unsigned int z;
1269} HipDim3;
1270
1271typedef struct {
1272 void *Func;
1273 HipDim3 GridDim;
1274 HipDim3 BlockDim;
1275 void **Args;
1276 size_t SharedMem;
1277 HipStream Stream;
1278} HipLaunchParams;
1279
1280typedef struct {
1281 HipDim3 GridDim;
1282 HipDim3 BlockDim;
1283 size_t DynamicSmemBytes;
1284 HipStream Stream;
1285 void *Attrs;
1286 unsigned NumAttrs;
1287} HipLaunchConfig;
1288
1289typedef void *HipFunction;
1290typedef void *HipEvent;
1291typedef void *HipGraphExec;
1292
1293static int recordHipLaunchResult(int Rc, HipStream Stream) {
1294 if (Rc == 0)
1295 markLaunchStreamDeviceUsed(Stream);
1296 return Rc;
1297}
1298
1299static int recordHipMultiDeviceLaunchResult(int Rc,
1300 HipLaunchParams *LaunchParams,
1301 int NumLaunches) {
1302 if (Rc != 0 || !LaunchParams || NumLaunches <= 0)
1303 return Rc;
1304 for (int I = 0; I < NumLaunches; ++I)
1305 markLaunchStreamDeviceUsed(Stream: LaunchParams[I].Stream);
1306 return Rc;
1307}
1308
1309// interceptors must have external linkage
1310// NOLINTBEGIN(misc-use-internal-linkage)
1311INTERCEPTOR(int, hipLaunchKernel, const void *Function, HipDim3 GridDim,
1312 HipDim3 BlockDim, void **Args, size_t SharedMemBytes,
1313 HipStream Stream) {
1314 return recordHipLaunchResult(REAL(hipLaunchKernel)(Function, GridDim,
1315 BlockDim, Args,
1316 SharedMemBytes, Stream),
1317 Stream);
1318}
1319
1320INTERCEPTOR(int, hipLaunchKernel_spt, const void *Function, HipDim3 GridDim,
1321 HipDim3 BlockDim, void **Args, size_t SharedMemBytes,
1322 HipStream Stream) {
1323 return recordHipLaunchResult(
1324 REAL(hipLaunchKernel_spt)(Function, GridDim, BlockDim, Args,
1325 SharedMemBytes, Stream),
1326 Stream);
1327}
1328
1329INTERCEPTOR(int, hipExtLaunchKernel, const void *Function, HipDim3 GridDim,
1330 HipDim3 BlockDim, void **Args, size_t SharedMemBytes,
1331 HipStream Stream, HipEvent StartEvent, HipEvent StopEvent,
1332 int Flags) {
1333 return recordHipLaunchResult(
1334 REAL(hipExtLaunchKernel)(Function, GridDim, BlockDim, Args,
1335 SharedMemBytes, Stream, StartEvent, StopEvent,
1336 Flags),
1337 Stream);
1338}
1339
1340INTERCEPTOR(int, hipLaunchKernelExC, const HipLaunchConfig *Config,
1341 const void *Function, void **Args) {
1342 int Rc = REAL(hipLaunchKernelExC)(Config, Function, Args);
1343 return recordHipLaunchResult(Rc, Stream: Config ? Config->Stream : nullptr);
1344}
1345
1346INTERCEPTOR(int, hipLaunchCooperativeKernel, const void *Function,
1347 HipDim3 GridDim, HipDim3 BlockDim, void **KernelParams,
1348 unsigned SharedMemBytes, HipStream Stream) {
1349 return recordHipLaunchResult(
1350 REAL(hipLaunchCooperativeKernel)(Function, GridDim, BlockDim,
1351 KernelParams, SharedMemBytes, Stream),
1352 Stream);
1353}
1354
1355INTERCEPTOR(int, hipLaunchCooperativeKernel_spt, const void *Function,
1356 HipDim3 GridDim, HipDim3 BlockDim, void **KernelParams,
1357 unsigned SharedMemBytes, HipStream Stream) {
1358 return recordHipLaunchResult(
1359 REAL(hipLaunchCooperativeKernel_spt)(
1360 Function, GridDim, BlockDim, KernelParams, SharedMemBytes, Stream),
1361 Stream);
1362}
1363
1364INTERCEPTOR(int, hipLaunchCooperativeKernelMultiDevice,
1365 HipLaunchParams *LaunchParams, int NumDevices, unsigned Flags) {
1366 return recordHipMultiDeviceLaunchResult(
1367 REAL(hipLaunchCooperativeKernelMultiDevice)(LaunchParams, NumDevices,
1368 Flags),
1369 LaunchParams, NumLaunches: NumDevices);
1370}
1371
1372INTERCEPTOR(int, hipExtLaunchMultiKernelMultiDevice,
1373 HipLaunchParams *LaunchParams, int NumDevices, unsigned Flags) {
1374 return recordHipMultiDeviceLaunchResult(
1375 REAL(hipExtLaunchMultiKernelMultiDevice)(LaunchParams, NumDevices, Flags),
1376 LaunchParams, NumLaunches: NumDevices);
1377}
1378
1379INTERCEPTOR(int, hipModuleLaunchKernel, HipFunction Function, unsigned GridDimX,
1380 unsigned GridDimY, unsigned GridDimZ, unsigned BlockDimX,
1381 unsigned BlockDimY, unsigned BlockDimZ, unsigned SharedMemBytes,
1382 HipStream Stream, void **KernelParams, void **Extra) {
1383 return recordHipLaunchResult(
1384 REAL(hipModuleLaunchKernel)(Function, GridDimX, GridDimY, GridDimZ,
1385 BlockDimX, BlockDimY, BlockDimZ,
1386 SharedMemBytes, Stream, KernelParams, Extra),
1387 Stream);
1388}
1389
1390INTERCEPTOR(int, hipExtModuleLaunchKernel, HipFunction Function,
1391 unsigned GridDimX, unsigned GridDimY, unsigned GridDimZ,
1392 unsigned BlockDimX, unsigned BlockDimY, unsigned BlockDimZ,
1393 size_t SharedMemBytes, HipStream Stream, void **KernelParams,
1394 void **Extra, HipEvent StartEvent, HipEvent StopEvent,
1395 unsigned Flags) {
1396 return recordHipLaunchResult(
1397 REAL(hipExtModuleLaunchKernel)(Function, GridDimX, GridDimY, GridDimZ,
1398 BlockDimX, BlockDimY, BlockDimZ,
1399 SharedMemBytes, Stream, KernelParams,
1400 Extra, StartEvent, StopEvent, Flags),
1401 Stream);
1402}
1403
1404INTERCEPTOR(int, hipGraphLaunch, HipGraphExec GraphExec, HipStream Stream) {
1405 return recordHipLaunchResult(REAL(hipGraphLaunch)(GraphExec, Stream), Stream);
1406}
1407
1408INTERCEPTOR(int, hipGraphLaunch_spt, HipGraphExec GraphExec, HipStream Stream) {
1409 return recordHipLaunchResult(REAL(hipGraphLaunch_spt)(GraphExec, Stream),
1410 Stream);
1411}
1412
1413INTERCEPTOR(int, hipModuleLoad, void **module, const char *fname) {
1414 int rc = REAL(hipModuleLoad)(module, fname);
1415 /* Pass NULL image: no in-memory ELF is available for filename loads,
1416 * so the register hook skips symbol enumeration. */
1417 __llvm_profile_offload_register_dynamic_module(ModuleLoadRc: rc, Ptr: module, Image: nullptr);
1418 return rc;
1419}
1420
1421INTERCEPTOR(int, hipModuleLoadData, void **module, const void *image) {
1422 int rc = REAL(hipModuleLoadData)(module, image);
1423 __llvm_profile_offload_register_dynamic_module(ModuleLoadRc: rc, Ptr: module, Image: image);
1424 return rc;
1425}
1426
1427INTERCEPTOR(int, hipModuleLoadDataEx, void **module, const void *image,
1428 unsigned numOptions, void **options, void **optionValues) {
1429 int rc = REAL(hipModuleLoadDataEx)(module, image, numOptions, options,
1430 optionValues);
1431 __llvm_profile_offload_register_dynamic_module(ModuleLoadRc: rc, Ptr: module, Image: image);
1432 return rc;
1433}
1434
1435INTERCEPTOR(int, hipModuleUnload, void *module) {
1436 /* Drain counters before the module is destroyed; device addresses
1437 * captured at register time are invalid after unload. */
1438 __llvm_profile_offload_unregister_dynamic_module(Ptr: module);
1439 return REAL(hipModuleUnload)(module);
1440}
1441// NOLINTEND(misc-use-internal-linkage)
1442
1443__attribute__((constructor)) static void installHipInterceptors() {
1444 /* Avoid interception unless the HIP runtime is already loaded. */
1445 int HasModuleLoad = dlsym(RTLD_DEFAULT, name: "hipModuleLoad") != nullptr;
1446 int InstalledLaunch = 0;
1447#define TRY_INTERCEPT_LAUNCH(Name) \
1448 do { \
1449 if (dlsym(RTLD_DEFAULT, #Name)) \
1450 InstalledLaunch |= INTERCEPT_FUNCTION(Name); \
1451 } while (0)
1452 TRY_INTERCEPT_LAUNCH(hipLaunchKernel);
1453 TRY_INTERCEPT_LAUNCH(hipLaunchKernel_spt);
1454 TRY_INTERCEPT_LAUNCH(hipExtLaunchKernel);
1455 TRY_INTERCEPT_LAUNCH(hipLaunchKernelExC);
1456 TRY_INTERCEPT_LAUNCH(hipLaunchCooperativeKernel);
1457 TRY_INTERCEPT_LAUNCH(hipLaunchCooperativeKernel_spt);
1458 TRY_INTERCEPT_LAUNCH(hipLaunchCooperativeKernelMultiDevice);
1459 TRY_INTERCEPT_LAUNCH(hipExtLaunchMultiKernelMultiDevice);
1460 TRY_INTERCEPT_LAUNCH(hipModuleLaunchKernel);
1461 TRY_INTERCEPT_LAUNCH(hipExtModuleLaunchKernel);
1462 TRY_INTERCEPT_LAUNCH(hipGraphLaunch);
1463 TRY_INTERCEPT_LAUNCH(hipGraphLaunch_spt);
1464#undef TRY_INTERCEPT_LAUNCH
1465 int InstalledAny = InstalledLaunch;
1466 if (HasModuleLoad) {
1467 HasModuleLoad = INTERCEPT_FUNCTION(hipModuleLoad);
1468 InstalledAny |= HasModuleLoad;
1469 }
1470 if (!InstalledAny)
1471 return;
1472 if (isVerboseMode())
1473 PROF_NOTE("%s", "Installing HIP interceptors\n");
1474 if (HasModuleLoad) {
1475 INTERCEPT_FUNCTION(hipModuleLoadData);
1476 INTERCEPT_FUNCTION(hipModuleLoadDataEx);
1477 INTERCEPT_FUNCTION(hipModuleUnload);
1478 }
1479}
1480
1481#endif /* __linux__ */
1482