| 1 | //===- InstrProfilingPlatformROCmHSA.cpp - ROCm HSA device drain ---------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // Supplemental HSA-introspection drain (Linux only). |
| 10 | // |
| 11 | // The host-shadow drain in InstrProfilingPlatformROCm.cpp only sees device code |
| 12 | // objects with a host-side shadow (__hipRegisterVar) or an intercepted |
| 13 | // hipModuleLoad*. Device-linked code with no host shadow (e.g. RCCL) is |
| 14 | // invisible to it. This pass walks every GPU agent's loaded executables via |
| 15 | // HSA, finds each __llvm_profile_sections table on the device, and drains the |
| 16 | // ones the host-shadow pass missed (deduped by the section-bounds tuple). It |
| 17 | // reuses processDeviceOffloadPrf() so the profraw layout is identical. |
| 18 | // |
| 19 | //===----------------------------------------------------------------------===// |
| 20 | |
| 21 | #if defined(__linux__) |
| 22 | |
| 23 | extern "C" { |
| 24 | #include "InstrProfiling.h" |
| 25 | #include "InstrProfilingPort.h" |
| 26 | } |
| 27 | |
| 28 | #include "InstrProfilingPlatformROCmInternal.h" |
| 29 | #include "interception/interception.h" |
| 30 | // C (not C++) headers: clang_rt.profile is built -nostdinc++. |
| 31 | #include <stddef.h> |
| 32 | #include <stdint.h> |
| 33 | #include <stdio.h> |
| 34 | #include <stdlib.h> |
| 35 | #include <string.h> |
| 36 | |
| 37 | using namespace __prof_rocm; |
| 38 | |
| 39 | // Mirrored HSA declarations the drain needs (dlopen'd, not linked). See the |
| 40 | // header for the rationale; the values are HSA's stable C ABI. |
| 41 | #include "InstrProfilingPlatformROCmHSADefs.h" |
| 42 | |
| 43 | #ifdef PROFILE_VERIFY_HSA_ABI |
| 44 | // When the real ROCm headers are available at build time (developer installs |
| 45 | // and the downstream GPU CI), check that the mirror above still matches them. |
| 46 | #include <hsa/hsa.h> |
| 47 | #include <hsa/hsa_ven_amd_loader.h> |
| 48 | |
| 49 | static_assert(PROF_HSA_STATUS_SUCCESS == HSA_STATUS_SUCCESS, "HSA ABI drift" ); |
| 50 | static_assert(PROF_HSA_STATUS_INFO_BREAK == HSA_STATUS_INFO_BREAK, |
| 51 | "HSA ABI drift" ); |
| 52 | static_assert(PROF_HSA_AGENT_INFO_NAME == HSA_AGENT_INFO_NAME, "HSA ABI drift" ); |
| 53 | static_assert(PROF_HSA_AGENT_INFO_DEVICE == HSA_AGENT_INFO_DEVICE, |
| 54 | "HSA ABI drift" ); |
| 55 | static_assert(PROF_HSA_DEVICE_TYPE_GPU == HSA_DEVICE_TYPE_GPU, "HSA ABI drift" ); |
| 56 | static_assert(PROF_HSA_SYMBOL_KIND_VARIABLE == HSA_SYMBOL_KIND_VARIABLE, |
| 57 | "HSA ABI drift" ); |
| 58 | static_assert(PROF_HSA_EXECUTABLE_SYMBOL_INFO_TYPE == |
| 59 | HSA_EXECUTABLE_SYMBOL_INFO_TYPE, |
| 60 | "HSA ABI drift" ); |
| 61 | static_assert(PROF_HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH == |
| 62 | HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, |
| 63 | "HSA ABI drift" ); |
| 64 | static_assert(PROF_HSA_EXECUTABLE_SYMBOL_INFO_NAME == |
| 65 | HSA_EXECUTABLE_SYMBOL_INFO_NAME, |
| 66 | "HSA ABI drift" ); |
| 67 | static_assert(PROF_HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS == |
| 68 | HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, |
| 69 | "HSA ABI drift" ); |
| 70 | static_assert(PROF_HSA_EXTENSION_AMD_LOADER == HSA_EXTENSION_AMD_LOADER, |
| 71 | "HSA ABI drift" ); |
| 72 | |
| 73 | static_assert(sizeof(prof_hsa_agent_t) == sizeof(hsa_agent_t), "HSA ABI drift" ); |
| 74 | static_assert(sizeof(prof_hsa_executable_t) == sizeof(hsa_executable_t), |
| 75 | "HSA ABI drift" ); |
| 76 | static_assert(sizeof(prof_hsa_executable_symbol_t) == |
| 77 | sizeof(hsa_executable_symbol_t), |
| 78 | "HSA ABI drift" ); |
| 79 | |
| 80 | static_assert(sizeof(prof_hsa_loader_segment_descriptor_t) == |
| 81 | sizeof(hsa_ven_amd_loader_segment_descriptor_t), |
| 82 | "HSA ABI drift" ); |
| 83 | static_assert(offsetof(prof_hsa_loader_segment_descriptor_t, agent) == |
| 84 | offsetof(hsa_ven_amd_loader_segment_descriptor_t, agent), |
| 85 | "HSA ABI drift" ); |
| 86 | static_assert(offsetof(prof_hsa_loader_segment_descriptor_t, executable) == |
| 87 | offsetof(hsa_ven_amd_loader_segment_descriptor_t, executable), |
| 88 | "HSA ABI drift" ); |
| 89 | static_assert(offsetof(prof_hsa_loader_segment_descriptor_t, segment_base) == |
| 90 | offsetof(hsa_ven_amd_loader_segment_descriptor_t, |
| 91 | segment_base), |
| 92 | "HSA ABI drift" ); |
| 93 | static_assert(offsetof(prof_hsa_loader_segment_descriptor_t, segment_size) == |
| 94 | offsetof(hsa_ven_amd_loader_segment_descriptor_t, |
| 95 | segment_size), |
| 96 | "HSA ABI drift" ); |
| 97 | |
| 98 | // We fetch the loader pfn table by raw layout, so query_segment_descriptors |
| 99 | // must sit at the same offset as in the real table. |
| 100 | static_assert(offsetof(prof_hsa_loader_pfn_t, query_segment_descriptors) == |
| 101 | offsetof(hsa_ven_amd_loader_1_00_pfn_t, |
| 102 | hsa_ven_amd_loader_query_segment_descriptors), |
| 103 | "HSA ABI drift" ); |
| 104 | #endif // PROFILE_VERIFY_HSA_ABI |
| 105 | |
| 106 | static hsa_iterate_agents_ty pHsaIterateAgents = nullptr; |
| 107 | static hsa_agent_get_info_ty pHsaAgentGetInfo = nullptr; |
| 108 | static hsa_executable_iterate_agent_symbols_ty pHsaExecIterAgentSyms = nullptr; |
| 109 | static hsa_executable_symbol_get_info_ty pHsaSymGetInfo = nullptr; |
| 110 | static hsa_loader_query_segment_descriptors_ty pQuerySegDescs = nullptr; |
| 111 | |
| 112 | /* Status-check shorthands, in the spirit of the thin HIP wrappers in |
| 113 | * InstrProfilingPlatformROCm.cpp: every HSA entry point returns |
| 114 | * prof_hsa_status_t. hsaOkOrBreak() also accepts INFO_BREAK, which the |
| 115 | * iterate_* callbacks use to stop early and is not an error. */ |
| 116 | static inline bool hsaOk(prof_hsa_status_t St) { |
| 117 | return St == PROF_HSA_STATUS_SUCCESS; |
| 118 | } |
| 119 | static inline bool hsaOkOrBreak(prof_hsa_status_t St) { |
| 120 | return St == PROF_HSA_STATUS_SUCCESS || St == PROF_HSA_STATUS_INFO_BREAK; |
| 121 | } |
| 122 | |
| 123 | /* 0 = not attempted, 1 = ready, -1 = unavailable. Acquire/release atomics: a |
| 124 | * thread observing HsaRuntimeState==1 also sees the published p* pointers. */ |
| 125 | static int HsaRuntimeState = 0; |
| 126 | |
| 127 | static int setHsaRuntimeState(int S) { |
| 128 | __atomic_store_n(&HsaRuntimeState, S, __ATOMIC_RELEASE); |
| 129 | return S > 0 ? 0 : -1; |
| 130 | } |
| 131 | |
| 132 | /* Resolve HSA entry points and the AMD loader extension once, and confirm HIP's |
| 133 | * hipMemcpy is reachable for the device-to-host copies. */ |
| 134 | static int loadHsaRuntimePointers(void) { |
| 135 | int State = __atomic_load_n(&HsaRuntimeState, __ATOMIC_ACQUIRE); |
| 136 | if (State) |
| 137 | return State > 0 ? 0 : -1; |
| 138 | |
| 139 | if (!__interception::DynamicLoaderAvailable()) { |
| 140 | if (isVerboseMode()) |
| 141 | PROF_NOTE("%s" , "Dynamic library loading not available - " |
| 142 | "HSA device profiling disabled\n" ); |
| 143 | return setHsaRuntimeState(-1); |
| 144 | } |
| 145 | |
| 146 | void *Hsa = __interception::OpenLibrary(name: "libhsa-runtime64.so" ); |
| 147 | if (!Hsa) |
| 148 | Hsa = __interception::OpenLibrary(name: "libhsa-runtime64.so.1" ); |
| 149 | if (!Hsa) { |
| 150 | if (isVerboseMode()) |
| 151 | PROF_NOTE("%s" , "libhsa-runtime64.so not loadable - " |
| 152 | "HSA device profiling disabled\n" ); |
| 153 | return setHsaRuntimeState(-1); |
| 154 | } |
| 155 | |
| 156 | hsa_init_ty pHsaInit = |
| 157 | (hsa_init_ty)__interception::LookupSymbol(handle: Hsa, symbol: "hsa_init" ); |
| 158 | hsa_system_get_major_extension_table_ty pGetExtTable = |
| 159 | (hsa_system_get_major_extension_table_ty)__interception::LookupSymbol( |
| 160 | handle: Hsa, symbol: "hsa_system_get_major_extension_table" ); |
| 161 | pHsaIterateAgents = (hsa_iterate_agents_ty)__interception::LookupSymbol( |
| 162 | handle: Hsa, symbol: "hsa_iterate_agents" ); |
| 163 | pHsaAgentGetInfo = (hsa_agent_get_info_ty)__interception::LookupSymbol( |
| 164 | handle: Hsa, symbol: "hsa_agent_get_info" ); |
| 165 | pHsaExecIterAgentSyms = |
| 166 | (hsa_executable_iterate_agent_symbols_ty)__interception::LookupSymbol( |
| 167 | handle: Hsa, symbol: "hsa_executable_iterate_agent_symbols" ); |
| 168 | pHsaSymGetInfo = |
| 169 | (hsa_executable_symbol_get_info_ty)__interception::LookupSymbol( |
| 170 | handle: Hsa, symbol: "hsa_executable_symbol_get_info" ); |
| 171 | |
| 172 | if (!pHsaInit || !pGetExtTable || !pHsaIterateAgents || !pHsaAgentGetInfo || |
| 173 | !pHsaExecIterAgentSyms || !pHsaSymGetInfo) { |
| 174 | PROF_WARN("%s" , |
| 175 | "required HSA symbols missing - HSA device profiling disabled\n" ); |
| 176 | return setHsaRuntimeState(-1); |
| 177 | } |
| 178 | |
| 179 | /* Bring HSA up lazily on the first drain (idempotent, refcounted), never from |
| 180 | * a library constructor -- see the fork-safety note at end of file. */ |
| 181 | prof_hsa_status_t St = pHsaInit(); |
| 182 | if (!hsaOkOrBreak(St)) { |
| 183 | if (isVerboseMode()) |
| 184 | PROF_NOTE("hsa_init failed (0x%x) - HSA device profiling disabled\n" , St); |
| 185 | return setHsaRuntimeState(-1); |
| 186 | } |
| 187 | |
| 188 | prof_hsa_loader_pfn_t LoaderApi; |
| 189 | __builtin_memset(&LoaderApi, 0, sizeof(LoaderApi)); |
| 190 | St = pGetExtTable(PROF_HSA_EXTENSION_AMD_LOADER, 1, sizeof(LoaderApi), |
| 191 | &LoaderApi); |
| 192 | if (!hsaOk(St) || !LoaderApi.query_segment_descriptors) { |
| 193 | PROF_WARN("AMD loader extension unavailable (0x%x) - " |
| 194 | "HSA device profiling disabled\n" , |
| 195 | St); |
| 196 | return setHsaRuntimeState(-1); |
| 197 | } |
| 198 | pQuerySegDescs = LoaderApi.query_segment_descriptors; |
| 199 | |
| 200 | /* The device-to-host copies go through the shared HIP loader. */ |
| 201 | ensureHipLoaded(); |
| 202 | if (!hipMemcpyAvailable()) { |
| 203 | PROF_WARN("%s" , "hipMemcpy unavailable - HSA device profiling disabled\n" ); |
| 204 | return setHsaRuntimeState(-1); |
| 205 | } |
| 206 | |
| 207 | if (isVerboseMode()) |
| 208 | PROF_NOTE("%s" , "HSA + HIP runtime resolved for device profiling\n" ); |
| 209 | return setHsaRuntimeState(1); |
| 210 | } |
| 211 | |
| 212 | /* The canonical device bounds-table symbol from InstrProfilingPlatformGPU.c. */ |
| 213 | static const char ProfileSectionsSymbol[] = "__llvm_profile_sections" ; |
| 214 | |
| 215 | /* Dedup of drained section-bounds tuples, shared with the host-shadow path |
| 216 | * (processDeviceOffloadPrf records here on every successful drain) so each |
| 217 | * unique counter set is drained exactly once across both paths. |
| 218 | */ |
| 219 | static ProfBoundsSet SeenBounds; |
| 220 | |
| 221 | /* Has this bounds tuple already been drained? Pure check, no state mutation. */ |
| 222 | static int profBoundsAlreadyDrained(const void *D, const void *C, |
| 223 | const void *N) { |
| 224 | return SeenBounds.contains(D, C, N); |
| 225 | } |
| 226 | |
| 227 | /* Record a drained bounds tuple. Idempotent; call only after a successful drain |
| 228 | * so a failed attempt stays retryable. */ |
| 229 | void __prof_rocm::profRecordDrainedBounds(const void *D, const void *C, |
| 230 | const void *N) { |
| 231 | SeenBounds.record(D, C, N); |
| 232 | } |
| 233 | |
| 234 | #define PROF_MAX_GPU_AGENTS 64 |
| 235 | |
| 236 | /* Buffer size for HSA agent names and symbol names we read back; both the |
| 237 | * device arch string and the __llvm_profile_sections symbol are far shorter. */ |
| 238 | #define PROF_HSA_NAME_MAX 64 |
| 239 | |
| 240 | namespace { |
| 241 | struct GpuAgent { |
| 242 | prof_hsa_agent_t agent; |
| 243 | char arch[PROF_HSA_NAME_MAX]; |
| 244 | }; |
| 245 | |
| 246 | struct WalkState { |
| 247 | GpuAgent agents[PROF_MAX_GPU_AGENTS]; |
| 248 | int num_agents; |
| 249 | int total_found; |
| 250 | int total_drained; |
| 251 | }; |
| 252 | |
| 253 | /* Per (agent, executable) symbol-iteration state. */ |
| 254 | struct SymbolState { |
| 255 | const char *arch; |
| 256 | int found; |
| 257 | int drained; |
| 258 | }; |
| 259 | } // namespace |
| 260 | |
| 261 | /* HSA per-symbol callback: when it finds a __llvm_profile_sections variable, |
| 262 | * drain it via processDeviceOffloadPrf() unless the host-shadow path (or an |
| 263 | * earlier agent) already handled the same bounds. */ |
| 264 | static prof_hsa_status_t onSymbol(prof_hsa_executable_t, prof_hsa_agent_t, |
| 265 | prof_hsa_executable_symbol_t Sym, |
| 266 | void *Data) { |
| 267 | SymbolState *S = (SymbolState *)Data; |
| 268 | |
| 269 | prof_hsa_symbol_kind_t Kind; |
| 270 | if (!hsaOk( |
| 271 | St: pHsaSymGetInfo(Sym, PROF_HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &Kind)) || |
| 272 | Kind != PROF_HSA_SYMBOL_KIND_VARIABLE) |
| 273 | return PROF_HSA_STATUS_SUCCESS; |
| 274 | |
| 275 | uint32_t NameLen = 0; |
| 276 | if (!hsaOk(St: pHsaSymGetInfo(Sym, PROF_HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, |
| 277 | &NameLen)) || |
| 278 | NameLen != sizeof(ProfileSectionsSymbol) - 1) |
| 279 | return PROF_HSA_STATUS_SUCCESS; |
| 280 | |
| 281 | char NameBuf[PROF_HSA_NAME_MAX]; |
| 282 | if (NameLen + 1 > sizeof(NameBuf)) |
| 283 | return PROF_HSA_STATUS_SUCCESS; |
| 284 | if (!hsaOk( |
| 285 | St: pHsaSymGetInfo(Sym, PROF_HSA_EXECUTABLE_SYMBOL_INFO_NAME, NameBuf))) |
| 286 | return PROF_HSA_STATUS_SUCCESS; |
| 287 | NameBuf[NameLen] = '\0'; |
| 288 | |
| 289 | if (strcmp(s1: NameBuf, s2: ProfileSectionsSymbol) != 0) |
| 290 | return PROF_HSA_STATUS_SUCCESS; |
| 291 | |
| 292 | uint64_t Addr = 0; |
| 293 | if (!hsaOk(St: pHsaSymGetInfo( |
| 294 | Sym, PROF_HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &Addr)) || |
| 295 | Addr == 0) { |
| 296 | if (isVerboseMode()) |
| 297 | PROF_NOTE("%s" , "failed to read __llvm_profile_sections address\n" ); |
| 298 | return PROF_HSA_STATUS_SUCCESS; |
| 299 | } |
| 300 | |
| 301 | S->found++; |
| 302 | |
| 303 | // Read the bounds table first to dedup (and detect empty sections) before |
| 304 | // the full copy/relocate done by processDeviceOffloadPrf. |
| 305 | __llvm_profile_gpu_sections Sec; |
| 306 | if (memcpyDeviceToHost(Dst: &Sec, Src: (void *)(uintptr_t)Addr, Size: sizeof(Sec)) != 0) { |
| 307 | PROF_WARN("%s" , "failed to copy device bounds table\n" ); |
| 308 | return PROF_HSA_STATUS_SUCCESS; |
| 309 | } |
| 310 | if (profBoundsAlreadyDrained(D: Sec.DataStart, C: Sec.CountersStart, |
| 311 | N: Sec.NamesStart)) { |
| 312 | if (isVerboseMode()) |
| 313 | PROF_NOTE("%s" , "device bounds already drained, skipping\n" ); |
| 314 | return PROF_HSA_STATUS_SUCCESS; |
| 315 | } |
| 316 | |
| 317 | size_t DataBytes = (const char *)Sec.DataStop - (const char *)Sec.DataStart; |
| 318 | size_t CntsBytes = |
| 319 | (const char *)Sec.CountersStop - (const char *)Sec.CountersStart; |
| 320 | if (DataBytes == 0 || CntsBytes == 0) { |
| 321 | // Empty code object: nothing to write. Mark seen so we don't revisit it. |
| 322 | profRecordDrainedBounds(D: Sec.DataStart, C: Sec.CountersStart, N: Sec.NamesStart); |
| 323 | return PROF_HSA_STATUS_SUCCESS; |
| 324 | } |
| 325 | |
| 326 | // Name HSA-drained objects in their own ".hsaN" suffix space so they never |
| 327 | // collide with the host-shadow path's "arch"/"arch.<i>" filenames. The drain |
| 328 | // latch (HsaDrainCompleted) already prevents re-draining an object, so a |
| 329 | // plain per-drain counter is enough for uniqueness. |
| 330 | static int DrainIndex = 0; |
| 331 | char Target[96]; |
| 332 | snprintf(s: Target, maxlen: sizeof(Target), format: "%s.hsa%d" , S->arch, DrainIndex); |
| 333 | |
| 334 | // Record the bounds (and advance the index) only on a successful write so a |
| 335 | // transient error stays retryable on a later agent or collect call. |
| 336 | if (processDeviceOffloadPrf(DeviceOffloadPrf: (void *)(uintptr_t)Addr, Target, Sections: nullptr) == 0) { |
| 337 | S->drained++; |
| 338 | DrainIndex++; |
| 339 | profRecordDrainedBounds(D: Sec.DataStart, C: Sec.CountersStart, N: Sec.NamesStart); |
| 340 | } |
| 341 | |
| 342 | return PROF_HSA_STATUS_SUCCESS; |
| 343 | } |
| 344 | |
| 345 | static prof_hsa_status_t collectAgent(prof_hsa_agent_t Agent, void *Data) { |
| 346 | prof_hsa_device_type_t DevType; |
| 347 | if (!hsaOk(St: pHsaAgentGetInfo(Agent, PROF_HSA_AGENT_INFO_DEVICE, &DevType)) || |
| 348 | DevType != PROF_HSA_DEVICE_TYPE_GPU) |
| 349 | return PROF_HSA_STATUS_SUCCESS; |
| 350 | |
| 351 | WalkState *W = (WalkState *)Data; |
| 352 | if (W->num_agents >= PROF_MAX_GPU_AGENTS) |
| 353 | return PROF_HSA_STATUS_SUCCESS; |
| 354 | |
| 355 | GpuAgent &GA = W->agents[W->num_agents++]; |
| 356 | GA.agent = Agent; |
| 357 | char Name[PROF_HSA_NAME_MAX]; |
| 358 | __builtin_memset(Name, 0, sizeof(Name)); |
| 359 | pHsaAgentGetInfo(Agent, PROF_HSA_AGENT_INFO_NAME, Name); |
| 360 | size_t N = strnlen(string: Name, maxlen: sizeof(GA.arch) - 1); |
| 361 | __builtin_memcpy(GA.arch, Name, N); |
| 362 | GA.arch[N] = '\0'; |
| 363 | if (!GA.arch[0]) |
| 364 | strncpy(dest: GA.arch, src: "amdgpu" , n: sizeof(GA.arch) - 1); |
| 365 | |
| 366 | if (isVerboseMode()) |
| 367 | PROF_NOTE("GPU agent %d: %s\n" , W->num_agents - 1, GA.arch); |
| 368 | return PROF_HSA_STATUS_SUCCESS; |
| 369 | } |
| 370 | |
| 371 | /* Reentrancy guard and "drained at least once" latch (both acquire/release). */ |
| 372 | static int HsaDrainInProgress = 0; |
| 373 | static int HsaDrainCompleted = 0; |
| 374 | |
| 375 | int __prof_rocm::drainDevicesViaHsa(void) { |
| 376 | if (__atomic_load_n(&HsaDrainCompleted, __ATOMIC_ACQUIRE)) |
| 377 | return 0; |
| 378 | |
| 379 | int Expected = 0; |
| 380 | if (!__atomic_compare_exchange_n(&HsaDrainInProgress, &Expected, 1, |
| 381 | /*weak=*/0, __ATOMIC_ACQ_REL, |
| 382 | __ATOMIC_ACQUIRE)) |
| 383 | return 0; |
| 384 | |
| 385 | struct InProgressGuard { |
| 386 | ~InProgressGuard() { |
| 387 | __atomic_store_n(&HsaDrainInProgress, 0, __ATOMIC_RELEASE); |
| 388 | } |
| 389 | } _Guard; |
| 390 | |
| 391 | if (loadHsaRuntimePointers() != 0) |
| 392 | return 0; /* Runtime unavailable: stay retryable. */ |
| 393 | |
| 394 | WalkState W; |
| 395 | __builtin_memset(&W, 0, sizeof(W)); |
| 396 | prof_hsa_status_t St = pHsaIterateAgents(collectAgent, &W); |
| 397 | if (!hsaOkOrBreak(St)) { |
| 398 | PROF_WARN("hsa_iterate_agents failed (0x%x)\n" , St); |
| 399 | return -1; |
| 400 | } |
| 401 | if (W.num_agents == 0) { |
| 402 | if (isVerboseMode()) |
| 403 | PROF_NOTE("%s" , "no GPU agents present; nothing to drain (will retry)\n" ); |
| 404 | return 0; |
| 405 | } |
| 406 | |
| 407 | /* query_segment_descriptors ships in every loader-extension version, is more |
| 408 | * permissive than iterate_executables on ROCm, and yields the loaded |
| 409 | * (agent, executable) pairs directly. */ |
| 410 | size_t NumSegs = 0; |
| 411 | St = pQuerySegDescs(nullptr, &NumSegs); |
| 412 | if (!hsaOk(St)) { |
| 413 | PROF_WARN("query_segment_descriptors(count) failed (0x%x)\n" , St); |
| 414 | return -1; |
| 415 | } |
| 416 | if (NumSegs == 0) { |
| 417 | if (isVerboseMode()) |
| 418 | PROF_NOTE("%s" , "no loaded segments; nothing to drain (will retry)\n" ); |
| 419 | return 0; |
| 420 | } |
| 421 | |
| 422 | prof_hsa_loader_segment_descriptor_t *Segs = |
| 423 | (prof_hsa_loader_segment_descriptor_t *)calloc(nmemb: NumSegs, size: sizeof(*Segs)); |
| 424 | if (!Segs) { |
| 425 | PROF_ERR("%s\n" , "failed to allocate segment descriptor array" ); |
| 426 | return -1; |
| 427 | } |
| 428 | UniqueFree SegsOwner(Segs); |
| 429 | |
| 430 | St = pQuerySegDescs(Segs, &NumSegs); |
| 431 | if (!hsaOk(St)) { |
| 432 | PROF_WARN("query_segment_descriptors(fetch) failed (0x%x)\n" , St); |
| 433 | return -1; |
| 434 | } |
| 435 | |
| 436 | if (isVerboseMode()) |
| 437 | PROF_NOTE("query_segment_descriptors: %zu segments\n" , NumSegs); |
| 438 | |
| 439 | // Walk each unique (agent, executable) pair once. |
| 440 | struct SeenPair { |
| 441 | uint64_t agent; |
| 442 | uint64_t exec; |
| 443 | }; |
| 444 | enum { kSeenPairsInitCap = 64 }; |
| 445 | SeenPair *Seen = nullptr; |
| 446 | int NumPairs = 0; |
| 447 | int CapPairs = 0; |
| 448 | int IterFailures = 0; |
| 449 | |
| 450 | for (size_t i = 0; i < NumSegs; ++i) { |
| 451 | if (Segs[i].executable.handle == 0 || Segs[i].agent.handle == 0) |
| 452 | continue; |
| 453 | |
| 454 | bool AlreadySeen = false; |
| 455 | for (int j = 0; j < NumPairs; ++j) |
| 456 | if (Seen[j].agent == Segs[i].agent.handle && |
| 457 | Seen[j].exec == Segs[i].executable.handle) { |
| 458 | AlreadySeen = true; |
| 459 | break; |
| 460 | } |
| 461 | if (AlreadySeen) |
| 462 | continue; |
| 463 | if (growArray(Arr: (void **)&Seen, Cap: &CapPairs, MinCount: NumPairs + 1, InitCap: kSeenPairsInitCap, |
| 464 | ElemSize: sizeof(*Seen)) == 0) { |
| 465 | Seen[NumPairs].agent = Segs[i].agent.handle; |
| 466 | Seen[NumPairs].exec = Segs[i].executable.handle; |
| 467 | NumPairs++; |
| 468 | } |
| 469 | |
| 470 | const char *Arch = nullptr; |
| 471 | for (int k = 0; k < W.num_agents; ++k) |
| 472 | if (W.agents[k].agent.handle == Segs[i].agent.handle) { |
| 473 | Arch = W.agents[k].arch; |
| 474 | break; |
| 475 | } |
| 476 | if (!Arch) |
| 477 | continue; /* not a GPU agent we collected */ |
| 478 | |
| 479 | SymbolState S; |
| 480 | __builtin_memset(&S, 0, sizeof(S)); |
| 481 | S.arch = Arch; |
| 482 | if (isVerboseMode()) |
| 483 | PROF_NOTE("walking executable 0x%llx on %s\n" , |
| 484 | (unsigned long long)Segs[i].executable.handle, Arch); |
| 485 | prof_hsa_status_t IterSt = |
| 486 | pHsaExecIterAgentSyms(Segs[i].executable, Segs[i].agent, onSymbol, &S); |
| 487 | if (!hsaOkOrBreak(St: IterSt)) { |
| 488 | PROF_WARN("hsa_executable_iterate_agent_symbols on executable 0x%llx " |
| 489 | "failed (0x%x)\n" , |
| 490 | (unsigned long long)Segs[i].executable.handle, IterSt); |
| 491 | IterFailures++; |
| 492 | } |
| 493 | W.total_found += S.found; |
| 494 | W.total_drained += S.drained; |
| 495 | } |
| 496 | |
| 497 | if (isVerboseMode()) |
| 498 | PROF_NOTE("HSA walk complete: agents=%d pairs=%d found=%d drained=%d " |
| 499 | "iter-failures=%d\n" , |
| 500 | W.num_agents, NumPairs, W.total_found, W.total_drained, |
| 501 | IterFailures); |
| 502 | |
| 503 | free(ptr: Seen); |
| 504 | |
| 505 | /* Latch only when we actually drained data. A "found nothing new" walk is |
| 506 | * deliberately not latched: an early collect can precede any kernel launch, |
| 507 | * and latching it would suppress the real exit-time drain. No-op walks are |
| 508 | * cheap to repeat. */ |
| 509 | if (W.total_drained > 0) |
| 510 | __atomic_store_n(&HsaDrainCompleted, 1, __ATOMIC_RELEASE); |
| 511 | return (IterFailures > 0) ? -1 : 0; |
| 512 | } |
| 513 | |
| 514 | /* Fork-safety: deliberately no library constructor calling hsa_init(). */ |
| 515 | |
| 516 | #endif /* defined(__linux__) && !defined(_WIN32) -- HSA drain */ |
| 517 | |