| 1 | /*===- InstrProfilingPlatformGPU.c - GPU profiling support ----------------===*\ |
| 2 | |* |
| 3 | |* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | |* See https://llvm.org/LICENSE.txt for license information. |
| 5 | |* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | |* |
| 7 | \*===----------------------------------------------------------------------===*/ |
| 8 | |
| 9 | // GPU-specific profiling functions for AMDGPU and NVPTX targets. This file |
| 10 | // provides: |
| 11 | // |
| 12 | // Platform plumbing (section boundaries, binary IDs, VNodes) are handled by |
| 13 | // InstrProfilingPlatformLinux.c via the COMPILER_RT_PROFILE_BAREMETAL path. |
| 14 | |
| 15 | #if defined(__NVPTX__) || defined(__AMDGPU__) |
| 16 | |
| 17 | #include "InstrProfiling.h" |
| 18 | #include <gpuintrin.h> |
| 19 | #include <stdint.h> |
| 20 | |
| 21 | // Symbols exported to the GPU runtime need to be visible in the .dynsym table. |
| 22 | #define COMPILER_RT_GPU_VISIBILITY __attribute__((visibility("protected"))) |
| 23 | |
| 24 | // Indicates that the current wave is fully occupied. |
| 25 | static int is_uniform(uint64_t mask) { |
| 26 | const uint64_t uniform_mask = ~0ull >> (64 - __gpu_num_lanes()); |
| 27 | return mask == uniform_mask; |
| 28 | } |
| 29 | |
| 30 | // Wave-cooperative counter increment. The instrumentation pass emits calls to |
| 31 | // this in place of the default non-atomic load/add/store or atomicrmw sequence. |
| 32 | // The optional uniform counter allows calculating wave uniformity if present. |
| 33 | COMPILER_RT_VISIBILITY void INSTR_PROF_INSTRUMENT_GPU_FUNC(uint64_t *counter, |
| 34 | uint64_t *uniform, |
| 35 | uint64_t step) { |
| 36 | uint64_t mask = __gpu_lane_mask(); |
| 37 | if (__gpu_is_first_in_lane(mask)) { |
| 38 | __scoped_atomic_fetch_add(counter, step * __builtin_popcountg(mask), |
| 39 | __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); |
| 40 | if (uniform && is_uniform(mask)) |
| 41 | __scoped_atomic_fetch_add(uniform, step * __builtin_popcountg(mask), |
| 42 | __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); |
| 43 | } |
| 44 | } |
| 45 | |
| 46 | // Block-level sampling for offload PGO. For GPU kernels with stationary |
| 47 | // behavior (where all thread blocks execute the same code paths regardless of |
| 48 | // block ID), partial sampling significantly reduces instrumentation overhead |
| 49 | // without losing PGO performance gains. |
| 50 | // |
| 51 | // Returns 1 if this block should be instrumented, 0 to skip. Samples by |
| 52 | // matching lower bits of the x-dimension block ID to zero. |
| 53 | // sampling_bits=0: all blocks (100%) |
| 54 | // sampling_bits=3: every 8th block in x (12.5%, default) |
| 55 | // |
| 56 | // Note: We use only block_id_x rather than a fully linearized 3D block ID. |
| 57 | // The 3D linearization requires __gpu_num_blocks_x/y which expands to |
| 58 | // __builtin_amdgcn_workgroup_size_x/y. With -mcode-object-version=none (used |
| 59 | // to build compiler-rt profile runtime), the compiler emits a load of |
| 60 | // __oclc_ABI_version to select the correct ABI path. Since the profile runtime |
| 61 | // is linked after device libs are internalized, __oclc_ABI_version is no longer |
| 62 | // available. Using block_id_x directly avoids this dependency. For typical |
| 63 | // kernels with large 1D or x-dominant grids this is sufficient; blocks sharing |
| 64 | // the same x-index are sampled together in 3D grids (minor uniformity loss). |
| 65 | COMPILER_RT_VISIBILITY int __llvm_profile_sampling_gpu(uint32_t sampling_bits) { |
| 66 | if (sampling_bits == 0) |
| 67 | return 1; |
| 68 | |
| 69 | uint32_t block_id = __gpu_block_id_x(); |
| 70 | if (sampling_bits >= 32) |
| 71 | return block_id == 0; |
| 72 | |
| 73 | uint32_t mask = (1u << sampling_bits) - 1; |
| 74 | return (block_id & mask) == 0; |
| 75 | } |
| 76 | |
| 77 | #if defined(__AMDGPU__) |
| 78 | |
| 79 | #define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON) |
| 80 | #define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON) |
| 81 | #define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON) |
| 82 | #define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON) |
| 83 | #define PROF_DATA_START INSTR_PROF_SECT_START(INSTR_PROF_DATA_COMMON) |
| 84 | #define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON) |
| 85 | #define PROF_UCNTS_START INSTR_PROF_SECT_START(INSTR_PROF_UCNTS_COMMON) |
| 86 | #define PROF_UCNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_UCNTS_COMMON) |
| 87 | |
| 88 | extern char PROF_NAME_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; |
| 89 | extern char PROF_NAME_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; |
| 90 | extern char PROF_CNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; |
| 91 | extern char PROF_CNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; |
| 92 | extern __llvm_profile_data PROF_DATA_START[] COMPILER_RT_VISIBILITY |
| 93 | COMPILER_RT_WEAK; |
| 94 | extern __llvm_profile_data PROF_DATA_STOP[] COMPILER_RT_VISIBILITY |
| 95 | COMPILER_RT_WEAK; |
| 96 | extern char PROF_UCNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; |
| 97 | extern char PROF_UCNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; |
| 98 | |
| 99 | // AMDGPU is a proper ELF target and exports the linker-defined section bounds. |
| 100 | COMPILER_RT_GPU_VISIBILITY |
| 101 | __llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = { |
| 102 | PROF_NAME_START, PROF_NAME_STOP, PROF_CNTS_START, |
| 103 | PROF_CNTS_STOP, PROF_DATA_START, PROF_DATA_STOP, |
| 104 | PROF_UCNTS_START, PROF_UCNTS_STOP, &INSTR_PROF_RAW_VERSION_VAR}; |
| 105 | |
| 106 | #elif defined(__NVPTX__) |
| 107 | |
| 108 | // NVPTX supports neither sections nor ELF symbols, we rely on the handling in |
| 109 | // the 'InstrProfilingPlatformOther.c' file to fill this at initialization time. |
| 110 | // FIXME: This will not work until we make the NVPTX backend emit section |
| 111 | // globals next to each other. |
| 112 | COMPILER_RT_GPU_VISIBILITY |
| 113 | __llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = { |
| 114 | NULL, NULL, NULL, |
| 115 | NULL, NULL, NULL, |
| 116 | NULL, NULL, &INSTR_PROF_RAW_VERSION_VAR}; |
| 117 | #endif |
| 118 | |
| 119 | #endif |
| 120 | |