1/*===- InstrProfilingPlatformGPU.c - GPU profiling support ----------------===*\
2|*
3|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4|* See https://llvm.org/LICENSE.txt for license information.
5|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6|*
7\*===----------------------------------------------------------------------===*/
8
9// GPU-specific profiling functions for AMDGPU and NVPTX targets. This file
10// provides:
11//
12// Platform plumbing (section boundaries, binary IDs, VNodes) are handled by
13// InstrProfilingPlatformLinux.c via the COMPILER_RT_PROFILE_BAREMETAL path.
14
15#if defined(__NVPTX__) || defined(__AMDGPU__)
16
17#include "InstrProfiling.h"
18#include <gpuintrin.h>
19#include <stdint.h>
20
21// Symbols exported to the GPU runtime need to be visible in the .dynsym table.
22#define COMPILER_RT_GPU_VISIBILITY __attribute__((visibility("protected")))
23
24// Indicates that the current wave is fully occupied.
25static int is_uniform(uint64_t mask) {
26 const uint64_t uniform_mask = ~0ull >> (64 - __gpu_num_lanes());
27 return mask == uniform_mask;
28}
29
30// Wave-cooperative counter increment. The instrumentation pass emits calls to
31// this in place of the default non-atomic load/add/store or atomicrmw sequence.
32// The optional uniform counter allows calculating wave uniformity if present.
33COMPILER_RT_VISIBILITY void INSTR_PROF_INSTRUMENT_GPU_FUNC(uint64_t *counter,
34 uint64_t *uniform,
35 uint64_t step) {
36 uint64_t mask = __gpu_lane_mask();
37 if (__gpu_is_first_in_lane(mask)) {
38 __scoped_atomic_fetch_add(counter, step * __builtin_popcountg(mask),
39 __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
40 if (uniform && is_uniform(mask))
41 __scoped_atomic_fetch_add(uniform, step * __builtin_popcountg(mask),
42 __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
43 }
44}
45
46// Block-level sampling for offload PGO. For GPU kernels with stationary
47// behavior (where all thread blocks execute the same code paths regardless of
48// block ID), partial sampling significantly reduces instrumentation overhead
49// without losing PGO performance gains.
50//
51// Returns 1 if this block should be instrumented, 0 to skip. Samples by
52// matching lower bits of the x-dimension block ID to zero.
53// sampling_bits=0: all blocks (100%)
54// sampling_bits=3: every 8th block in x (12.5%, default)
55//
56// Note: We use only block_id_x rather than a fully linearized 3D block ID.
57// The 3D linearization requires __gpu_num_blocks_x/y which expands to
58// __builtin_amdgcn_workgroup_size_x/y. With -mcode-object-version=none (used
59// to build compiler-rt profile runtime), the compiler emits a load of
60// __oclc_ABI_version to select the correct ABI path. Since the profile runtime
61// is linked after device libs are internalized, __oclc_ABI_version is no longer
62// available. Using block_id_x directly avoids this dependency. For typical
63// kernels with large 1D or x-dominant grids this is sufficient; blocks sharing
64// the same x-index are sampled together in 3D grids (minor uniformity loss).
65COMPILER_RT_VISIBILITY int __llvm_profile_sampling_gpu(uint32_t sampling_bits) {
66 if (sampling_bits == 0)
67 return 1;
68
69 uint32_t block_id = __gpu_block_id_x();
70 if (sampling_bits >= 32)
71 return block_id == 0;
72
73 uint32_t mask = (1u << sampling_bits) - 1;
74 return (block_id & mask) == 0;
75}
76
77#if defined(__AMDGPU__)
78
79#define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON)
80#define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON)
81#define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON)
82#define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON)
83#define PROF_DATA_START INSTR_PROF_SECT_START(INSTR_PROF_DATA_COMMON)
84#define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON)
85#define PROF_UCNTS_START INSTR_PROF_SECT_START(INSTR_PROF_UCNTS_COMMON)
86#define PROF_UCNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_UCNTS_COMMON)
87
88extern char PROF_NAME_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
89extern char PROF_NAME_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
90extern char PROF_CNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
91extern char PROF_CNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
92extern __llvm_profile_data PROF_DATA_START[] COMPILER_RT_VISIBILITY
93 COMPILER_RT_WEAK;
94extern __llvm_profile_data PROF_DATA_STOP[] COMPILER_RT_VISIBILITY
95 COMPILER_RT_WEAK;
96extern char PROF_UCNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
97extern char PROF_UCNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
98
99// AMDGPU is a proper ELF target and exports the linker-defined section bounds.
100COMPILER_RT_GPU_VISIBILITY
101__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
102 PROF_NAME_START, PROF_NAME_STOP, PROF_CNTS_START,
103 PROF_CNTS_STOP, PROF_DATA_START, PROF_DATA_STOP,
104 PROF_UCNTS_START, PROF_UCNTS_STOP, &INSTR_PROF_RAW_VERSION_VAR};
105
106#elif defined(__NVPTX__)
107
108// NVPTX supports neither sections nor ELF symbols, we rely on the handling in
109// the 'InstrProfilingPlatformOther.c' file to fill this at initialization time.
110// FIXME: This will not work until we make the NVPTX backend emit section
111// globals next to each other.
112COMPILER_RT_GPU_VISIBILITY
113__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
114 NULL, NULL, NULL,
115 NULL, NULL, NULL,
116 NULL, NULL, &INSTR_PROF_RAW_VERSION_VAR};
117#endif
118
119#endif
120