InstrProfilingPlatformGPU.c source code [llvm_runtimes/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c]

1	/===- InstrProfilingPlatformGPU.c - GPU profiling support ----------------===\
2	\|*
3	\| Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.*
4	\| See https://llvm.org/LICENSE.txt for license information.*
5	\| SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception*
6	\|*
7	\===----------------------------------------------------------------------===/
8
9	// GPU-specific profiling functions for AMDGPU and NVPTX targets. This file
10	// provides:
11	//
12	// Platform plumbing (section boundaries, binary IDs, VNodes) are handled by
13	// InstrProfilingPlatformLinux.c via the COMPILER_RT_PROFILE_BAREMETAL path.
14
15	#if defined(__NVPTX__) \|\| defined(__AMDGPU__)
16
17	#include "InstrProfiling.h"
18	#include <gpuintrin.h>
19	#include <stdint.h>
20
21	// Symbols exported to the GPU runtime need to be visible in the .dynsym table.
22	#define COMPILER_RT_GPU_VISIBILITY __attribute__((visibility("protected")))
23
24	// Indicates that the current wave is fully occupied.
25	static int is_uniform(uint64_t mask) {
26	const uint64_t uniform_mask = ~`0ull` >> (`64` - __gpu_num_lanes());
27	return mask == uniform_mask;
28	}
29
30	// Wave-cooperative counter increment. The instrumentation pass emits calls to
31	// this in place of the default non-atomic load/add/store or atomicrmw sequence.
32	// The optional uniform counter allows calculating wave uniformity if present.
33	COMPILER_RT_VISIBILITY void INSTR_PROF_INSTRUMENT_GPU_FUNC(uint64_t *counter,
34	uint64_t *uniform,
35	uint64_t step) {
36	uint64_t mask = __gpu_lane_mask();
37	if (__gpu_is_first_in_lane(mask)) {
38	__scoped_atomic_fetch_add(counter, step * __builtin_popcountg(mask),
39	__ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
40	if (uniform && is_uniform(mask))
41	__scoped_atomic_fetch_add(uniform, step * __builtin_popcountg(mask),
42	__ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
43	}
44	}
45
46	// Block-level sampling for offload PGO. For GPU kernels with stationary
47	// behavior (where all thread blocks execute the same code paths regardless of
48	// block ID), partial sampling significantly reduces instrumentation overhead
49	// without losing PGO performance gains.
50	//
51	// Returns 1 if this block should be instrumented, 0 to skip. Samples by
52	// matching lower bits of the x-dimension block ID to zero.
53	// sampling_bits=0: all blocks (100%)
54	// sampling_bits=3: every 8th block in x (12.5%, default)
55	//
56	// Note: We use only block_id_x rather than a fully linearized 3D block ID.
57	// The 3D linearization requires __gpu_num_blocks_x/y which expands to
58	// __builtin_amdgcn_workgroup_size_x/y. With -mcode-object-version=none (used
59	// to build compiler-rt profile runtime), the compiler emits a load of
60	// __oclc_ABI_version to select the correct ABI path. Since the profile runtime
61	// is linked after device libs are internalized, __oclc_ABI_version is no longer
62	// available. Using block_id_x directly avoids this dependency. For typical
63	// kernels with large 1D or x-dominant grids this is sufficient; blocks sharing
64	// the same x-index are sampled together in 3D grids (minor uniformity loss).
65	COMPILER_RT_VISIBILITY int __llvm_profile_sampling_gpu(uint32_t sampling_bits) {
66	if (sampling_bits == `0`)
67	return `1`;
68
69	uint32_t block_id = __gpu_block_id_x();
70	if (sampling_bits >= `32`)
71	return block_id == `0`;
72
73	uint32_t mask = (`1u` << sampling_bits) - `1`;
74	return (block_id & mask) == `0`;
75	}
76
77	#if defined(__AMDGPU__)
78
79	#define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON)
80	#define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON)
81	#define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON)
82	#define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON)
83	#define PROF_DATA_START INSTR_PROF_SECT_START(INSTR_PROF_DATA_COMMON)
84	#define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON)
85	#define PROF_UCNTS_START INSTR_PROF_SECT_START(INSTR_PROF_UCNTS_COMMON)
86	#define PROF_UCNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_UCNTS_COMMON)
87
88	extern char PROF_NAME_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
89	extern char PROF_NAME_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
90	extern char PROF_CNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
91	extern char PROF_CNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
92	extern __llvm_profile_data PROF_DATA_START[] COMPILER_RT_VISIBILITY
93	COMPILER_RT_WEAK;
94	extern __llvm_profile_data PROF_DATA_STOP[] COMPILER_RT_VISIBILITY
95	COMPILER_RT_WEAK;
96	extern char PROF_UCNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
97	extern char PROF_UCNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
98
99	// AMDGPU is a proper ELF target and exports the linker-defined section bounds.
100	COMPILER_RT_GPU_VISIBILITY
101	__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
102	PROF_NAME_START, PROF_NAME_STOP, PROF_CNTS_START,
103	PROF_CNTS_STOP, PROF_DATA_START, PROF_DATA_STOP,
104	PROF_UCNTS_START, PROF_UCNTS_STOP, &INSTR_PROF_RAW_VERSION_VAR};
105
106	#elif defined(__NVPTX__)
107
108	// NVPTX supports neither sections nor ELF symbols, we rely on the handling in
109	// the 'InstrProfilingPlatformOther.c' file to fill this at initialization time.
110	// FIXME: This will not work until we make the NVPTX backend emit section
111	// globals next to each other.
112	COMPILER_RT_GPU_VISIBILITY
113	__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
114	NULL, NULL, NULL,
115	NULL, NULL, NULL,
116	NULL, NULL, &INSTR_PROF_RAW_VERSION_VAR};
117	#endif
118
119	#endif
120

Browse the source code of llvm_runtimes/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c