1/*===- InstrProfilingPlatformGPU.c - GPU profiling support ----------------===*\
2|*
3|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4|* See https://llvm.org/LICENSE.txt for license information.
5|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6|*
7\*===----------------------------------------------------------------------===*/
8
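// GPU-specific profiling functions for AMDGPU and NVPTX targets. This file
// provides:
//   - __llvm_profile_instrument_gpu, the wave-cooperative counter increment.
//   - INSTR_PROF_SECT_BOUNDS_TABLE, the table of profile section bounds
//     exported to the GPU runtime.
//
// Platform plumbing (section boundaries, binary IDs, VNodes) is handled by
// InstrProfilingPlatformLinux.c via the COMPILER_RT_PROFILE_BAREMETAL path.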
14
15#if defined(__NVPTX__) || defined(__AMDGPU__)
16
17#include "InstrProfiling.h"
18#include <gpuintrin.h>
19
// Symbols exported to the GPU runtime need to be visible in the .dynsym table.
// This attribute is applied to the INSTR_PROF_SECT_BOUNDS_TABLE definitions
// in this file.
#define COMPILER_RT_GPU_VISIBILITY __attribute__((visibility("protected")))
22
// Reports whether the current wave is fully occupied: returns nonzero when
// `mask` equals the mask that has exactly the low __gpu_num_lanes() bits set,
// and zero otherwise.
static int is_uniform(uint64_t mask) {
  // Shifting an all-ones 64-bit word right by the number of unused high bits
  // leaves one set bit per lane of the wave.
  return mask == (~0ull >> (64 - __gpu_num_lanes()));
}
28
29// Wave-cooperative counter increment. The instrumentation pass emits calls to
30// this in place of the default non-atomic load/add/store or atomicrmw sequence.
31// The optional uniform counter allows calculating wave uniformity if present.
32COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter,
33 uint64_t *uniform,
34 uint64_t step) {
35 uint64_t mask = __gpu_lane_mask();
36 if (__gpu_is_first_in_lane(mask)) {
37 __scoped_atomic_fetch_add(counter, step * __builtin_popcountg(mask),
38 __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
39 if (uniform && is_uniform(mask))
40 __scoped_atomic_fetch_add(uniform, step * __builtin_popcountg(mask),
41 __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE);
42 }
43}
44
45#if defined(__AMDGPU__)
46
// Names of the start/stop symbols that delimit the profile name, counter, and
// data sections. Each is produced by applying INSTR_PROF_SECT_START or
// INSTR_PROF_SECT_STOP to the corresponding common section name.
#define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON)
#define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON)
#define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON)
#define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON)
#define PROF_DATA_START INSTR_PROF_SECT_START(INSTR_PROF_DATA_COMMON)
#define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON)

// Declarations of the section bound symbols named above. The name and counter
// bounds are declared as raw bytes; the data bounds are declared as arrays of
// __llvm_profile_data records.
// NOTE(review): the symbols are declared COMPILER_RT_WEAK, presumably so that
// an absent section does not break the link -- confirm.
extern char PROF_NAME_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_NAME_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_CNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_CNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern __llvm_profile_data PROF_DATA_START[] COMPILER_RT_VISIBILITY
    COMPILER_RT_WEAK;
extern __llvm_profile_data PROF_DATA_STOP[] COMPILER_RT_VISIBILITY
    COMPILER_RT_WEAK;
62
// AMDGPU is a proper ELF target and exports the linker-defined section bounds.
// The table is initialized positionally with the start/stop symbol pairs for
// the name, counter, and data sections (in that order), followed by the
// address of the raw profile version variable.
COMPILER_RT_GPU_VISIBILITY
__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
    PROF_NAME_START,
    PROF_NAME_STOP,
    PROF_CNTS_START,
    PROF_CNTS_STOP,
    PROF_DATA_START,
    PROF_DATA_STOP,
    &INSTR_PROF_RAW_VERSION_VAR};
73
74#elif defined(__NVPTX__)
75
// NVPTX supports neither sections nor ELF symbols, so the six section-bound
// entries start out as NULL and only the address of the raw profile version
// variable is set statically. We rely on the handling in the
// 'InstrProfilingPlatformOther.c' file to fill this at initialization time.
// FIXME: This will not work until we make the NVPTX backend emit section
//        globals next to each other.
COMPILER_RT_GPU_VISIBILITY
__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
    NULL, NULL, NULL, NULL, NULL, NULL, &INSTR_PROF_RAW_VERSION_VAR};
83
84#endif
85
86#endif
87