/*===- InstrProfilingValue.c - Support library for PGO instrumentation ----===*\
|*
|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|* See https://llvm.org/LICENSE.txt for license information.
|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|*
\*===----------------------------------------------------------------------===*/

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "InstrProfiling.h"
#include "InstrProfilingInternal.h"
#include "InstrProfilingUtil.h"

#define INSTR_PROF_VALUE_PROF_DATA
#define INSTR_PROF_COMMON_API_IMPL
#define INSTR_PROF_VALUE_PROF_MEMOP_API
#include "profile/InstrProfData.inc"

static int hasStaticCounters = 1;
static int OutOfNodesWarnings = 0;
static int hasNonDefaultValsPerSite = 0;
#define INSTR_PROF_MAX_VP_WARNS 10
#define INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE 24
#define INSTR_PROF_VNODE_POOL_SIZE 1024

#ifndef _MSC_VER
/* A shared static pool in addition to the vnodes statically
 * allocated by the compiler. */
COMPILER_RT_VISIBILITY ValueProfNode
    lprofValueProfNodes[INSTR_PROF_VNODE_POOL_SIZE] COMPILER_RT_SECTION(
        COMPILER_RT_SEG INSTR_PROF_VNODES_SECT_NAME);
#endif

COMPILER_RT_VISIBILITY uint32_t VPMaxNumValsPerSite =
    INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE;

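/* Read the optional LLVM_VP_MAX_NUM_VALS_PER_SITE environment variable to
 * override the default cap on the number of values tracked per value site,
 * clamped to INSTR_PROF_MAX_NUM_VAL_PER_SITE. */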
COMPILER_RT_VISIBILITY void lprofSetupValueProfiler(void) {
  const char *Str = 0;
  Str = getenv("LLVM_VP_MAX_NUM_VALS_PER_SITE");
  if (Str && Str[0]) {
    VPMaxNumValsPerSite = atoi(Str);
    hasNonDefaultValsPerSite = 1;
  }
  if (VPMaxNumValsPerSite > INSTR_PROF_MAX_NUM_VAL_PER_SITE)
    VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;
}

COMPILER_RT_VISIBILITY void lprofSetMaxValsPerSite(uint32_t MaxVals) {
  VPMaxNumValsPerSite = MaxVals;
  hasNonDefaultValsPerSite = 1;
}

/* This method is only used in value profiler mock testing. */
COMPILER_RT_VISIBILITY void
__llvm_profile_set_num_value_sites(__llvm_profile_data *Data,
                                   uint32_t ValueKind, uint16_t NumValueSites) {
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wcast-qual"
#elif defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
#endif
  *((uint16_t *)&Data->NumValueSites[ValueKind]) = NumValueSites;
#ifdef __GNUC__
#pragma GCC diagnostic pop
#elif defined(__clang__)
#pragma clang diagnostic pop
#endif
}

/* This method is only used in value profiler mock testing. */
COMPILER_RT_VISIBILITY const __llvm_profile_data *
__llvm_profile_iterate_data(const __llvm_profile_data *Data) {
  return Data + 1;
}

/* This method is only used in value profiler mock testing. */
COMPILER_RT_VISIBILITY void *
__llvm_get_function_addr(const __llvm_profile_data *Data) {
  return Data->FunctionPointer;
}

/* Allocate an array that holds the pointers to the linked lists of
 * value profile counter nodes. The number of elements in the array
 * is the total number of value profile sites instrumented. Returns
 * 0 if allocation fails.
 */

static int allocateValueProfileCounters(__llvm_profile_data *Data) {
  uint64_t NumVSites = 0;
  uint32_t VKI;

  /* This function will never be called when the value site array is allocated
     statically at compile time. */
  hasStaticCounters = 0;
  /* When dynamic allocation is enabled, allow tracking the max number of
   * values allowed. */
  if (!hasNonDefaultValsPerSite)
    VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;

  for (VKI = IPVK_First; VKI <= IPVK_Last; ++VKI)
    NumVSites += Data->NumValueSites[VKI];

  // If NumVSites = 0, calloc is allowed to return a non-null pointer.
  assert(NumVSites > 0 && "NumVSites can't be zero");
  ValueProfNode **Mem =
      (ValueProfNode **)calloc(NumVSites, sizeof(ValueProfNode *));
  if (!Mem)
    return 0;
  if (!COMPILER_RT_BOOL_CMPXCHG(&Data->Values, 0, Mem)) {
    free(Mem);
    return 0;
  }
  return 1;
}

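/* Return a fresh ValueProfNode: heap-allocated when counters are allocated
 * dynamically, otherwise carved out of the static vnode pool placed in the
 * vnodes section. Returns 0 when the static pool is exhausted. */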
static ValueProfNode *allocateOneNode(void) {
  ValueProfNode *Node;

  if (!hasStaticCounters)
    return (ValueProfNode *)calloc(1, sizeof(ValueProfNode));

  /* Early check to avoid value wrapping around. */
  if (CurrentVNode + 1 > EndVNode) {
    if (OutOfNodesWarnings++ < INSTR_PROF_MAX_VP_WARNS) {
      PROF_WARN("Unable to track new values: %s. "
                " Consider using option -mllvm -vp-counters-per-site=<n> to "
                "allocate more"
                " value profile counters at compile time. \n",
                "Running out of static counters");
    }
    return 0;
  }
  Node = COMPILER_RT_PTR_FETCH_ADD(ValueProfNode, CurrentVNode, 1);
  /* Due to section padding, EndVNode may point into the middle of an
   * incomplete VNode, so we need to skip the last incomplete node. */
  if (Node + 1 > EndVNode)
    return 0;

  return Node;
}

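/* Common implementation for the __llvm_profile_instrument_target* entry
 * points: walk the linked list for the given counter index, bump the count
 * of a matching node, or append a new node; when the site already tracks
 * VPMaxNumValsPerSite values, decrement (and possibly replace) the
 * minimum-count node instead. */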
static COMPILER_RT_ALWAYS_INLINE void
instrumentTargetValueImpl(uint64_t TargetValue, void *Data,
                          uint32_t CounterIndex, uint64_t CountValue) {
  __llvm_profile_data *PData = (__llvm_profile_data *)Data;
  if (!PData)
    return;
  if (!CountValue)
    return;
  if (!PData->Values) {
    if (!allocateValueProfileCounters(PData))
      return;
  }

  ValueProfNode **ValueCounters = (ValueProfNode **)PData->Values;
  ValueProfNode *PrevVNode = NULL;
  ValueProfNode *MinCountVNode = NULL;
  ValueProfNode *CurVNode = ValueCounters[CounterIndex];
  uint64_t MinCount = UINT64_MAX;

  uint8_t VDataCount = 0;
  while (CurVNode) {
    if (TargetValue == CurVNode->Value) {
      CurVNode->Count += CountValue;
      return;
    }
    if (CurVNode->Count < MinCount) {
      MinCount = CurVNode->Count;
      MinCountVNode = CurVNode;
    }
    PrevVNode = CurVNode;
    CurVNode = CurVNode->Next;
    ++VDataCount;
  }

  if (VDataCount >= VPMaxNumValsPerSite) {
    /* Bump down the min count node's count. If it reaches 0,
     * evict it. This eviction/replacement policy makes hot
     * targets more sticky while cold targets less so. In other
     * words, it makes it less likely for the hot targets to be
     * prematurely evicted during the warmup/establishment period,
     * when their counts are still low. In a special case when
     * the number of values tracked is reduced to only one, this
     * policy will guarantee that the dominating target with >50%
     * total count will survive in the end. Note that this scheme
     * allows the runtime to track the min count node in an adaptive
     * manner. It can correct previous mistakes and eventually
     * lock on a cold target that is already in a stable state.
     *
     * In very rare cases, this replacement scheme may still lead
     * to target loss. For instance, out of \c N value slots, \c N-1
     * slots are occupied by lukewarm targets during the warmup
     * period and the remaining slot is competed for by two or more
     * very hot targets. If those hot targets occur in an interleaved
     * way, none of them will survive (gain enough weight to throw out
     * other established entries) due to the ping-pong effect.
     * To handle this situation, the user can choose to increase the max
     * number of tracked values per value site. Alternatively, a more
     * expensive eviction mechanism can be implemented. It requires
     * the runtime to track the total number of evictions per site.
     * When the total number of evictions reaches a certain threshold,
     * the runtime can wipe out more than one of the lowest-count entries
     * to give space for hot targets.
     */
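    /* Illustrative example (one slot, unit counts): a target that accounts
     * for 6 of 10 executions gains 6 from its own hits, while the other 4
     * occurrences can each cause at most one decrement or eviction, so the
     * dominating target still owns the slot at the end of the run --
     * essentially a majority-vote scheme. */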
    if (MinCountVNode->Count <= CountValue) {
      CurVNode = MinCountVNode;
      CurVNode->Value = TargetValue;
      CurVNode->Count = CountValue;
    } else
      MinCountVNode->Count -= CountValue;

    return;
  }

  CurVNode = allocateOneNode();
  if (!CurVNode)
    return;
  CurVNode->Value = TargetValue;
  CurVNode->Count += CountValue;

  uint32_t Success = 0;
  if (!ValueCounters[CounterIndex])
    Success =
        COMPILER_RT_BOOL_CMPXCHG(&ValueCounters[CounterIndex], 0, CurVNode);
  else if (PrevVNode && !PrevVNode->Next)
    Success = COMPILER_RT_BOOL_CMPXCHG(&(PrevVNode->Next), 0, CurVNode);

  if (!Success && !hasStaticCounters) {
    free(CurVNode);
    return;
  }
}

COMPILER_RT_VISIBILITY void
__llvm_profile_instrument_target(uint64_t TargetValue, void *Data,
                                 uint32_t CounterIndex) {
  instrumentTargetValueImpl(TargetValue, Data, CounterIndex, 1);
}
COMPILER_RT_VISIBILITY void
__llvm_profile_instrument_target_value(uint64_t TargetValue, void *Data,
                                       uint32_t CounterIndex,
                                       uint64_t CountValue) {
  instrumentTargetValueImpl(TargetValue, Data, CounterIndex, CountValue);
}

/*
 * The target values are partitioned into multiple ranges. The range spec is
 * defined in InstrProfData.inc.
 */
COMPILER_RT_VISIBILITY void
__llvm_profile_instrument_memop(uint64_t TargetValue, void *Data,
                                uint32_t CounterIndex) {
  // Map the target value to the representative value of its range.
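  // (For example, small sizes are kept as-is while larger sizes collapse to
  // one representative value per range; see InstrProfGetRangeRepValue in
  // InstrProfData.inc for the exact mapping.)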
  uint64_t RepValue = InstrProfGetRangeRepValue(TargetValue);
  __llvm_profile_instrument_target(RepValue, Data, CounterIndex);
}

/*
 * A wrapper struct that represents value profile runtime data.
 * Like the InstrProfRecord class used by the profiling host tools,
 * ValueProfRuntimeRecord implements the abstract interfaces defined in
 * ValueProfRecordClosure so that the runtime data can be serialized using
 * the shared C implementation.
 */
typedef struct ValueProfRuntimeRecord {
  const __llvm_profile_data *Data;
  ValueProfNode **NodesKind[IPVK_Last + 1];
  uint8_t **SiteCountArray;
} ValueProfRuntimeRecord;

/* ValueProfRecordClosure Interface implementation. */

static uint32_t getNumValueSitesRT(const void *R, uint32_t VK) {
  return ((const ValueProfRuntimeRecord *)R)->Data->NumValueSites[VK];
}

static uint32_t getNumValueDataRT(const void *R, uint32_t VK) {
  uint32_t S = 0, I;
  const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
  if (Record->SiteCountArray[VK] == INSTR_PROF_NULLPTR)
    return 0;
  for (I = 0; I < Record->Data->NumValueSites[VK]; I++)
    S += Record->SiteCountArray[VK][I];
  return S;
}

static uint32_t getNumValueDataForSiteRT(const void *R, uint32_t VK,
                                         uint32_t S) {
  const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
  return Record->SiteCountArray[VK][S];
}

static ValueProfRuntimeRecord RTRecord;
static ValueProfRecordClosure RTRecordClosure = {
    &RTRecord,          INSTR_PROF_NULLPTR, /* GetNumValueKinds */
    getNumValueSitesRT, getNumValueDataRT,  getNumValueDataForSiteRT,
    INSTR_PROF_NULLPTR, /* RemapValueData */
    INSTR_PROF_NULLPTR, /* GetValueForSite, */
    INSTR_PROF_NULLPTR  /* AllocValueProfData */
};

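/* Bind RTRecord to the given profile data and fill SiteCountArray with the
 * number of tracked values at each site (capped at UCHAR_MAX). Returns the
 * number of value kinds that have at least one instrumented site. */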
static uint32_t
initializeValueProfRuntimeRecord(const __llvm_profile_data *Data,
                                 uint8_t *SiteCountArray[]) {
  unsigned I, J, S = 0, NumValueKinds = 0;
  ValueProfNode **Nodes = (ValueProfNode **)Data->Values;
  RTRecord.Data = Data;
  RTRecord.SiteCountArray = SiteCountArray;
  for (I = 0; I <= IPVK_Last; I++) {
    uint16_t N = Data->NumValueSites[I];
    if (!N)
      continue;

    NumValueKinds++;

    RTRecord.NodesKind[I] = Nodes ? &Nodes[S] : INSTR_PROF_NULLPTR;
    for (J = 0; J < N; J++) {
      /* Compute value count for each site. */
      uint32_t C = 0;
      ValueProfNode *Site =
          Nodes ? RTRecord.NodesKind[I][J] : INSTR_PROF_NULLPTR;
      while (Site) {
        C++;
        Site = Site->Next;
      }
      if (C > UCHAR_MAX)
        C = UCHAR_MAX;
      RTRecord.SiteCountArray[I][J] = C;
    }
    S += N;
  }
  return NumValueKinds;
}

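/* Copy the next N (Value, Count) pairs for the given site into Dst, starting
 * from StartNode (or from the head of the site's list when StartNode is
 * null), and return the node following the last one copied. */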
static ValueProfNode *getNextNValueData(uint32_t VK, uint32_t Site,
                                        InstrProfValueData *Dst,
                                        ValueProfNode *StartNode, uint32_t N) {
  unsigned I;
  ValueProfNode *VNode = StartNode ? StartNode : RTRecord.NodesKind[VK][Site];
  for (I = 0; I < N; I++) {
    Dst[I].Value = VNode->Value;
    Dst[I].Count = VNode->Count;
    VNode = VNode->Next;
  }
  return VNode;
}

static uint32_t getValueProfDataSizeWrapper(void) {
  return getValueProfDataSize(&RTRecordClosure);
}

static uint32_t getNumValueDataForSiteWrapper(uint32_t VK, uint32_t S) {
  return getNumValueDataForSiteRT(&RTRecord, VK, S);
}

static VPDataReaderType TheVPDataReader = {
    initializeValueProfRuntimeRecord, getValueProfRecordHeaderSize,
    getFirstValueProfRecord,          getNumValueDataForSiteWrapper,
    getValueProfDataSizeWrapper,      getNextNValueData};

COMPILER_RT_VISIBILITY VPDataReaderType *lprofGetVPDataReader(void) {
  return &TheVPDataReader;
}