//===- DataAccessProf.h - Data access profile format support ----*- C++ -*-===//
3//
4// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5// See https://llvm.org/LICENSE.txt for license information.
6// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7//
8//===----------------------------------------------------------------------===//
9//
10// This file contains support to construct and use data access profiles.
11//
12// For the original RFC of this pass please see
13// https://discourse.llvm.org/t/rfc-profile-guided-static-data-partitioning/83744
14//
15//===----------------------------------------------------------------------===//
16
17#ifndef LLVM_PROFILEDATA_DATAACCESSPROF_H_
18#define LLVM_PROFILEDATA_DATAACCESSPROF_H_
19
20#include "llvm/ADT/DenseMapInfoVariant.h"
21#include "llvm/ADT/MapVector.h"
22#include "llvm/ADT/SetVector.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/ADT/StringRef.h"
25#include "llvm/ProfileData/InstrProf.h"
26#include "llvm/Support/Allocator.h"
27#include "llvm/Support/Compiler.h"
28#include "llvm/Support/Error.h"
29#include "llvm/Support/StringSaver.h"
30
31#include <cstdint>
32#include <optional>
33#include <variant>
34
35namespace llvm {
36
37namespace memprof {
38
39/// The location of data in the source code. Used by profile lookup API.
40struct SourceLocation {
41 SourceLocation(StringRef FileNameRef, uint32_t Line)
42 : FileName(FileNameRef.str()), Line(Line) {}
43
44 // Empty constructor is used in yaml conversion.
45 SourceLocation() {}
46 /// The filename where the data is located.
47 std::string FileName;
48 /// The line number in the source code.
49 uint32_t Line;
50};
51
52namespace internal {
53
54// Conceptually similar to SourceLocation except that FileNames are StringRef of
55// which strings are owned by `DataAccessProfData`. Used by `DataAccessProfData`
56// to represent data locations internally.
57struct SourceLocationRef {
58 SourceLocationRef(StringRef FileNameRef, uint32_t Line)
59 : FileName(FileNameRef), Line(Line) {}
60 // The filename where the data is located.
61 StringRef FileName;
62 // The line number in the source code.
63 uint32_t Line;
64};
65
66// The data access profiles for a symbol. Used by `DataAccessProfData`
67// to represent records internally.
68struct DataAccessProfRecordRef {
69 DataAccessProfRecordRef(uint64_t SymbolID, uint64_t AccessCount,
70 bool IsStringLiteral)
71 : SymbolID(SymbolID), AccessCount(AccessCount),
72 IsStringLiteral(IsStringLiteral) {}
73
74 // Represents a data symbol. The semantic comes in two forms: a symbol index
75 // for symbol name if `IsStringLiteral` is false, or the hash of a string
76 // content if `IsStringLiteral` is true. For most of the symbolizable static
77 // data, the mangled symbol names remain stable relative to the source code
78 // and therefore used to identify symbols across binary releases. String
79 // literals have unstable name patterns like `.str.N[.llvm.hash]`, so we use
80 // the content hash instead. This is a required field.
81 uint64_t SymbolID;
82
83 // The access count of symbol. Required.
84 uint64_t AccessCount;
85
86 // True iff this is a record for string literal (symbols with name pattern
87 // `.str.*` in the symbol table). Required.
88 bool IsStringLiteral;
89
90 // The locations of data in the source code. Optional.
91 llvm::SmallVector<SourceLocationRef, 0> Locations;
92};
93} // namespace internal
94
95// SymbolID is either a string representing symbol name if the symbol has
96// stable mangled name relative to source code, or a uint64_t representing the
97// content hash of a string literal (with unstable name patterns like
98// `.str.N[.llvm.hash]`). The StringRef is owned by the class's saver object.
99using SymbolHandleRef = std::variant<StringRef, uint64_t>;
100
101// The senamtic is the same as `SymbolHandleRef` above. The strings are owned.
102using SymbolHandle = std::variant<std::string, uint64_t>;
103
104/// The data access profiles for a symbol.
105struct DataAccessProfRecord {
106public:
107 DataAccessProfRecord(SymbolHandleRef SymHandleRef, uint64_t AccessCount,
108 ArrayRef<internal::SourceLocationRef> LocRefs)
109 : AccessCount(AccessCount) {
110 if (std::holds_alternative<StringRef>(v: SymHandleRef)) {
111 SymHandle = std::get<StringRef>(v&: SymHandleRef).str();
112 } else
113 SymHandle = std::get<uint64_t>(v&: SymHandleRef);
114
115 for (auto Loc : LocRefs)
116 Locations.emplace_back(Args&: Loc.FileName, Args&: Loc.Line);
117 }
118 // Empty constructor is used in yaml conversion.
119 DataAccessProfRecord() : AccessCount(0) {}
120 SymbolHandle SymHandle;
121 uint64_t AccessCount;
122 // The locations of data in the source code. Optional.
123 SmallVector<SourceLocation> Locations;
124};
125
126/// Encapsulates the data access profile data and the methods to operate on
127/// it. This class provides profile look-up, serialization and
128/// deserialization.
129class DataAccessProfData {
130public:
131 // Use MapVector to keep input order of strings for serialization and
132 // deserialization.
133 using StringToIndexMap = llvm::MapVector<StringRef, uint64_t>;
134
135 DataAccessProfData() : Saver(Allocator) {}
136
137 /// Serialize profile data to the output stream.
138 /// Storage layout:
139 /// - Serialized strings.
140 /// - The encoded hashes.
141 /// - Records.
142 LLVM_ABI Error serialize(ProfOStream &OS) const;
143
144 /// Deserialize this class from the given buffer.
145 LLVM_ABI Error deserialize(const unsigned char *&Ptr);
146
147 /// Returns a profile record for \p SymbolID, or std::nullopt if there
148 /// isn't a record. Internally, this function will canonicalize the symbol
149 /// name before the lookup.
150 LLVM_ABI std::optional<DataAccessProfRecord>
151 getProfileRecord(const SymbolHandleRef SymID) const;
152
153 /// Returns true if \p SymID is seen in profiled binaries and cold.
154 LLVM_ABI bool isKnownColdSymbol(const SymbolHandleRef SymID) const;
155
156 /// Methods to set symbolized data access profile. Returns error if
157 /// duplicated symbol names or content hashes are seen. The user of this
158 /// class should aggregate counters that correspond to the same symbol name
159 /// or with the same string literal hash before calling 'set*' methods.
160 LLVM_ABI Error setDataAccessProfile(SymbolHandleRef SymbolID,
161 uint64_t AccessCount);
162 /// Similar to the method above, for records with \p Locations representing
163 /// the `filename:line` where this symbol shows up. Note because of linker's
164 /// merge of identical symbols (e.g., unnamed_addr string literals), one
165 /// symbol is likely to have multiple locations.
166 LLVM_ABI Error setDataAccessProfile(SymbolHandleRef SymbolID,
167 uint64_t AccessCount,
168 ArrayRef<SourceLocation> Locations);
169 /// Add a symbol that's seen in the profiled binary without samples.
170 LLVM_ABI Error addKnownSymbolWithoutSamples(SymbolHandleRef SymbolID);
171
172 /// The following methods return array reference for various internal data
173 /// structures.
174 ArrayRef<StringToIndexMap::value_type> getStrToIndexMapRef() const {
175 return StrToIndexMap.getArrayRef();
176 }
177 ArrayRef<
178 MapVector<SymbolHandleRef, internal::DataAccessProfRecordRef>::value_type>
179 getRecords() const {
180 return Records.getArrayRef();
181 }
182 ArrayRef<StringRef> getKnownColdSymbols() const {
183 return KnownColdSymbols.getArrayRef();
184 }
185 ArrayRef<uint64_t> getKnownColdHashes() const {
186 return KnownColdHashes.getArrayRef();
187 }
188
189private:
190 /// Serialize the symbol strings into the output stream.
191 Error serializeSymbolsAndFilenames(ProfOStream &OS) const;
192
193 /// Deserialize the symbol strings from \p Ptr and increment \p Ptr to the
194 /// start of the next payload.
195 Error deserializeSymbolsAndFilenames(const unsigned char *&Ptr,
196 const uint64_t NumSampledSymbols,
197 const uint64_t NumColdKnownSymbols);
198
199 /// Decode the records and increment \p Ptr to the start of the next
200 /// payload.
201 Error deserializeRecords(const unsigned char *&Ptr);
202
203 /// A helper function to compute a storage index for \p SymbolID.
204 uint64_t getEncodedIndex(const SymbolHandleRef SymbolID) const;
205
206 // Keeps owned copies of the input strings.
207 // NOTE: Keep `Saver` initialized before other class members that reference
208 // its string copies and destructed after they are destructed.
209 llvm::BumpPtrAllocator Allocator;
210 llvm::UniqueStringSaver Saver;
211
212 // `Records` stores the records.
213 MapVector<SymbolHandleRef, internal::DataAccessProfRecordRef> Records;
214
215 StringToIndexMap StrToIndexMap;
216 llvm::SetVector<uint64_t> KnownColdHashes;
217 llvm::SetVector<StringRef> KnownColdSymbols;
218};
219
220} // namespace memprof
221} // namespace llvm
222
223#endif // LLVM_PROFILEDATA_DATAACCESSPROF_H_
224