1//===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The data structures defined in this file are based on the reference
10// implementation which is available at
11// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
16#include "llvm/DebugInfo/CodeView/RecordName.h"
17#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
18#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
19#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
20#include "llvm/DebugInfo/MSF/MSFBuilder.h"
21#include "llvm/DebugInfo/MSF/MSFCommon.h"
22#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
23#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
24#include "llvm/DebugInfo/PDB/Native/Hash.h"
25#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
26#include "llvm/Support/BinaryItemStream.h"
27#include "llvm/Support/BinaryStreamWriter.h"
28#include "llvm/Support/FormatVariadic.h"
29#include "llvm/Support/Parallel.h"
30#include "llvm/Support/TimeProfiler.h"
31#include "llvm/Support/xxhash.h"
32#include <algorithm>
33#include <vector>
34
35using namespace llvm;
36using namespace llvm::msf;
37using namespace llvm::pdb;
38using namespace llvm::codeview;
39
40// Helper class for building the public and global PDB hash table buckets.
41struct llvm::pdb::GSIHashStreamBuilder {
42 // Sum of the size of all public or global records.
43 uint64_t RecordByteSize = 0;
44
45 std::vector<PSHashRecord> HashRecords;
46
47 // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The
48 // reference implementation builds a hash table with IPHR_HASH buckets in it.
49 // The last bucket is used to link together free hash table cells in a linked
50 // list, but it is always empty in the compressed, on-disk format. However,
51 // the bitmap must have a bit for it.
52 std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
53
54 std::vector<support::ulittle32_t> HashBuckets;
55
56 uint32_t calculateSerializedLength() const;
57 Error commit(BinaryStreamWriter &Writer);
58
59 void finalizePublicBuckets();
60 void finalizeGlobalBuckets(uint32_t RecordZeroOffset);
61
62 // Assign public and global symbol records into hash table buckets.
63 // Modifies the list of records to store the bucket index, but does not
64 // change the order.
65 void finalizeBuckets(uint32_t RecordZeroOffset,
66 MutableArrayRef<BulkPublic> Globals);
67};
68
69// DenseMapInfo implementation for deduplicating symbol records.
70struct llvm::pdb::SymbolDenseMapInfo {
71 static unsigned getHashValue(const CVSymbol &Val) {
72 return xxh3_64bits(data: Val.RecordData);
73 }
74 static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
75 return LHS.RecordData == RHS.RecordData;
76 }
77};
78
79namespace {
80LLVM_PACKED_START
81struct PublicSym32Layout {
82 RecordPrefix Prefix;
83 PublicSym32Header Pub;
84 // char Name[];
85};
86LLVM_PACKED_END
87} // namespace
88
89// Calculate how much memory this public needs when serialized.
90static uint32_t sizeOfPublic(const BulkPublic &Pub) {
91 uint32_t NameLen = Pub.NameLen;
92 NameLen = std::min(a: NameLen,
93 b: uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
94 return alignTo(Value: sizeof(PublicSym32Layout) + NameLen + 1, Align: 4);
95}
96
97static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
98 // Assume the caller has allocated sizeOfPublic bytes.
99 uint32_t NameLen = std::min(
100 a: Pub.NameLen, b: uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
101 size_t Size = alignTo(Value: sizeof(PublicSym32Layout) + NameLen + 1, Align: 4);
102 assert(Size == sizeOfPublic(Pub));
103 auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
104 FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
105 FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
106 FixedMem->Pub.Flags = Pub.Flags;
107 FixedMem->Pub.Offset = Pub.Offset;
108 FixedMem->Pub.Segment = Pub.Segment;
109 char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
110 memcpy(dest: NameMem, src: Pub.Name, n: NameLen);
111 // Zero the null terminator and remaining bytes.
112 memset(s: &NameMem[NameLen], c: 0, n: Size - sizeof(PublicSym32Layout) - NameLen);
113 return CVSymbol(ArrayRef(Mem, Size));
114}
115
116uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
117 uint32_t Size = sizeof(GSIHashHeader);
118 Size += HashRecords.size() * sizeof(PSHashRecord);
119 Size += HashBitmap.size() * sizeof(uint32_t);
120 Size += HashBuckets.size() * sizeof(uint32_t);
121 return Size;
122}
123
124Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
125 GSIHashHeader Header;
126 Header.VerSignature = GSIHashHeader::HdrSignature;
127 Header.VerHdr = GSIHashHeader::HdrVersion;
128 Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
129 Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;
130
131 if (auto EC = Writer.writeObject(Obj: Header))
132 return EC;
133
134 if (auto EC = Writer.writeArray(Array: ArrayRef(HashRecords)))
135 return EC;
136 if (auto EC = Writer.writeArray(Array: ArrayRef(HashBitmap)))
137 return EC;
138 if (auto EC = Writer.writeArray(Array: ArrayRef(HashBuckets)))
139 return EC;
140 return Error::success();
141}
142
143static bool isAsciiString(StringRef S) {
144 return llvm::all_of(Range&: S, P: [](char C) { return unsigned(C) < 0x80; });
145}
146
147// See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
148static int gsiRecordCmp(StringRef S1, StringRef S2) {
149 size_t LS = S1.size();
150 size_t RS = S2.size();
151 // Shorter strings always compare less than longer strings.
152 if (LS != RS)
153 return (LS > RS) - (LS < RS);
154
155 // If either string contains non ascii characters, memcmp them.
156 if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
157 return memcmp(s1: S1.data(), s2: S2.data(), n: LS);
158
159 // Both strings are ascii, perform a case-insensitive comparison.
160 return S1.compare_insensitive(RHS: S2);
161}
162
163void GSIStreamBuilder::finalizePublicBuckets() {
164 PSH->finalizeBuckets(RecordZeroOffset: 0, Globals: Publics);
165}
166
167void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {
168 // Build up a list of globals to be bucketed. Use the BulkPublic data
169 // structure for this purpose, even though these are global records, not
170 // public records. Most of the same fields are required:
171 // - Name
172 // - NameLen
173 // - SymOffset
174 // - BucketIdx
175 // The dead fields are Offset, Segment, and Flags.
176 std::vector<BulkPublic> Records;
177 Records.resize(new_size: Globals.size());
178 uint32_t SymOffset = RecordZeroOffset;
179 for (size_t I = 0, E = Globals.size(); I < E; ++I) {
180 StringRef Name = getSymbolName(Sym: Globals[I]);
181 Records[I].Name = Name.data();
182 Records[I].NameLen = Name.size();
183 Records[I].SymOffset = SymOffset;
184 SymOffset += Globals[I].length();
185 }
186
187 GSH->finalizeBuckets(RecordZeroOffset, Globals: Records);
188}
189
190void GSIHashStreamBuilder::finalizeBuckets(
191 uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {
192 // Hash every name in parallel.
193 parallelFor(Begin: 0, End: Records.size(), Fn: [&](size_t I) {
194 Records[I].setBucketIdx(hashStringV1(Str: Records[I].getName()) % IPHR_HASH);
195 });
196
197 // Count up the size of each bucket. Then, use an exclusive prefix sum to
198 // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but
199 // we can't use it yet.
200 uint32_t BucketStarts[IPHR_HASH] = {0};
201 for (const BulkPublic &P : Records)
202 ++BucketStarts[P.BucketIdx];
203 uint32_t Sum = 0;
204 for (uint32_t &B : BucketStarts) {
205 uint32_t Size = B;
206 B = Sum;
207 Sum += Size;
208 }
209
210 // Place globals into the hash table in bucket order. When placing a global,
211 // update the bucket start. Every hash table slot should be filled. Always use
212 // a refcount of one for now.
213 HashRecords.resize(new_size: Records.size());
214 uint32_t BucketCursors[IPHR_HASH];
215 memcpy(dest: BucketCursors, src: BucketStarts, n: sizeof(BucketCursors));
216 for (int I = 0, E = Records.size(); I < E; ++I) {
217 uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++;
218 HashRecords[HashIdx].Off = I;
219 HashRecords[HashIdx].CRef = 1;
220 }
221
222 // Within the buckets, sort each bucket by memcmp of the symbol's name. It's
223 // important that we use the same sorting algorithm as is used by the
224 // reference implementation to ensure that the search for a record within a
225 // bucket can properly early-out when it detects the record won't be found.
226 // The algorithm used here corresponds to the function
227 // caseInsensitiveComparePchPchCchCch in the reference implementation.
228 parallelFor(Begin: 0, End: IPHR_HASH, Fn: [&](size_t I) {
229 auto B = HashRecords.begin() + BucketStarts[I];
230 auto E = HashRecords.begin() + BucketCursors[I];
231 if (B == E)
232 return;
233 auto BucketCmp = [Records](const PSHashRecord &LHash,
234 const PSHashRecord &RHash) {
235 const BulkPublic &L = Records[uint32_t(LHash.Off)];
236 const BulkPublic &R = Records[uint32_t(RHash.Off)];
237 assert(L.BucketIdx == R.BucketIdx);
238 int Cmp = gsiRecordCmp(S1: L.getName(), S2: R.getName());
239 if (Cmp != 0)
240 return Cmp < 0;
241 // This comparison is necessary to make the sorting stable in the presence
242 // of two static globals with the same name. The easiest way to observe
243 // this is with S_LDATA32 records.
244 return L.SymOffset < R.SymOffset;
245 };
246 llvm::sort(Start: B, End: E, Comp: BucketCmp);
247
248 // After we are done sorting, replace the global indices with the stream
249 // offsets of each global. Add one when writing symbol offsets to disk.
250 // See GSI1::fixSymRecs.
251 for (PSHashRecord &HRec : make_range(x: B, y: E))
252 HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1;
253 });
254
255 // For each non-empty bucket, push the bucket start offset into HashBuckets
256 // and set a bit in the hash bitmap.
257 for (uint32_t I = 0; I < HashBitmap.size(); ++I) {
258 uint32_t Word = 0;
259 for (uint32_t J = 0; J < 32; ++J) {
260 // Skip empty buckets.
261 uint32_t BucketIdx = I * 32 + J;
262 if (BucketIdx >= IPHR_HASH ||
263 BucketStarts[BucketIdx] == BucketCursors[BucketIdx])
264 continue;
265 Word |= (1U << J);
266
267 // Calculate what the offset of the first hash record in the chain would
268 // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
269 // each record would be 12 bytes. See HROffsetCalc in gsi.h.
270 const int SizeOfHROffsetCalc = 12;
271 ulittle32_t ChainStartOff =
272 ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc);
273 HashBuckets.push_back(x: ChainStartOff);
274 }
275 HashBitmap[I] = Word;
276 }
277}
278
279GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
280 : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
281 GSH(std::make_unique<GSIHashStreamBuilder>()) {}
282
283GSIStreamBuilder::~GSIStreamBuilder() = default;
284
285uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
286 uint32_t Size = 0;
287 Size += sizeof(PublicsStreamHeader);
288 Size += PSH->calculateSerializedLength();
289 Size += Publics.size() * sizeof(uint32_t); // AddrMap
290 // FIXME: Add thunk map and section offsets for incremental linking.
291
292 return Size;
293}
294
295uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
296 return GSH->calculateSerializedLength();
297}
298
299Error GSIStreamBuilder::finalizeMsfLayout() {
300 // First we write public symbol records, then we write global symbol records.
301 finalizePublicBuckets();
302 finalizeGlobalBuckets(RecordZeroOffset: PSH->RecordByteSize);
303
304 Expected<uint32_t> Idx = Msf.addStream(Size: calculateGlobalsHashStreamSize());
305 if (!Idx)
306 return Idx.takeError();
307 GlobalsStreamIndex = *Idx;
308
309 Idx = Msf.addStream(Size: calculatePublicsHashStreamSize());
310 if (!Idx)
311 return Idx.takeError();
312 PublicsStreamIndex = *Idx;
313
314 uint64_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize;
315 if (RecordBytes > UINT32_MAX)
316 return make_error<StringError>(
317 Args: formatv(Fmt: "the public symbols ({0} bytes) and global symbols ({1} bytes) "
318 "are too large to fit in a PDB file; "
319 "the maximum total is {2} bytes.",
320 Vals&: PSH->RecordByteSize, Vals&: GSH->RecordByteSize, UINT32_MAX),
321 Args: inconvertibleErrorCode());
322
323 Idx = Msf.addStream(Size: RecordBytes);
324 if (!Idx)
325 return Idx.takeError();
326 RecordStreamIndex = *Idx;
327 return Error::success();
328}
329
330void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {
331 assert(Publics.empty() && PSH->RecordByteSize == 0 &&
332 "publics can only be added once");
333 Publics = std::move(PublicsIn);
334
335 // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
336 parallelSort(R&: Publics, Comp: [](const BulkPublic &L, const BulkPublic &R) {
337 return L.getName() < R.getName();
338 });
339
340 // Assign offsets and calculate the length of the public symbol records.
341 uint32_t SymOffset = 0;
342 for (BulkPublic &Pub : Publics) {
343 Pub.SymOffset = SymOffset;
344 SymOffset += sizeOfPublic(Pub);
345 }
346
347 // Remember the length of the public stream records.
348 PSH->RecordByteSize = SymOffset;
349}
350
351void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
352 serializeAndAddGlobal(Symbol: Sym);
353}
354
355void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
356 serializeAndAddGlobal(Symbol: Sym);
357}
358
359void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
360 serializeAndAddGlobal(Symbol: Sym);
361}
362
363template <typename T>
364void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {
365 T Copy(Symbol);
366 addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
367 CodeViewContainer::Pdb));
368}
369
370void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {
371 // Ignore duplicate typedefs and constants.
372 if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
373 auto Iter = GlobalsSeen.insert(V: Symbol);
374 if (!Iter.second)
375 return;
376 }
377 GSH->RecordByteSize += Symbol.length();
378 Globals.push_back(x: Symbol);
379}
380
381// Serialize each public and write it.
382static Error writePublics(BinaryStreamWriter &Writer,
383 ArrayRef<BulkPublic> Publics) {
384 std::vector<uint8_t> Storage;
385 for (const BulkPublic &Pub : Publics) {
386 Storage.resize(new_size: sizeOfPublic(Pub));
387 serializePublic(Mem: Storage.data(), Pub);
388 if (Error E = Writer.writeBytes(Buffer: Storage))
389 return E;
390 }
391 return Error::success();
392}
393
394static Error writeRecords(BinaryStreamWriter &Writer,
395 ArrayRef<CVSymbol> Records) {
396 BinaryItemStream<CVSymbol> ItemStream(llvm::endianness::little);
397 ItemStream.setItems(Records);
398 BinaryStreamRef RecordsRef(ItemStream);
399 return Writer.writeStreamRef(Ref: RecordsRef);
400}
401
402Error GSIStreamBuilder::commitSymbolRecordStream(
403 WritableBinaryStreamRef Stream) {
404 BinaryStreamWriter Writer(Stream);
405
406 // Write public symbol records first, followed by global symbol records. This
407 // must match the order that we assume in finalizeMsfLayout when computing
408 // PSHZero and GSHZero.
409 if (auto EC = writePublics(Writer, Publics))
410 return EC;
411 if (auto EC = writeRecords(Writer, Records: Globals))
412 return EC;
413
414 return Error::success();
415}
416
417static std::vector<support::ulittle32_t>
418computeAddrMap(ArrayRef<BulkPublic> Publics) {
419 // Build a parallel vector of indices into the Publics vector, and sort it by
420 // address.
421 std::vector<ulittle32_t> PubAddrMap;
422 PubAddrMap.reserve(n: Publics.size());
423 for (int I = 0, E = Publics.size(); I < E; ++I)
424 PubAddrMap.push_back(x: ulittle32_t(I));
425
426 auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {
427 const BulkPublic &L = Publics[LIdx];
428 const BulkPublic &R = Publics[RIdx];
429 if (L.Segment != R.Segment)
430 return L.Segment < R.Segment;
431 if (L.Offset != R.Offset)
432 return L.Offset < R.Offset;
433 // parallelSort is unstable, so we have to do name comparison to ensure
434 // that two names for the same location come out in a deterministic order.
435 return L.getName() < R.getName();
436 };
437 parallelSort(R&: PubAddrMap, Comp: AddrCmp);
438
439 // Rewrite the public symbol indices into symbol offsets.
440 for (ulittle32_t &Entry : PubAddrMap)
441 Entry = Publics[Entry].SymOffset;
442 return PubAddrMap;
443}
444
445Error GSIStreamBuilder::commitPublicsHashStream(
446 WritableBinaryStreamRef Stream) {
447 BinaryStreamWriter Writer(Stream);
448 PublicsStreamHeader Header;
449
450 // FIXME: Fill these in. They are for incremental linking.
451 Header.SymHash = PSH->calculateSerializedLength();
452 Header.AddrMap = Publics.size() * 4;
453 Header.NumThunks = 0;
454 Header.SizeOfThunk = 0;
455 Header.ISectThunkTable = 0;
456 memset(s: Header.Padding, c: 0, n: sizeof(Header.Padding));
457 Header.OffThunkTable = 0;
458 Header.NumSections = 0;
459 if (auto EC = Writer.writeObject(Obj: Header))
460 return EC;
461
462 if (auto EC = PSH->commit(Writer))
463 return EC;
464
465 std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);
466 assert(PubAddrMap.size() == Publics.size());
467 if (auto EC = Writer.writeArray(Array: ArrayRef(PubAddrMap)))
468 return EC;
469
470 return Error::success();
471}
472
473Error GSIStreamBuilder::commitGlobalsHashStream(
474 WritableBinaryStreamRef Stream) {
475 BinaryStreamWriter Writer(Stream);
476 return GSH->commit(Writer);
477}
478
479Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
480 WritableBinaryStreamRef Buffer) {
481 llvm::TimeTraceScope timeScope("Commit GSI stream");
482 auto GS = WritableMappedBlockStream::createIndexedStream(
483 Layout, MsfData: Buffer, StreamIndex: getGlobalsStreamIndex(), Allocator&: Msf.getAllocator());
484 auto PS = WritableMappedBlockStream::createIndexedStream(
485 Layout, MsfData: Buffer, StreamIndex: getPublicsStreamIndex(), Allocator&: Msf.getAllocator());
486 auto PRS = WritableMappedBlockStream::createIndexedStream(
487 Layout, MsfData: Buffer, StreamIndex: getRecordStreamIndex(), Allocator&: Msf.getAllocator());
488
489 if (auto EC = commitSymbolRecordStream(Stream: *PRS))
490 return EC;
491 if (auto EC = commitGlobalsHashStream(Stream: *GS))
492 return EC;
493 if (auto EC = commitPublicsHashStream(Stream: *PS))
494 return EC;
495 return Error::success();
496}
497