1//===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The data structures defined in this file are based on the reference
10// implementation which is available at
11// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
16#include "llvm/DebugInfo/CodeView/RecordName.h"
17#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
18#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
19#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
20#include "llvm/DebugInfo/MSF/MSFBuilder.h"
21#include "llvm/DebugInfo/MSF/MSFCommon.h"
22#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
23#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
24#include "llvm/DebugInfo/PDB/Native/Hash.h"
25#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
26#include "llvm/Support/BinaryItemStream.h"
27#include "llvm/Support/BinaryStreamWriter.h"
28#include "llvm/Support/Parallel.h"
29#include "llvm/Support/TimeProfiler.h"
30#include "llvm/Support/xxhash.h"
31#include <algorithm>
32#include <vector>
33
34using namespace llvm;
35using namespace llvm::msf;
36using namespace llvm::pdb;
37using namespace llvm::codeview;
38
39// Helper class for building the public and global PDB hash table buckets.
40struct llvm::pdb::GSIHashStreamBuilder {
41 // Sum of the size of all public or global records.
42 uint32_t RecordByteSize = 0;
43
44 std::vector<PSHashRecord> HashRecords;
45
46 // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The
47 // reference implementation builds a hash table with IPHR_HASH buckets in it.
48 // The last bucket is used to link together free hash table cells in a linked
49 // list, but it is always empty in the compressed, on-disk format. However,
50 // the bitmap must have a bit for it.
51 std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
52
53 std::vector<support::ulittle32_t> HashBuckets;
54
55 uint32_t calculateSerializedLength() const;
56 Error commit(BinaryStreamWriter &Writer);
57
58 void finalizePublicBuckets();
59 void finalizeGlobalBuckets(uint32_t RecordZeroOffset);
60
61 // Assign public and global symbol records into hash table buckets.
62 // Modifies the list of records to store the bucket index, but does not
63 // change the order.
64 void finalizeBuckets(uint32_t RecordZeroOffset,
65 MutableArrayRef<BulkPublic> Globals);
66};
67
68// DenseMapInfo implementation for deduplicating symbol records.
69struct llvm::pdb::SymbolDenseMapInfo {
70 static inline CVSymbol getEmptyKey() {
71 static CVSymbol Empty;
72 return Empty;
73 }
74 static inline CVSymbol getTombstoneKey() {
75 static CVSymbol Tombstone(
76 DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
77 return Tombstone;
78 }
79 static unsigned getHashValue(const CVSymbol &Val) {
80 return xxh3_64bits(data: Val.RecordData);
81 }
82 static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
83 return LHS.RecordData == RHS.RecordData;
84 }
85};
86
87namespace {
88LLVM_PACKED_START
89struct PublicSym32Layout {
90 RecordPrefix Prefix;
91 PublicSym32Header Pub;
92 // char Name[];
93};
94LLVM_PACKED_END
95} // namespace
96
97// Calculate how much memory this public needs when serialized.
98static uint32_t sizeOfPublic(const BulkPublic &Pub) {
99 uint32_t NameLen = Pub.NameLen;
100 NameLen = std::min(a: NameLen,
101 b: uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
102 return alignTo(Value: sizeof(PublicSym32Layout) + NameLen + 1, Align: 4);
103}
104
105static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
106 // Assume the caller has allocated sizeOfPublic bytes.
107 uint32_t NameLen = std::min(
108 a: Pub.NameLen, b: uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
109 size_t Size = alignTo(Value: sizeof(PublicSym32Layout) + NameLen + 1, Align: 4);
110 assert(Size == sizeOfPublic(Pub));
111 auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
112 FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
113 FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
114 FixedMem->Pub.Flags = Pub.Flags;
115 FixedMem->Pub.Offset = Pub.Offset;
116 FixedMem->Pub.Segment = Pub.Segment;
117 char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
118 memcpy(dest: NameMem, src: Pub.Name, n: NameLen);
119 // Zero the null terminator and remaining bytes.
120 memset(s: &NameMem[NameLen], c: 0, n: Size - sizeof(PublicSym32Layout) - NameLen);
121 return CVSymbol(ArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
122}
123
124uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
125 uint32_t Size = sizeof(GSIHashHeader);
126 Size += HashRecords.size() * sizeof(PSHashRecord);
127 Size += HashBitmap.size() * sizeof(uint32_t);
128 Size += HashBuckets.size() * sizeof(uint32_t);
129 return Size;
130}
131
132Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
133 GSIHashHeader Header;
134 Header.VerSignature = GSIHashHeader::HdrSignature;
135 Header.VerHdr = GSIHashHeader::HdrVersion;
136 Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
137 Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;
138
139 if (auto EC = Writer.writeObject(Obj: Header))
140 return EC;
141
142 if (auto EC = Writer.writeArray(Array: ArrayRef(HashRecords)))
143 return EC;
144 if (auto EC = Writer.writeArray(Array: ArrayRef(HashBitmap)))
145 return EC;
146 if (auto EC = Writer.writeArray(Array: ArrayRef(HashBuckets)))
147 return EC;
148 return Error::success();
149}
150
151static bool isAsciiString(StringRef S) {
152 return llvm::all_of(Range&: S, P: [](char C) { return unsigned(C) < 0x80; });
153}
154
155// See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
156static int gsiRecordCmp(StringRef S1, StringRef S2) {
157 size_t LS = S1.size();
158 size_t RS = S2.size();
159 // Shorter strings always compare less than longer strings.
160 if (LS != RS)
161 return (LS > RS) - (LS < RS);
162
163 // If either string contains non ascii characters, memcmp them.
164 if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
165 return memcmp(s1: S1.data(), s2: S2.data(), n: LS);
166
167 // Both strings are ascii, perform a case-insensitive comparison.
168 return S1.compare_insensitive(RHS: S2.data());
169}
170
171void GSIStreamBuilder::finalizePublicBuckets() {
172 PSH->finalizeBuckets(RecordZeroOffset: 0, Globals: Publics);
173}
174
175void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {
176 // Build up a list of globals to be bucketed. Use the BulkPublic data
177 // structure for this purpose, even though these are global records, not
178 // public records. Most of the same fields are required:
179 // - Name
180 // - NameLen
181 // - SymOffset
182 // - BucketIdx
183 // The dead fields are Offset, Segment, and Flags.
184 std::vector<BulkPublic> Records;
185 Records.resize(new_size: Globals.size());
186 uint32_t SymOffset = RecordZeroOffset;
187 for (size_t I = 0, E = Globals.size(); I < E; ++I) {
188 StringRef Name = getSymbolName(Sym: Globals[I]);
189 Records[I].Name = Name.data();
190 Records[I].NameLen = Name.size();
191 Records[I].SymOffset = SymOffset;
192 SymOffset += Globals[I].length();
193 }
194
195 GSH->finalizeBuckets(RecordZeroOffset, Globals: Records);
196}
197
198void GSIHashStreamBuilder::finalizeBuckets(
199 uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {
200 // Hash every name in parallel.
201 parallelFor(Begin: 0, End: Records.size(), Fn: [&](size_t I) {
202 Records[I].setBucketIdx(hashStringV1(Str: Records[I].Name) % IPHR_HASH);
203 });
204
205 // Count up the size of each bucket. Then, use an exclusive prefix sum to
206 // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but
207 // we can't use it yet.
208 uint32_t BucketStarts[IPHR_HASH] = {0};
209 for (const BulkPublic &P : Records)
210 ++BucketStarts[P.BucketIdx];
211 uint32_t Sum = 0;
212 for (uint32_t &B : BucketStarts) {
213 uint32_t Size = B;
214 B = Sum;
215 Sum += Size;
216 }
217
218 // Place globals into the hash table in bucket order. When placing a global,
219 // update the bucket start. Every hash table slot should be filled. Always use
220 // a refcount of one for now.
221 HashRecords.resize(new_size: Records.size());
222 uint32_t BucketCursors[IPHR_HASH];
223 memcpy(dest: BucketCursors, src: BucketStarts, n: sizeof(BucketCursors));
224 for (int I = 0, E = Records.size(); I < E; ++I) {
225 uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++;
226 HashRecords[HashIdx].Off = I;
227 HashRecords[HashIdx].CRef = 1;
228 }
229
230 // Within the buckets, sort each bucket by memcmp of the symbol's name. It's
231 // important that we use the same sorting algorithm as is used by the
232 // reference implementation to ensure that the search for a record within a
233 // bucket can properly early-out when it detects the record won't be found.
234 // The algorithm used here corresponds to the function
235 // caseInsensitiveComparePchPchCchCch in the reference implementation.
236 parallelFor(Begin: 0, End: IPHR_HASH, Fn: [&](size_t I) {
237 auto B = HashRecords.begin() + BucketStarts[I];
238 auto E = HashRecords.begin() + BucketCursors[I];
239 if (B == E)
240 return;
241 auto BucketCmp = [Records](const PSHashRecord &LHash,
242 const PSHashRecord &RHash) {
243 const BulkPublic &L = Records[uint32_t(LHash.Off)];
244 const BulkPublic &R = Records[uint32_t(RHash.Off)];
245 assert(L.BucketIdx == R.BucketIdx);
246 int Cmp = gsiRecordCmp(S1: L.getName(), S2: R.getName());
247 if (Cmp != 0)
248 return Cmp < 0;
249 // This comparison is necessary to make the sorting stable in the presence
250 // of two static globals with the same name. The easiest way to observe
251 // this is with S_LDATA32 records.
252 return L.SymOffset < R.SymOffset;
253 };
254 llvm::sort(Start: B, End: E, Comp: BucketCmp);
255
256 // After we are done sorting, replace the global indices with the stream
257 // offsets of each global. Add one when writing symbol offsets to disk.
258 // See GSI1::fixSymRecs.
259 for (PSHashRecord &HRec : make_range(x: B, y: E))
260 HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1;
261 });
262
263 // For each non-empty bucket, push the bucket start offset into HashBuckets
264 // and set a bit in the hash bitmap.
265 for (uint32_t I = 0; I < HashBitmap.size(); ++I) {
266 uint32_t Word = 0;
267 for (uint32_t J = 0; J < 32; ++J) {
268 // Skip empty buckets.
269 uint32_t BucketIdx = I * 32 + J;
270 if (BucketIdx >= IPHR_HASH ||
271 BucketStarts[BucketIdx] == BucketCursors[BucketIdx])
272 continue;
273 Word |= (1U << J);
274
275 // Calculate what the offset of the first hash record in the chain would
276 // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
277 // each record would be 12 bytes. See HROffsetCalc in gsi.h.
278 const int SizeOfHROffsetCalc = 12;
279 ulittle32_t ChainStartOff =
280 ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc);
281 HashBuckets.push_back(x: ChainStartOff);
282 }
283 HashBitmap[I] = Word;
284 }
285}
286
287GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
288 : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
289 GSH(std::make_unique<GSIHashStreamBuilder>()) {}
290
291GSIStreamBuilder::~GSIStreamBuilder() = default;
292
293uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
294 uint32_t Size = 0;
295 Size += sizeof(PublicsStreamHeader);
296 Size += PSH->calculateSerializedLength();
297 Size += Publics.size() * sizeof(uint32_t); // AddrMap
298 // FIXME: Add thunk map and section offsets for incremental linking.
299
300 return Size;
301}
302
303uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
304 return GSH->calculateSerializedLength();
305}
306
307Error GSIStreamBuilder::finalizeMsfLayout() {
308 // First we write public symbol records, then we write global symbol records.
309 finalizePublicBuckets();
310 finalizeGlobalBuckets(RecordZeroOffset: PSH->RecordByteSize);
311
312 Expected<uint32_t> Idx = Msf.addStream(Size: calculateGlobalsHashStreamSize());
313 if (!Idx)
314 return Idx.takeError();
315 GlobalsStreamIndex = *Idx;
316
317 Idx = Msf.addStream(Size: calculatePublicsHashStreamSize());
318 if (!Idx)
319 return Idx.takeError();
320 PublicsStreamIndex = *Idx;
321
322 uint32_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize;
323
324 Idx = Msf.addStream(Size: RecordBytes);
325 if (!Idx)
326 return Idx.takeError();
327 RecordStreamIndex = *Idx;
328 return Error::success();
329}
330
331void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {
332 assert(Publics.empty() && PSH->RecordByteSize == 0 &&
333 "publics can only be added once");
334 Publics = std::move(PublicsIn);
335
336 // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
337 parallelSort(R&: Publics, Comp: [](const BulkPublic &L, const BulkPublic &R) {
338 return L.getName() < R.getName();
339 });
340
341 // Assign offsets and calculate the length of the public symbol records.
342 uint32_t SymOffset = 0;
343 for (BulkPublic &Pub : Publics) {
344 Pub.SymOffset = SymOffset;
345 SymOffset += sizeOfPublic(Pub);
346 }
347
348 // Remember the length of the public stream records.
349 PSH->RecordByteSize = SymOffset;
350}
351
352void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
353 serializeAndAddGlobal(Symbol: Sym);
354}
355
356void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
357 serializeAndAddGlobal(Symbol: Sym);
358}
359
360void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
361 serializeAndAddGlobal(Symbol: Sym);
362}
363
364template <typename T>
365void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {
366 T Copy(Symbol);
367 addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
368 CodeViewContainer::Pdb));
369}
370
371void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {
372 // Ignore duplicate typedefs and constants.
373 if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
374 auto Iter = GlobalsSeen.insert(V: Symbol);
375 if (!Iter.second)
376 return;
377 }
378 GSH->RecordByteSize += Symbol.length();
379 Globals.push_back(x: Symbol);
380}
381
382// Serialize each public and write it.
383static Error writePublics(BinaryStreamWriter &Writer,
384 ArrayRef<BulkPublic> Publics) {
385 std::vector<uint8_t> Storage;
386 for (const BulkPublic &Pub : Publics) {
387 Storage.resize(new_size: sizeOfPublic(Pub));
388 serializePublic(Mem: Storage.data(), Pub);
389 if (Error E = Writer.writeBytes(Buffer: Storage))
390 return E;
391 }
392 return Error::success();
393}
394
395static Error writeRecords(BinaryStreamWriter &Writer,
396 ArrayRef<CVSymbol> Records) {
397 BinaryItemStream<CVSymbol> ItemStream(llvm::endianness::little);
398 ItemStream.setItems(Records);
399 BinaryStreamRef RecordsRef(ItemStream);
400 return Writer.writeStreamRef(Ref: RecordsRef);
401}
402
403Error GSIStreamBuilder::commitSymbolRecordStream(
404 WritableBinaryStreamRef Stream) {
405 BinaryStreamWriter Writer(Stream);
406
407 // Write public symbol records first, followed by global symbol records. This
408 // must match the order that we assume in finalizeMsfLayout when computing
409 // PSHZero and GSHZero.
410 if (auto EC = writePublics(Writer, Publics))
411 return EC;
412 if (auto EC = writeRecords(Writer, Records: Globals))
413 return EC;
414
415 return Error::success();
416}
417
418static std::vector<support::ulittle32_t>
419computeAddrMap(ArrayRef<BulkPublic> Publics) {
420 // Build a parallel vector of indices into the Publics vector, and sort it by
421 // address.
422 std::vector<ulittle32_t> PubAddrMap;
423 PubAddrMap.reserve(n: Publics.size());
424 for (int I = 0, E = Publics.size(); I < E; ++I)
425 PubAddrMap.push_back(x: ulittle32_t(I));
426
427 auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {
428 const BulkPublic &L = Publics[LIdx];
429 const BulkPublic &R = Publics[RIdx];
430 if (L.Segment != R.Segment)
431 return L.Segment < R.Segment;
432 if (L.Offset != R.Offset)
433 return L.Offset < R.Offset;
434 // parallelSort is unstable, so we have to do name comparison to ensure
435 // that two names for the same location come out in a deterministic order.
436 return L.getName() < R.getName();
437 };
438 parallelSort(R&: PubAddrMap, Comp: AddrCmp);
439
440 // Rewrite the public symbol indices into symbol offsets.
441 for (ulittle32_t &Entry : PubAddrMap)
442 Entry = Publics[Entry].SymOffset;
443 return PubAddrMap;
444}
445
446Error GSIStreamBuilder::commitPublicsHashStream(
447 WritableBinaryStreamRef Stream) {
448 BinaryStreamWriter Writer(Stream);
449 PublicsStreamHeader Header;
450
451 // FIXME: Fill these in. They are for incremental linking.
452 Header.SymHash = PSH->calculateSerializedLength();
453 Header.AddrMap = Publics.size() * 4;
454 Header.NumThunks = 0;
455 Header.SizeOfThunk = 0;
456 Header.ISectThunkTable = 0;
457 memset(s: Header.Padding, c: 0, n: sizeof(Header.Padding));
458 Header.OffThunkTable = 0;
459 Header.NumSections = 0;
460 if (auto EC = Writer.writeObject(Obj: Header))
461 return EC;
462
463 if (auto EC = PSH->commit(Writer))
464 return EC;
465
466 std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);
467 assert(PubAddrMap.size() == Publics.size());
468 if (auto EC = Writer.writeArray(Array: ArrayRef(PubAddrMap)))
469 return EC;
470
471 return Error::success();
472}
473
474Error GSIStreamBuilder::commitGlobalsHashStream(
475 WritableBinaryStreamRef Stream) {
476 BinaryStreamWriter Writer(Stream);
477 return GSH->commit(Writer);
478}
479
480Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
481 WritableBinaryStreamRef Buffer) {
482 llvm::TimeTraceScope timeScope("Commit GSI stream");
483 auto GS = WritableMappedBlockStream::createIndexedStream(
484 Layout, MsfData: Buffer, StreamIndex: getGlobalsStreamIndex(), Allocator&: Msf.getAllocator());
485 auto PS = WritableMappedBlockStream::createIndexedStream(
486 Layout, MsfData: Buffer, StreamIndex: getPublicsStreamIndex(), Allocator&: Msf.getAllocator());
487 auto PRS = WritableMappedBlockStream::createIndexedStream(
488 Layout, MsfData: Buffer, StreamIndex: getRecordStreamIndex(), Allocator&: Msf.getAllocator());
489
490 if (auto EC = commitSymbolRecordStream(Stream: *PRS))
491 return EC;
492 if (auto EC = commitGlobalsHashStream(Stream: *GS))
493 return EC;
494 if (auto EC = commitPublicsHashStream(Stream: *PS))
495 return EC;
496 return Error::success();
497}
498