GSIStreamBuilder.cpp source code [llvm_projects/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp]

1	//===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation ------------ C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// The data structures defined in this file are based on the reference
10	// implementation which is available at
11	// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
12	//
13	//===----------------------------------------------------------------------===//
14
15	#include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
16	#include "llvm/DebugInfo/CodeView/RecordName.h"
17	#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
18	#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
19	#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
20	#include "llvm/DebugInfo/MSF/MSFBuilder.h"
21	#include "llvm/DebugInfo/MSF/MSFCommon.h"
22	#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
23	#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
24	#include "llvm/DebugInfo/PDB/Native/Hash.h"
25	#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
26	#include "llvm/Support/BinaryItemStream.h"
27	#include "llvm/Support/BinaryStreamWriter.h"
28	#include "llvm/Support/FormatVariadic.h"
29	#include "llvm/Support/Parallel.h"
30	#include "llvm/Support/TimeProfiler.h"
31	#include "llvm/Support/xxhash.h"
32	#include <algorithm>
33	#include <vector>
34
35	using namespace llvm;
36	using namespace llvm::msf;
37	using namespace llvm::pdb;
38	using namespace llvm::codeview;
39
40	// Helper class for building the public and global PDB hash table buckets.
41	struct llvm::pdb::GSIHashStreamBuilder {
42	// Sum of the size of all public or global records.
43	uint64_t RecordByteSize = `0`;
44
45	std::vector<PSHashRecord> HashRecords;
46
47	// The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The
48	// reference implementation builds a hash table with IPHR_HASH buckets in it.
49	// The last bucket is used to link together free hash table cells in a linked
50	// list, but it is always empty in the compressed, on-disk format. However,
51	// the bitmap must have a bit for it.
52	std::array<support::ulittle32_t, (IPHR_HASH + `32`) / `32`> HashBitmap;
53
54	std::vector<support::ulittle32_t> HashBuckets;
55
56	uint32_t calculateSerializedLength() const;
57	Error commit(BinaryStreamWriter &Writer);
58
59	void finalizePublicBuckets();
60	void finalizeGlobalBuckets(uint32_t RecordZeroOffset);
61
62	// Assign public and global symbol records into hash table buckets.
63	// Modifies the list of records to store the bucket index, but does not
64	// change the order.
65	void finalizeBuckets(uint32_t RecordZeroOffset,
66	MutableArrayRef<BulkPublic> Globals);
67	};
68
69	// DenseMapInfo implementation for deduplicating symbol records.
70	struct llvm::pdb::SymbolDenseMapInfo {
71	static inline CVSymbol getEmptyKey() {
72	static CVSymbol Empty;
73	return Empty;
74	}
75	static inline CVSymbol getTombstoneKey() {
76	static CVSymbol Tombstone(
77	DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
78	return Tombstone;
79	}
80	static unsigned getHashValue(const CVSymbol &Val) {
81	return xxh3_64bits(data: Val.RecordData);
82	}
83	static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
84	return LHS.RecordData == RHS.RecordData;
85	}
86	};
87
88	namespace {
89	LLVM_PACKED_START
90	struct PublicSym32Layout {
91	RecordPrefix Prefix;
92	PublicSym32Header Pub;
93	// char Name[];
94	};
95	LLVM_PACKED_END
96	} // namespace
97
98	// Calculate how much memory this public needs when serialized.
99	static uint32_t sizeOfPublic(const BulkPublic &Pub) {
100	uint32_t NameLen = Pub.NameLen;
101	NameLen = std::min(a: NameLen,
102	b: uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - `1`));
103	return alignTo(Value: sizeof(PublicSym32Layout) + NameLen + `1`, Align: `4`);
104	}
105
106	static CVSymbol serializePublic(uint8_t Mem, const* BulkPublic &Pub) {
107	// Assume the caller has allocated sizeOfPublic bytes.
108	uint32_t NameLen = std::min(
109	a: Pub.NameLen, b: uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - `1`));
110	size_t Size = alignTo(Value: sizeof(PublicSym32Layout) + NameLen + `1`, Align: `4`);
111	assert(Size == sizeOfPublic(Pub));
112	auto FixedMem = reinterpret_cast<PublicSym32Layout >(Mem);
113	FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
114	FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - `2`);
115	FixedMem->Pub.Flags = Pub.Flags;
116	FixedMem->Pub.Offset = Pub.Offset;
117	FixedMem->Pub.Segment = Pub.Segment;
118	char NameMem = reinterpret_cast<char* *>(FixedMem + `1`);
119	memcpy(dest: NameMem, src: Pub.Name, n: NameLen);
120	// Zero the null terminator and remaining bytes.
121	memset(s: &NameMem[NameLen], c: `0`, n: Size - sizeof(PublicSym32Layout) - NameLen);
122	return CVSymbol (ArrayRef(Mem, Size));
123	}
124
125	uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
126	uint32_t Size = sizeof(GSIHashHeader);
127	Size += HashRecords.size() * sizeof(PSHashRecord);
128	Size += HashBitmap.size() * sizeof(uint32_t);
129	Size += HashBuckets.size() * sizeof(uint32_t);
130	return Size;
131	}
132
133	Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
134	GSIHashHeader Header;
135	Header.VerSignature = GSIHashHeader::HdrSignature;
136	Header.VerHdr = GSIHashHeader::HdrVersion;
137	Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
138	Header.NumBuckets = HashBitmap.size() * `4` + HashBuckets.size() * `4`;
139
140	if (auto EC = Writer.writeObject(Obj: Header))
141	return EC;
142
143	if (auto EC = Writer.writeArray(Array: ArrayRef(HashRecords)))
144	return EC;
145	if (auto EC = Writer.writeArray(Array: ArrayRef(HashBitmap)))
146	return EC;
147	if (auto EC = Writer.writeArray(Array: ArrayRef(HashBuckets)))
148	return EC;
149	return Error::success();
150	}
151
152	static bool isAsciiString(StringRef S) {
153	return llvm::all_of(Range&: S, P: [](char C) { return unsigned(C) < `0x80`; });
154	}
155
156	// See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
157	static int gsiRecordCmp(StringRef S1, StringRef S2) {
158	size_t LS = S1.size();
159	size_t RS = S2.size();
160	// Shorter strings always compare less than longer strings.
161	if (LS != RS)
162	return (LS > RS) - (LS < RS);
163
164	// If either string contains non ascii characters, memcmp them.
165	if (LLVM_UNLIKELY(!isAsciiString(S1) \|\| !isAsciiString(S2)))
166	return memcmp(s1: S1.data(), s2: S2.data(), n: LS);
167
168	// Both strings are ascii, perform a case-insensitive comparison.
169	return S1.compare_insensitive(RHS: S2.data());
170	}
171
172	void GSIStreamBuilder::finalizePublicBuckets() {
173	PSH ->finalizeBuckets(RecordZeroOffset: `0`, Globals: Publics);
174	}
175
176	void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {
177	// Build up a list of globals to be bucketed. Use the BulkPublic data
178	// structure for this purpose, even though these are global records, not
179	// public records. Most of the same fields are required:
180	// - Name
181	// - NameLen
182	// - SymOffset
183	// - BucketIdx
184	// The dead fields are Offset, Segment, and Flags.
185	std::vector<BulkPublic> Records;
186	Records.resize(new_size: Globals.size());
187	uint32_t SymOffset = RecordZeroOffset;
188	for (size_t I = `0`, E = Globals.size(); I < E; ++I) {
189	StringRef Name = getSymbolName(Sym: Globals [I]);
190	Records [I].Name = Name.data();
191	Records [I].NameLen = Name.size();
192	Records [I].SymOffset = SymOffset;
193	SymOffset += Globals [I].length();
194	}
195
196	GSH ->finalizeBuckets(RecordZeroOffset, Globals: Records);
197	}
198
199	void GSIHashStreamBuilder::finalizeBuckets(
200	uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {
201	// Hash every name in parallel.
202	parallelFor(Begin: `0`, End: Records.size(), Fn: [&](size_t I) {
203	Records [I].setBucketIdx(hashStringV1(Str: Records [I].Name) % IPHR_HASH);
204	});
205
206	// Count up the size of each bucket. Then, use an exclusive prefix sum to
207	// calculate the bucket start offsets. This is C++17 std::exclusive_scan, but
208	// we can't use it yet.
209	uint32_t BucketStarts[IPHR_HASH] = {`0`};
210	for (const BulkPublic &P : Records)
211	++BucketStarts[P.BucketIdx];
212	uint32_t Sum = `0`;
213	for (uint32_t &B : BucketStarts) {
214	uint32_t Size = B;
215	B = Sum;
216	Sum += Size;
217	}
218
219	// Place globals into the hash table in bucket order. When placing a global,
220	// update the bucket start. Every hash table slot should be filled. Always use
221	// a refcount of one for now.
222	HashRecords.resize(new_size: Records.size());
223	uint32_t BucketCursors[IPHR_HASH];
224	memcpy(dest: BucketCursors, src: BucketStarts, n: sizeof(BucketCursors));
225	for (int I = `0`, E = Records.size(); I < E; ++I) {
226	uint32_t HashIdx = BucketCursors[Records [I].BucketIdx]++;
227	HashRecords [HashIdx].Off = I;
228	HashRecords [HashIdx].CRef = `1`;
229	}
230
231	// Within the buckets, sort each bucket by memcmp of the symbol's name. It's
232	// important that we use the same sorting algorithm as is used by the
233	// reference implementation to ensure that the search for a record within a
234	// bucket can properly early-out when it detects the record won't be found.
235	// The algorithm used here corresponds to the function
236	// caseInsensitiveComparePchPchCchCch in the reference implementation.
237	parallelFor(Begin: `0`, End: IPHR_HASH, Fn: [&](size_t I) {
238	auto B = HashRecords.begin() + BucketStarts[I];
239	auto E = HashRecords.begin() + BucketCursors[I];
240	if (B == E)
241	return;
242	auto BucketCmp = [Records](const PSHashRecord &LHash,
243	const PSHashRecord &RHash) {
244	const BulkPublic &L = Records [uint32_t(LHash.Off)];
245	const BulkPublic &R = Records [uint32_t(RHash.Off)];
246	assert(L.BucketIdx == R.BucketIdx);
247	int Cmp = gsiRecordCmp(S1: L.getName(), S2: R.getName());
248	if (Cmp != `0`)
249	return Cmp < `0`;
250	// This comparison is necessary to make the sorting stable in the presence
251	// of two static globals with the same name. The easiest way to observe
252	// this is with S_LDATA32 records.
253	return L.SymOffset < R.SymOffset;
254	};
255	llvm::sort(Start: B, End: E, Comp: BucketCmp);
256
257	// After we are done sorting, replace the global indices with the stream
258	// offsets of each global. Add one when writing symbol offsets to disk.
259	// See GSI1::fixSymRecs.
260	for (PSHashRecord &HRec : make_range(x: B, y: E))
261	HRec.Off = Records [uint32_t(HRec.Off)].SymOffset + `1`;
262	});
263
264	// For each non-empty bucket, push the bucket start offset into HashBuckets
265	// and set a bit in the hash bitmap.
266	for (uint32_t I = `0`; I < HashBitmap.size(); ++I) {
267	uint32_t Word = `0`;
268	for (uint32_t J = `0`; J < `32`; ++J) {
269	// Skip empty buckets.
270	uint32_t BucketIdx = I * `32` + J;
271	if (BucketIdx >= IPHR_HASH \|\|
272	BucketStarts[BucketIdx] == BucketCursors[BucketIdx])
273	continue;
274	Word \|= (`1U` << J);
275
276	// Calculate what the offset of the first hash record in the chain would
277	// be if it were inflated to contain 32-bit pointers. On a 32-bit system,
278	// each record would be 12 bytes. See HROffsetCalc in gsi.h.
279	const int SizeOfHROffsetCalc = `12`;
280	ulittle32_t ChainStartOff =
281	ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc);
282	HashBuckets.push_back(x: ChainStartOff);
283	}
284	HashBitmap [I] = Word;
285	}
286	}
287
288	GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
289	: Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
290	GSH(std::make_unique<GSIHashStreamBuilder>()) {}
291
292	GSIStreamBuilder::~GSIStreamBuilder() = default;
293
294	uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
295	uint32_t Size = `0`;
296	Size += sizeof(PublicsStreamHeader);
297	Size += PSH ->calculateSerializedLength();
298	Size += Publics.size() * sizeof(uint32_t); // AddrMap
299	// FIXME: Add thunk map and section offsets for incremental linking.
300
301	return Size;
302	}
303
304	uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
305	return GSH ->calculateSerializedLength();
306	}
307
308	Error GSIStreamBuilder::finalizeMsfLayout() {
309	// First we write public symbol records, then we write global symbol records.
310	finalizePublicBuckets();
311	finalizeGlobalBuckets(RecordZeroOffset: PSH ->RecordByteSize);
312
313	Expected<uint32_t> Idx = Msf.addStream(Size: calculateGlobalsHashStreamSize());
314	if (!Idx)
315	return Idx.takeError();
316	GlobalsStreamIndex = *Idx;
317
318	Idx = Msf.addStream(Size: calculatePublicsHashStreamSize());
319	if (!Idx)
320	return Idx.takeError();
321	PublicsStreamIndex = *Idx;
322
323	uint64_t RecordBytes = PSH ->RecordByteSize + GSH ->RecordByteSize;
324	if (RecordBytes > UINT32_MAX)
325	return make_error<StringError>(
326	Args: formatv(Fmt: "the public symbols ({0} bytes) and global symbols ({1} bytes) "
327	"are too large to fit in a PDB file; "
328	"the maximum total is {2} bytes.",
329	Vals&: PSH ->RecordByteSize, Vals&: GSH ->RecordByteSize, UINT32_MAX),
330	Args: inconvertibleErrorCode());
331
332	Idx = Msf.addStream(Size: RecordBytes);
333	if (!Idx)
334	return Idx.takeError();
335	RecordStreamIndex = *Idx;
336	return Error::success();
337	}
338
339	void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {
340	assert(Publics.empty() && PSH->RecordByteSize == `0` &&
341	"publics can only be added once");
342	Publics = std::move(PublicsIn);
343
344	// Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
345	parallelSort(R&: Publics, Comp: [](const BulkPublic &L, const BulkPublic &R) {
346	return L.getName() < R.getName();
347	});
348
349	// Assign offsets and calculate the length of the public symbol records.
350	uint32_t SymOffset = `0`;
351	for (BulkPublic &Pub : Publics) {
352	Pub.SymOffset = SymOffset;
353	SymOffset += sizeOfPublic(Pub);
354	}
355
356	// Remember the length of the public stream records.
357	PSH ->RecordByteSize = SymOffset;
358	}
359
360	void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
361	serializeAndAddGlobal(Symbol: Sym);
362	}
363
364	void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
365	serializeAndAddGlobal(Symbol: Sym);
366	}
367
368	void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
369	serializeAndAddGlobal(Symbol: Sym);
370	}
371
372	template <typename T>
373	void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {
374	T Copy(Symbol);
375	addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
376	CodeViewContainer::Pdb));
377	}
378
379	void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {
380	// Ignore duplicate typedefs and constants.
381	if (Symbol.kind() == S_UDT \|\| Symbol.kind() == S_CONSTANT) {
382	auto Iter = GlobalsSeen.insert(V: Symbol);
383	if (!Iter.second)
384	return;
385	}
386	GSH ->RecordByteSize += Symbol.length();
387	Globals.push_back(x: Symbol);
388	}
389
390	// Serialize each public and write it.
391	static Error writePublics(BinaryStreamWriter &Writer,
392	ArrayRef<BulkPublic> Publics) {
393	std::vector<uint8_t> Storage;
394	for (const BulkPublic &Pub : Publics) {
395	Storage.resize(new_size: sizeOfPublic(Pub));
396	serializePublic(Mem: Storage.data(), Pub);
397	if (Error E = Writer.writeBytes(Buffer: Storage))
398	return E;
399	}
400	return Error::success();
401	}
402
403	static Error writeRecords(BinaryStreamWriter &Writer,
404	ArrayRef<CVSymbol> Records) {
405	BinaryItemStream<CVSymbol> ItemStream(llvm::endianness::little);
406	ItemStream.setItems(Records);
407	BinaryStreamRef RecordsRef(ItemStream);
408	return Writer.writeStreamRef(Ref: RecordsRef);
409	}
410
411	Error GSIStreamBuilder::commitSymbolRecordStream(
412	WritableBinaryStreamRef Stream) {
413	BinaryStreamWriter Writer(Stream);
414
415	// Write public symbol records first, followed by global symbol records. This
416	// must match the order that we assume in finalizeMsfLayout when computing
417	// PSHZero and GSHZero.
418	if (auto EC = writePublics(Writer, Publics))
419	return EC;
420	if (auto EC = writeRecords(Writer, Records: Globals))
421	return EC;
422
423	return Error::success();
424	}
425
426	static std::vector<support::ulittle32_t>
427	computeAddrMap(ArrayRef<BulkPublic> Publics) {
428	// Build a parallel vector of indices into the Publics vector, and sort it by
429	// address.
430	std::vector<ulittle32_t> PubAddrMap;
431	PubAddrMap.reserve(n: Publics.size());
432	for (int I = `0`, E = Publics.size(); I < E; ++I)
433	PubAddrMap.push_back(x: ulittle32_t(I));
434
435	auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {
436	const BulkPublic &L = Publics [LIdx];
437	const BulkPublic &R = Publics [RIdx];
438	if (L.Segment != R.Segment)
439	return L.Segment < R.Segment;
440	if (L.Offset != R.Offset)
441	return L.Offset < R.Offset;
442	// parallelSort is unstable, so we have to do name comparison to ensure
443	// that two names for the same location come out in a deterministic order.
444	return L.getName() < R.getName();
445	};
446	parallelSort(R&: PubAddrMap, Comp: AddrCmp);
447
448	// Rewrite the public symbol indices into symbol offsets.
449	for (ulittle32_t &Entry : PubAddrMap)
450	Entry = Publics [Entry].SymOffset;
451	return PubAddrMap;
452	}
453
454	Error GSIStreamBuilder::commitPublicsHashStream(
455	WritableBinaryStreamRef Stream) {
456	BinaryStreamWriter Writer(Stream);
457	PublicsStreamHeader Header;
458
459	// FIXME: Fill these in. They are for incremental linking.
460	Header.SymHash = PSH ->calculateSerializedLength();
461	Header.AddrMap = Publics.size() * `4`;
462	Header.NumThunks = `0`;
463	Header.SizeOfThunk = `0`;
464	Header.ISectThunkTable = `0`;
465	memset(s: Header.Padding, c: `0`, n: sizeof(Header.Padding));
466	Header.OffThunkTable = `0`;
467	Header.NumSections = `0`;
468	if (auto EC = Writer.writeObject(Obj: Header))
469	return EC;
470
471	if (auto EC = PSH ->commit(Writer))
472	return EC;
473
474	std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);
475	assert(PubAddrMap.size() == Publics.size());
476	if (auto EC = Writer.writeArray(Array: ArrayRef(PubAddrMap)))
477	return EC;
478
479	return Error::success();
480	}
481
482	Error GSIStreamBuilder::commitGlobalsHashStream(
483	WritableBinaryStreamRef Stream) {
484	BinaryStreamWriter Writer(Stream);
485	return GSH ->commit(Writer);
486	}
487
488	Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
489	WritableBinaryStreamRef Buffer) {
490	llvm::TimeTraceScope timeScope("Commit GSI stream");
491	auto GS = WritableMappedBlockStream::createIndexedStream(
492	Layout, MsfData: Buffer, StreamIndex: getGlobalsStreamIndex(), Allocator&: Msf.getAllocator());
493	auto PS = WritableMappedBlockStream::createIndexedStream(
494	Layout, MsfData: Buffer, StreamIndex: getPublicsStreamIndex(), Allocator&: Msf.getAllocator());
495	auto PRS = WritableMappedBlockStream::createIndexedStream(
496	Layout, MsfData: Buffer, StreamIndex: getRecordStreamIndex(), Allocator&: Msf.getAllocator());
497
498	if (auto EC = commitSymbolRecordStream(Stream: *PRS))
499	return EC;
500	if (auto EC = commitGlobalsHashStream(Stream: *GS))
501	return EC;
502	if (auto EC = commitPublicsHashStream(Stream: *PS))
503	return EC;
504	return Error::success();
505	}
506

Browse the source code of llvm_projects/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp