GSIStreamBuilder.cpp source code [llvm_projects/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp]

1	//===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation ------------ C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// The data structures defined in this file are based on the reference
10	// implementation which is available at
11	// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
12	//
13	//===----------------------------------------------------------------------===//
14
15	#include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
16	#include "llvm/DebugInfo/CodeView/RecordName.h"
17	#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
18	#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
19	#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
20	#include "llvm/DebugInfo/MSF/MSFBuilder.h"
21	#include "llvm/DebugInfo/MSF/MSFCommon.h"
22	#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
23	#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
24	#include "llvm/DebugInfo/PDB/Native/Hash.h"
25	#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
26	#include "llvm/Support/BinaryItemStream.h"
27	#include "llvm/Support/BinaryStreamWriter.h"
28	#include "llvm/Support/Parallel.h"
29	#include "llvm/Support/TimeProfiler.h"
30	#include "llvm/Support/xxhash.h"
31	#include <algorithm>
32	#include <vector>
33
34	using namespace llvm;
35	using namespace llvm::msf;
36	using namespace llvm::pdb;
37	using namespace llvm::codeview;
38
39	// Helper class for building the public and global PDB hash table buckets.
40	struct llvm::pdb::GSIHashStreamBuilder {
41	// Sum of the size of all public or global records.
42	uint32_t RecordByteSize = `0`;
43
44	std::vector<PSHashRecord> HashRecords;
45
46	// The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The
47	// reference implementation builds a hash table with IPHR_HASH buckets in it.
48	// The last bucket is used to link together free hash table cells in a linked
49	// list, but it is always empty in the compressed, on-disk format. However,
50	// the bitmap must have a bit for it.
51	std::array<support::ulittle32_t, (IPHR_HASH + `32`) / `32`> HashBitmap;
52
53	std::vector<support::ulittle32_t> HashBuckets;
54
55	uint32_t calculateSerializedLength() const;
56	Error commit(BinaryStreamWriter &Writer);
57
58	void finalizePublicBuckets();
59	void finalizeGlobalBuckets(uint32_t RecordZeroOffset);
60
61	// Assign public and global symbol records into hash table buckets.
62	// Modifies the list of records to store the bucket index, but does not
63	// change the order.
64	void finalizeBuckets(uint32_t RecordZeroOffset,
65	MutableArrayRef<BulkPublic> Globals);
66	};
67
68	// DenseMapInfo implementation for deduplicating symbol records.
69	struct llvm::pdb::SymbolDenseMapInfo {
70	static inline CVSymbol getEmptyKey() {
71	static CVSymbol Empty;
72	return Empty;
73	}
74	static inline CVSymbol getTombstoneKey() {
75	static CVSymbol Tombstone(
76	DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
77	return Tombstone;
78	}
79	static unsigned getHashValue(const CVSymbol &Val) {
80	return xxh3_64bits(data: Val.RecordData);
81	}
82	static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
83	return LHS.RecordData == RHS.RecordData;
84	}
85	};
86
87	namespace {
88	LLVM_PACKED_START
89	struct PublicSym32Layout {
90	RecordPrefix Prefix;
91	PublicSym32Header Pub;
92	// char Name[];
93	};
94	LLVM_PACKED_END
95	} // namespace
96
97	// Calculate how much memory this public needs when serialized.
98	static uint32_t sizeOfPublic(const BulkPublic &Pub) {
99	uint32_t NameLen = Pub.NameLen;
100	NameLen = std::min(a: NameLen,
101	b: uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - `1`));
102	return alignTo(Value: sizeof(PublicSym32Layout) + NameLen + `1`, Align: `4`);
103	}
104
105	static CVSymbol serializePublic(uint8_t Mem, const* BulkPublic &Pub) {
106	// Assume the caller has allocated sizeOfPublic bytes.
107	uint32_t NameLen = std::min(
108	a: Pub.NameLen, b: uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - `1`));
109	size_t Size = alignTo(Value: sizeof(PublicSym32Layout) + NameLen + `1`, Align: `4`);
110	assert(Size == sizeOfPublic(Pub));
111	auto FixedMem = reinterpret_cast<PublicSym32Layout >(Mem);
112	FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
113	FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - `2`);
114	FixedMem->Pub.Flags = Pub.Flags;
115	FixedMem->Pub.Offset = Pub.Offset;
116	FixedMem->Pub.Segment = Pub.Segment;
117	char NameMem = reinterpret_cast<char* *>(FixedMem + `1`);
118	memcpy(dest: NameMem, src: Pub.Name, n: NameLen);
119	// Zero the null terminator and remaining bytes.
120	memset(s: &NameMem[NameLen], c: `0`, n: Size - sizeof(PublicSym32Layout) - NameLen);
121	return CVSymbol (ArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
122	}
123
124	uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
125	uint32_t Size = sizeof(GSIHashHeader);
126	Size += HashRecords.size() * sizeof(PSHashRecord);
127	Size += HashBitmap.size() * sizeof(uint32_t);
128	Size += HashBuckets.size() * sizeof(uint32_t);
129	return Size;
130	}
131
132	Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
133	GSIHashHeader Header;
134	Header.VerSignature = GSIHashHeader::HdrSignature;
135	Header.VerHdr = GSIHashHeader::HdrVersion;
136	Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
137	Header.NumBuckets = HashBitmap.size() * `4` + HashBuckets.size() * `4`;
138
139	if (auto EC = Writer.writeObject(Obj: Header))
140	return EC;
141
142	if (auto EC = Writer.writeArray(Array: ArrayRef(HashRecords)))
143	return EC;
144	if (auto EC = Writer.writeArray(Array: ArrayRef(HashBitmap)))
145	return EC;
146	if (auto EC = Writer.writeArray(Array: ArrayRef(HashBuckets)))
147	return EC;
148	return Error::success();
149	}
150
151	static bool isAsciiString(StringRef S) {
152	return llvm::all_of(Range&: S, P: [](char C) { return unsigned(C) < `0x80`; });
153	}
154
155	// See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
156	static int gsiRecordCmp(StringRef S1, StringRef S2) {
157	size_t LS = S1.size();
158	size_t RS = S2.size();
159	// Shorter strings always compare less than longer strings.
160	if (LS != RS)
161	return (LS > RS) - (LS < RS);
162
163	// If either string contains non ascii characters, memcmp them.
164	if (LLVM_UNLIKELY(!isAsciiString(S1) \|\| !isAsciiString(S2)))
165	return memcmp(s1: S1.data(), s2: S2.data(), n: LS);
166
167	// Both strings are ascii, perform a case-insensitive comparison.
168	return S1.compare_insensitive(RHS: S2.data());
169	}
170
171	void GSIStreamBuilder::finalizePublicBuckets() {
172	PSH ->finalizeBuckets(RecordZeroOffset: `0`, Globals: Publics);
173	}
174
175	void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {
176	// Build up a list of globals to be bucketed. Use the BulkPublic data
177	// structure for this purpose, even though these are global records, not
178	// public records. Most of the same fields are required:
179	// - Name
180	// - NameLen
181	// - SymOffset
182	// - BucketIdx
183	// The dead fields are Offset, Segment, and Flags.
184	std::vector<BulkPublic> Records;
185	Records.resize(new_size: Globals.size());
186	uint32_t SymOffset = RecordZeroOffset;
187	for (size_t I = `0`, E = Globals.size(); I < E; ++I) {
188	StringRef Name = getSymbolName(Sym: Globals [I]);
189	Records [I].Name = Name.data();
190	Records [I].NameLen = Name.size();
191	Records [I].SymOffset = SymOffset;
192	SymOffset += Globals [I].length();
193	}
194
195	GSH ->finalizeBuckets(RecordZeroOffset, Globals: Records);
196	}
197
198	void GSIHashStreamBuilder::finalizeBuckets(
199	uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {
200	// Hash every name in parallel.
201	parallelFor(Begin: `0`, End: Records.size(), Fn: [&](size_t I) {
202	Records [I].setBucketIdx(hashStringV1(Str: Records [I].Name) % IPHR_HASH);
203	});
204
205	// Count up the size of each bucket. Then, use an exclusive prefix sum to
206	// calculate the bucket start offsets. This is C++17 std::exclusive_scan, but
207	// we can't use it yet.
208	uint32_t BucketStarts[IPHR_HASH] = {`0`};
209	for (const BulkPublic &P : Records)
210	++BucketStarts[P.BucketIdx];
211	uint32_t Sum = `0`;
212	for (uint32_t &B : BucketStarts) {
213	uint32_t Size = B;
214	B = Sum;
215	Sum += Size;
216	}
217
218	// Place globals into the hash table in bucket order. When placing a global,
219	// update the bucket start. Every hash table slot should be filled. Always use
220	// a refcount of one for now.
221	HashRecords.resize(new_size: Records.size());
222	uint32_t BucketCursors[IPHR_HASH];
223	memcpy(dest: BucketCursors, src: BucketStarts, n: sizeof(BucketCursors));
224	for (int I = `0`, E = Records.size(); I < E; ++I) {
225	uint32_t HashIdx = BucketCursors[Records [I].BucketIdx]++;
226	HashRecords [HashIdx].Off = I;
227	HashRecords [HashIdx].CRef = `1`;
228	}
229
230	// Within the buckets, sort each bucket by memcmp of the symbol's name. It's
231	// important that we use the same sorting algorithm as is used by the
232	// reference implementation to ensure that the search for a record within a
233	// bucket can properly early-out when it detects the record won't be found.
234	// The algorithm used here corresponds to the function
235	// caseInsensitiveComparePchPchCchCch in the reference implementation.
236	parallelFor(Begin: `0`, End: IPHR_HASH, Fn: [&](size_t I) {
237	auto B = HashRecords.begin() + BucketStarts[I];
238	auto E = HashRecords.begin() + BucketCursors[I];
239	if (B == E)
240	return;
241	auto BucketCmp = [Records](const PSHashRecord &LHash,
242	const PSHashRecord &RHash) {
243	const BulkPublic &L = Records [uint32_t(LHash.Off)];
244	const BulkPublic &R = Records [uint32_t(RHash.Off)];
245	assert(L.BucketIdx == R.BucketIdx);
246	int Cmp = gsiRecordCmp(S1: L.getName(), S2: R.getName());
247	if (Cmp != `0`)
248	return Cmp < `0`;
249	// This comparison is necessary to make the sorting stable in the presence
250	// of two static globals with the same name. The easiest way to observe
251	// this is with S_LDATA32 records.
252	return L.SymOffset < R.SymOffset;
253	};
254	llvm::sort(Start: B, End: E, Comp: BucketCmp);
255
256	// After we are done sorting, replace the global indices with the stream
257	// offsets of each global. Add one when writing symbol offsets to disk.
258	// See GSI1::fixSymRecs.
259	for (PSHashRecord &HRec : make_range(x: B, y: E))
260	HRec.Off = Records [uint32_t(HRec.Off)].SymOffset + `1`;
261	});
262
263	// For each non-empty bucket, push the bucket start offset into HashBuckets
264	// and set a bit in the hash bitmap.
265	for (uint32_t I = `0`; I < HashBitmap.size(); ++I) {
266	uint32_t Word = `0`;
267	for (uint32_t J = `0`; J < `32`; ++J) {
268	// Skip empty buckets.
269	uint32_t BucketIdx = I * `32` + J;
270	if (BucketIdx >= IPHR_HASH \|\|
271	BucketStarts[BucketIdx] == BucketCursors[BucketIdx])
272	continue;
273	Word \|= (`1U` << J);
274
275	// Calculate what the offset of the first hash record in the chain would
276	// be if it were inflated to contain 32-bit pointers. On a 32-bit system,
277	// each record would be 12 bytes. See HROffsetCalc in gsi.h.
278	const int SizeOfHROffsetCalc = `12`;
279	ulittle32_t ChainStartOff =
280	ulittle32_t (BucketStarts[BucketIdx] * SizeOfHROffsetCalc);
281	HashBuckets.push_back(x: ChainStartOff);
282	}
283	HashBitmap [I] = Word;
284	}
285	}
286
287	GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
288	: Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
289	GSH(std::make_unique<GSIHashStreamBuilder>()) {}
290
291	GSIStreamBuilder::~GSIStreamBuilder() = default;
292
293	uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
294	uint32_t Size = `0`;
295	Size += sizeof(PublicsStreamHeader);
296	Size += PSH ->calculateSerializedLength();
297	Size += Publics.size() * sizeof(uint32_t); // AddrMap
298	// FIXME: Add thunk map and section offsets for incremental linking.
299
300	return Size;
301	}
302
303	uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
304	return GSH ->calculateSerializedLength();
305	}
306
307	Error GSIStreamBuilder::finalizeMsfLayout() {
308	// First we write public symbol records, then we write global symbol records.
309	finalizePublicBuckets();
310	finalizeGlobalBuckets(RecordZeroOffset: PSH ->RecordByteSize);
311
312	Expected<uint32_t> Idx = Msf.addStream(Size: calculateGlobalsHashStreamSize());
313	if (!Idx)
314	return Idx.takeError();
315	GlobalsStreamIndex = *Idx;
316
317	Idx = Msf.addStream(Size: calculatePublicsHashStreamSize());
318	if (!Idx)
319	return Idx.takeError();
320	PublicsStreamIndex = *Idx;
321
322	uint32_t RecordBytes = PSH ->RecordByteSize + GSH ->RecordByteSize;
323
324	Idx = Msf.addStream(Size: RecordBytes);
325	if (!Idx)
326	return Idx.takeError();
327	RecordStreamIndex = *Idx;
328	return Error::success();
329	}
330
331	void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {
332	assert(Publics.empty() && PSH->RecordByteSize == `0` &&
333	"publics can only be added once");
334	Publics = std::move(PublicsIn);
335
336	// Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
337	parallelSort(R&: Publics, Comp: [](const BulkPublic &L, const BulkPublic &R) {
338	return L.getName() < R.getName();
339	});
340
341	// Assign offsets and calculate the length of the public symbol records.
342	uint32_t SymOffset = `0`;
343	for (BulkPublic &Pub : Publics) {
344	Pub.SymOffset = SymOffset;
345	SymOffset += sizeOfPublic(Pub);
346	}
347
348	// Remember the length of the public stream records.
349	PSH ->RecordByteSize = SymOffset;
350	}
351
352	void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
353	serializeAndAddGlobal(Symbol: Sym);
354	}
355
356	void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
357	serializeAndAddGlobal(Symbol: Sym);
358	}
359
360	void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
361	serializeAndAddGlobal(Symbol: Sym);
362	}
363
364	template <typename T>
365	void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {
366	T Copy(Symbol);
367	addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
368	CodeViewContainer::Pdb));
369	}
370
371	void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {
372	// Ignore duplicate typedefs and constants.
373	if (Symbol.kind() == S_UDT \|\| Symbol.kind() == S_CONSTANT) {
374	auto Iter = GlobalsSeen.insert(V: Symbol);
375	if (!Iter.second)
376	return;
377	}
378	GSH ->RecordByteSize += Symbol.length();
379	Globals.push_back(x: Symbol);
380	}
381
382	// Serialize each public and write it.
383	static Error writePublics(BinaryStreamWriter &Writer,
384	ArrayRef<BulkPublic> Publics) {
385	std::vector<uint8_t> Storage;
386	for (const BulkPublic &Pub : Publics) {
387	Storage.resize(new_size: sizeOfPublic(Pub));
388	serializePublic(Mem: Storage.data(), Pub);
389	if (Error E = Writer.writeBytes(Buffer: Storage))
390	return E;
391	}
392	return Error::success();
393	}
394
395	static Error writeRecords(BinaryStreamWriter &Writer,
396	ArrayRef<CVSymbol> Records) {
397	BinaryItemStream<CVSymbol> ItemStream(llvm::endianness::little);
398	ItemStream.setItems(Records);
399	BinaryStreamRef RecordsRef(ItemStream);
400	return Writer.writeStreamRef(Ref: RecordsRef);
401	}
402
403	Error GSIStreamBuilder::commitSymbolRecordStream(
404	WritableBinaryStreamRef Stream) {
405	BinaryStreamWriter Writer(Stream);
406
407	// Write public symbol records first, followed by global symbol records. This
408	// must match the order that we assume in finalizeMsfLayout when computing
409	// PSHZero and GSHZero.
410	if (auto EC = writePublics(Writer, Publics))
411	return EC;
412	if (auto EC = writeRecords(Writer, Records: Globals))
413	return EC;
414
415	return Error::success();
416	}
417
418	static std::vector<support::ulittle32_t>
419	computeAddrMap(ArrayRef<BulkPublic> Publics) {
420	// Build a parallel vector of indices into the Publics vector, and sort it by
421	// address.
422	std::vector<ulittle32_t> PubAddrMap;
423	PubAddrMap.reserve(n: Publics.size());
424	for (int I = `0`, E = Publics.size(); I < E; ++I)
425	PubAddrMap.push_back(x: ulittle32_t (I));
426
427	auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {
428	const BulkPublic &L = Publics [LIdx];
429	const BulkPublic &R = Publics [RIdx];
430	if (L.Segment != R.Segment)
431	return L.Segment < R.Segment;
432	if (L.Offset != R.Offset)
433	return L.Offset < R.Offset;
434	// parallelSort is unstable, so we have to do name comparison to ensure
435	// that two names for the same location come out in a deterministic order.
436	return L.getName() < R.getName();
437	};
438	parallelSort(R&: PubAddrMap, Comp: AddrCmp);
439
440	// Rewrite the public symbol indices into symbol offsets.
441	for (ulittle32_t &Entry : PubAddrMap)
442	Entry = Publics [Entry].SymOffset;
443	return PubAddrMap;
444	}
445
446	Error GSIStreamBuilder::commitPublicsHashStream(
447	WritableBinaryStreamRef Stream) {
448	BinaryStreamWriter Writer(Stream);
449	PublicsStreamHeader Header;
450
451	// FIXME: Fill these in. They are for incremental linking.
452	Header.SymHash = PSH ->calculateSerializedLength();
453	Header.AddrMap = Publics.size() * `4`;
454	Header.NumThunks = `0`;
455	Header.SizeOfThunk = `0`;
456	Header.ISectThunkTable = `0`;
457	memset(s: Header.Padding, c: `0`, n: sizeof(Header.Padding));
458	Header.OffThunkTable = `0`;
459	Header.NumSections = `0`;
460	if (auto EC = Writer.writeObject(Obj: Header))
461	return EC;
462
463	if (auto EC = PSH ->commit(Writer))
464	return EC;
465
466	std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);
467	assert(PubAddrMap.size() == Publics.size());
468	if (auto EC = Writer.writeArray(Array: ArrayRef(PubAddrMap)))
469	return EC;
470
471	return Error::success();
472	}
473
474	Error GSIStreamBuilder::commitGlobalsHashStream(
475	WritableBinaryStreamRef Stream) {
476	BinaryStreamWriter Writer(Stream);
477	return GSH ->commit(Writer);
478	}
479
480	Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
481	WritableBinaryStreamRef Buffer) {
482	llvm::TimeTraceScope timeScope("Commit GSI stream");
483	auto GS = WritableMappedBlockStream::createIndexedStream(
484	Layout, MsfData: Buffer, StreamIndex: getGlobalsStreamIndex(), Allocator&: Msf.getAllocator());
485	auto PS = WritableMappedBlockStream::createIndexedStream(
486	Layout, MsfData: Buffer, StreamIndex: getPublicsStreamIndex(), Allocator&: Msf.getAllocator());
487	auto PRS = WritableMappedBlockStream::createIndexedStream(
488	Layout, MsfData: Buffer, StreamIndex: getRecordStreamIndex(), Allocator&: Msf.getAllocator());
489
490	if (auto EC = commitSymbolRecordStream(Stream: *PRS))
491	return EC;
492	if (auto EC = commitGlobalsHashStream(Stream: *GS))
493	return EC;
494	if (auto EC = commitPublicsHashStream(Stream: *PS))
495	return EC;
496	return Error::success();
497	}
498

Browse the source code of llvm_projects/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp