1//===- GsymCreator.cpp ----------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//===----------------------------------------------------------------------===//
7
8#include "llvm/DebugInfo/GSYM/GsymCreator.h"
9#include "llvm/DebugInfo/GSYM/FileWriter.h"
10#include "llvm/DebugInfo/GSYM/Header.h"
11#include "llvm/DebugInfo/GSYM/LineTable.h"
12#include "llvm/DebugInfo/GSYM/OutputAggregator.h"
13#include "llvm/MC/StringTableBuilder.h"
14#include "llvm/Support/raw_ostream.h"
15
16#include <algorithm>
17#include <cassert>
18#include <functional>
19#include <vector>
20
21using namespace llvm;
22using namespace gsym;
23
24GsymCreator::GsymCreator(bool Quiet)
25 : StrTab(StringTableBuilder::ELF), Quiet(Quiet) {
26 insertFile(Path: StringRef());
27}
28
29uint32_t GsymCreator::insertFile(StringRef Path, llvm::sys::path::Style Style) {
30 llvm::StringRef directory = llvm::sys::path::parent_path(path: Path, style: Style);
31 llvm::StringRef filename = llvm::sys::path::filename(path: Path, style: Style);
32 // We must insert the strings first, then call the FileEntry constructor.
33 // If we inline the insertString() function call into the constructor, the
34 // call order is undefined due to parameter lists not having any ordering
35 // requirements.
36 const uint32_t Dir = insertString(S: directory);
37 const uint32_t Base = insertString(S: filename);
38 return insertFileEntry(FE: FileEntry(Dir, Base));
39}
40
41uint32_t GsymCreator::insertFileEntry(FileEntry FE) {
42 std::lock_guard<std::mutex> Guard(Mutex);
43 const auto NextIndex = Files.size();
44 // Find FE in hash map and insert if not present.
45 auto R = FileEntryToIndex.insert(KV: std::make_pair(x&: FE, y: NextIndex));
46 if (R.second)
47 Files.emplace_back(args&: FE);
48 return R.first->second;
49}
50
51uint32_t GsymCreator::copyFile(const GsymCreator &SrcGC, uint32_t FileIdx) {
52 // File index zero is reserved for a FileEntry with no directory and no
53 // filename. Any other file and we need to copy the strings for the directory
54 // and filename.
55 if (FileIdx == 0)
56 return 0;
57 const FileEntry SrcFE = SrcGC.Files[FileIdx];
58 // Copy the strings for the file and then add the newly converted file entry.
59 uint32_t Dir =
60 SrcFE.Dir == 0
61 ? 0
62 : StrTab.add(S: SrcGC.StringOffsetMap.find(Val: SrcFE.Dir)->second);
63 uint32_t Base = StrTab.add(S: SrcGC.StringOffsetMap.find(Val: SrcFE.Base)->second);
64 FileEntry DstFE(Dir, Base);
65 return insertFileEntry(FE: DstFE);
66}
67
68llvm::Error GsymCreator::save(StringRef Path, llvm::endianness ByteOrder,
69 std::optional<uint64_t> SegmentSize) const {
70 if (SegmentSize)
71 return saveSegments(Path, ByteOrder, SegmentSize: *SegmentSize);
72 std::error_code EC;
73 raw_fd_ostream OutStrm(Path, EC);
74 if (EC)
75 return llvm::errorCodeToError(EC);
76 FileWriter O(OutStrm, ByteOrder);
77 return encode(O);
78}
79
80llvm::Error GsymCreator::encode(FileWriter &O) const {
81 std::lock_guard<std::mutex> Guard(Mutex);
82 if (Funcs.empty())
83 return createStringError(EC: std::errc::invalid_argument,
84 Fmt: "no functions to encode");
85 if (!Finalized)
86 return createStringError(EC: std::errc::invalid_argument,
87 Fmt: "GsymCreator wasn't finalized prior to encoding");
88
89 if (Funcs.size() > UINT32_MAX)
90 return createStringError(EC: std::errc::invalid_argument,
91 Fmt: "too many FunctionInfos");
92
93 std::optional<uint64_t> BaseAddress = getBaseAddress();
94 // Base address should be valid if we have any functions.
95 if (!BaseAddress)
96 return createStringError(EC: std::errc::invalid_argument,
97 Fmt: "invalid base address");
98 Header Hdr;
99 Hdr.Magic = GSYM_MAGIC;
100 Hdr.Version = GSYM_VERSION;
101 Hdr.AddrOffSize = getAddressOffsetSize();
102 Hdr.UUIDSize = static_cast<uint8_t>(UUID.size());
103 Hdr.BaseAddress = *BaseAddress;
104 Hdr.NumAddresses = static_cast<uint32_t>(Funcs.size());
105 Hdr.StrtabOffset = 0; // We will fix this up later.
106 Hdr.StrtabSize = 0; // We will fix this up later.
107 memset(s: Hdr.UUID, c: 0, n: sizeof(Hdr.UUID));
108 if (UUID.size() > sizeof(Hdr.UUID))
109 return createStringError(EC: std::errc::invalid_argument,
110 Fmt: "invalid UUID size %u", Vals: (uint32_t)UUID.size());
111 // Copy the UUID value if we have one.
112 if (UUID.size() > 0)
113 memcpy(dest: Hdr.UUID, src: UUID.data(), n: UUID.size());
114 // Write out the header.
115 llvm::Error Err = Hdr.encode(O);
116 if (Err)
117 return Err;
118
119 const uint64_t MaxAddressOffset = getMaxAddressOffset();
120 // Write out the address offsets.
121 O.alignTo(Align: Hdr.AddrOffSize);
122 for (const auto &FuncInfo : Funcs) {
123 uint64_t AddrOffset = FuncInfo.startAddress() - Hdr.BaseAddress;
124 // Make sure we calculated the address offsets byte size correctly by
125 // verifying the current address offset is within ranges. We have seen bugs
126 // introduced when the code changes that can cause problems here so it is
127 // good to catch this during testing.
128 assert(AddrOffset <= MaxAddressOffset);
129 (void)MaxAddressOffset;
130 switch (Hdr.AddrOffSize) {
131 case 1:
132 O.writeU8(Value: static_cast<uint8_t>(AddrOffset));
133 break;
134 case 2:
135 O.writeU16(Value: static_cast<uint16_t>(AddrOffset));
136 break;
137 case 4:
138 O.writeU32(Value: static_cast<uint32_t>(AddrOffset));
139 break;
140 case 8:
141 O.writeU64(Value: AddrOffset);
142 break;
143 }
144 }
145
146 // Write out all zeros for the AddrInfoOffsets.
147 O.alignTo(Align: 4);
148 const off_t AddrInfoOffsetsOffset = O.tell();
149 for (size_t i = 0, n = Funcs.size(); i < n; ++i)
150 O.writeU32(Value: 0);
151
152 // Write out the file table
153 O.alignTo(Align: 4);
154 assert(!Files.empty());
155 assert(Files[0].Dir == 0);
156 assert(Files[0].Base == 0);
157 size_t NumFiles = Files.size();
158 if (NumFiles > UINT32_MAX)
159 return createStringError(EC: std::errc::invalid_argument, Fmt: "too many files");
160 O.writeU32(Value: static_cast<uint32_t>(NumFiles));
161 for (auto File : Files) {
162 O.writeU32(Value: File.Dir);
163 O.writeU32(Value: File.Base);
164 }
165
166 // Write out the string table.
167 const off_t StrtabOffset = O.tell();
168 StrTab.write(OS&: O.get_stream());
169 const off_t StrtabSize = O.tell() - StrtabOffset;
170 std::vector<uint32_t> AddrInfoOffsets;
171
172 // Write out the address infos for each function info.
173 for (const auto &FuncInfo : Funcs) {
174 if (Expected<uint64_t> OffsetOrErr = FuncInfo.encode(O))
175 AddrInfoOffsets.push_back(x: OffsetOrErr.get());
176 else
177 return OffsetOrErr.takeError();
178 }
179 // Fixup the string table offset and size in the header
180 O.fixup32(Value: (uint32_t)StrtabOffset, offsetof(Header, StrtabOffset));
181 O.fixup32(Value: (uint32_t)StrtabSize, offsetof(Header, StrtabSize));
182
183 // Fixup all address info offsets
184 uint64_t Offset = 0;
185 for (auto AddrInfoOffset : AddrInfoOffsets) {
186 O.fixup32(Value: AddrInfoOffset, Offset: AddrInfoOffsetsOffset + Offset);
187 Offset += 4;
188 }
189 return ErrorSuccess();
190}
191
192llvm::Error GsymCreator::finalize(OutputAggregator &Out) {
193 std::lock_guard<std::mutex> Guard(Mutex);
194 if (Finalized)
195 return createStringError(EC: std::errc::invalid_argument, Fmt: "already finalized");
196 Finalized = true;
197
198 // Don't let the string table indexes change by finalizing in order.
199 StrTab.finalizeInOrder();
200
201 // Remove duplicates function infos that have both entries from debug info
202 // (DWARF or Breakpad) and entries from the SymbolTable.
203 //
204 // Also handle overlapping function. Usually there shouldn't be any, but they
205 // can and do happen in some rare cases.
206 //
207 // (a) (b) (c)
208 // ^ ^ ^ ^
209 // |X |Y |X ^ |X
210 // | | | |Y | ^
211 // | | | v v |Y
212 // v v v v
213 //
214 // In (a) and (b), Y is ignored and X will be reported for the full range.
215 // In (c), both functions will be included in the result and lookups for an
216 // address in the intersection will return Y because of binary search.
217 //
218 // Note that in case of (b), we cannot include Y in the result because then
219 // we wouldn't find any function for range (end of Y, end of X)
220 // with binary search
221
222 const auto NumBefore = Funcs.size();
223 // Only sort and unique if this isn't a segment. If this is a segment we
224 // already finalized the main GsymCreator with all of the function infos
225 // and then the already sorted and uniqued function infos were added to this
226 // object.
227 if (!IsSegment) {
228 if (NumBefore > 1) {
229 // Sort function infos so we can emit sorted functions.
230 llvm::sort(C&: Funcs);
231 std::vector<FunctionInfo> FinalizedFuncs;
232 FinalizedFuncs.reserve(n: Funcs.size());
233 FinalizedFuncs.emplace_back(args: std::move(Funcs.front()));
234 for (size_t Idx=1; Idx < NumBefore; ++Idx) {
235 FunctionInfo &Prev = FinalizedFuncs.back();
236 FunctionInfo &Curr = Funcs[Idx];
237 // Empty ranges won't intersect, but we still need to
238 // catch the case where we have multiple symbols at the
239 // same address and coalesce them.
240 const bool ranges_equal = Prev.Range == Curr.Range;
241 if (ranges_equal || Prev.Range.intersects(R: Curr.Range)) {
242 // Overlapping ranges or empty identical ranges.
243 if (ranges_equal) {
244 // Same address range. Check if one is from debug
245 // info and the other is from a symbol table. If
246 // so, then keep the one with debug info. Our
247 // sorting guarantees that entries with matching
248 // address ranges that have debug info are last in
249 // the sort.
250 if (!(Prev == Curr)) {
251 if (Prev.hasRichInfo() && Curr.hasRichInfo())
252 Out.Report(
253 s: "Duplicate address ranges with different debug info.",
254 detailCallback: [&](raw_ostream &OS) {
255 OS << "warning: same address range contains "
256 "different debug "
257 << "info. Removing:\n"
258 << Prev << "\nIn favor of this one:\n"
259 << Curr << "\n";
260 });
261
262 // We want to swap the current entry with the previous since
263 // later entries with the same range always have more debug info
264 // or different debug info.
265 std::swap(a&: Prev, b&: Curr);
266 }
267 } else {
268 Out.Report(s: "Overlapping function ranges", detailCallback: [&](raw_ostream &OS) {
269 // print warnings about overlaps
270 OS << "warning: function ranges overlap:\n"
271 << Prev << "\n"
272 << Curr << "\n";
273 });
274 FinalizedFuncs.emplace_back(args: std::move(Curr));
275 }
276 } else {
277 if (Prev.Range.size() == 0 && Curr.Range.contains(Addr: Prev.Range.start())) {
278 // Symbols on macOS don't have address ranges, so if the range
279 // doesn't match and the size is zero, then we replace the empty
280 // symbol function info with the current one.
281 std::swap(a&: Prev, b&: Curr);
282 } else {
283 FinalizedFuncs.emplace_back(args: std::move(Curr));
284 }
285 }
286 }
287 std::swap(x&: Funcs, y&: FinalizedFuncs);
288 }
289 // If our last function info entry doesn't have a size and if we have valid
290 // text ranges, we should set the size of the last entry since any search for
291 // a high address might match our last entry. By fixing up this size, we can
292 // help ensure we don't cause lookups to always return the last symbol that
293 // has no size when doing lookups.
294 if (!Funcs.empty() && Funcs.back().Range.size() == 0 && ValidTextRanges) {
295 if (auto Range =
296 ValidTextRanges->getRangeThatContains(Addr: Funcs.back().Range.start())) {
297 Funcs.back().Range = {Funcs.back().Range.start(), Range->end()};
298 }
299 }
300 Out << "Pruned " << NumBefore - Funcs.size() << " functions, ended with "
301 << Funcs.size() << " total\n";
302 }
303 return Error::success();
304}
305
306uint32_t GsymCreator::copyString(const GsymCreator &SrcGC, uint32_t StrOff) {
307 // String offset at zero is always the empty string, no copying needed.
308 if (StrOff == 0)
309 return 0;
310 return StrTab.add(S: SrcGC.StringOffsetMap.find(Val: StrOff)->second);
311}
312
313uint32_t GsymCreator::insertString(StringRef S, bool Copy) {
314 if (S.empty())
315 return 0;
316
317 // The hash can be calculated outside the lock.
318 CachedHashStringRef CHStr(S);
319 std::lock_guard<std::mutex> Guard(Mutex);
320 if (Copy) {
321 // We need to provide backing storage for the string if requested
322 // since StringTableBuilder stores references to strings. Any string
323 // that comes from a section in an object file doesn't need to be
324 // copied, but any string created by code will need to be copied.
325 // This allows GsymCreator to be really fast when parsing DWARF and
326 // other object files as most strings don't need to be copied.
327 if (!StrTab.contains(S: CHStr))
328 CHStr = CachedHashStringRef{StringStorage.insert(key: S).first->getKey(),
329 CHStr.hash()};
330 }
331 const uint32_t StrOff = StrTab.add(S: CHStr);
332 // Save a mapping of string offsets to the cached string reference in case
333 // we need to segment the GSYM file and copy string from one string table to
334 // another.
335 if (StringOffsetMap.count(Val: StrOff) == 0)
336 StringOffsetMap.insert(KV: std::make_pair(x: StrOff, y&: CHStr));
337 return StrOff;
338}
339
340void GsymCreator::addFunctionInfo(FunctionInfo &&FI) {
341 std::lock_guard<std::mutex> Guard(Mutex);
342 Funcs.emplace_back(args: std::move(FI));
343}
344
345void GsymCreator::forEachFunctionInfo(
346 std::function<bool(FunctionInfo &)> const &Callback) {
347 std::lock_guard<std::mutex> Guard(Mutex);
348 for (auto &FI : Funcs) {
349 if (!Callback(FI))
350 break;
351 }
352}
353
354void GsymCreator::forEachFunctionInfo(
355 std::function<bool(const FunctionInfo &)> const &Callback) const {
356 std::lock_guard<std::mutex> Guard(Mutex);
357 for (const auto &FI : Funcs) {
358 if (!Callback(FI))
359 break;
360 }
361}
362
363size_t GsymCreator::getNumFunctionInfos() const {
364 std::lock_guard<std::mutex> Guard(Mutex);
365 return Funcs.size();
366}
367
368bool GsymCreator::IsValidTextAddress(uint64_t Addr) const {
369 if (ValidTextRanges)
370 return ValidTextRanges->contains(Addr);
371 return true; // No valid text ranges has been set, so accept all ranges.
372}
373
374std::optional<uint64_t> GsymCreator::getFirstFunctionAddress() const {
375 // If we have finalized then Funcs are sorted. If we are a segment then
376 // Funcs will be sorted as well since function infos get added from an
377 // already finalized GsymCreator object where its functions were sorted and
378 // uniqued.
379 if ((Finalized || IsSegment) && !Funcs.empty())
380 return std::optional<uint64_t>(Funcs.front().startAddress());
381 return std::nullopt;
382}
383
384std::optional<uint64_t> GsymCreator::getLastFunctionAddress() const {
385 // If we have finalized then Funcs are sorted. If we are a segment then
386 // Funcs will be sorted as well since function infos get added from an
387 // already finalized GsymCreator object where its functions were sorted and
388 // uniqued.
389 if ((Finalized || IsSegment) && !Funcs.empty())
390 return std::optional<uint64_t>(Funcs.back().startAddress());
391 return std::nullopt;
392}
393
394std::optional<uint64_t> GsymCreator::getBaseAddress() const {
395 if (BaseAddress)
396 return BaseAddress;
397 return getFirstFunctionAddress();
398}
399
400uint64_t GsymCreator::getMaxAddressOffset() const {
401 switch (getAddressOffsetSize()) {
402 case 1: return UINT8_MAX;
403 case 2: return UINT16_MAX;
404 case 4: return UINT32_MAX;
405 case 8: return UINT64_MAX;
406 }
407 llvm_unreachable("invalid address offset");
408}
409
410uint8_t GsymCreator::getAddressOffsetSize() const {
411 const std::optional<uint64_t> BaseAddress = getBaseAddress();
412 const std::optional<uint64_t> LastFuncAddr = getLastFunctionAddress();
413 if (BaseAddress && LastFuncAddr) {
414 const uint64_t AddrDelta = *LastFuncAddr - *BaseAddress;
415 if (AddrDelta <= UINT8_MAX)
416 return 1;
417 else if (AddrDelta <= UINT16_MAX)
418 return 2;
419 else if (AddrDelta <= UINT32_MAX)
420 return 4;
421 return 8;
422 }
423 return 1;
424}
425
426uint64_t GsymCreator::calculateHeaderAndTableSize() const {
427 uint64_t Size = sizeof(Header);
428 const size_t NumFuncs = Funcs.size();
429 // Add size of address offset table
430 Size += NumFuncs * getAddressOffsetSize();
431 // Add size of address info offsets which are 32 bit integers in version 1.
432 Size += NumFuncs * sizeof(uint32_t);
433 // Add file table size
434 Size += Files.size() * sizeof(FileEntry);
435 // Add string table size
436 Size += StrTab.getSize();
437
438 return Size;
439}
440
441// This function takes a InlineInfo class that was copy constructed from an
442// InlineInfo from the \a SrcGC and updates all members that point to strings
443// and files to point to strings and files from this GsymCreator.
444void GsymCreator::fixupInlineInfo(const GsymCreator &SrcGC, InlineInfo &II) {
445 II.Name = copyString(SrcGC, StrOff: II.Name);
446 II.CallFile = copyFile(SrcGC, FileIdx: II.CallFile);
447 for (auto &ChildII: II.Children)
448 fixupInlineInfo(SrcGC, II&: ChildII);
449}
450
451uint64_t GsymCreator::copyFunctionInfo(const GsymCreator &SrcGC, size_t FuncIdx) {
452 // To copy a function info we need to copy any files and strings over into
453 // this GsymCreator and then copy the function info and update the string
454 // table offsets to match the new offsets.
455 const FunctionInfo &SrcFI = SrcGC.Funcs[FuncIdx];
456
457 FunctionInfo DstFI;
458 DstFI.Range = SrcFI.Range;
459 DstFI.Name = copyString(SrcGC, StrOff: SrcFI.Name);
460 // Copy the line table if there is one.
461 if (SrcFI.OptLineTable) {
462 // Copy the entire line table.
463 DstFI.OptLineTable = LineTable(SrcFI.OptLineTable.value());
464 // Fixup all LineEntry::File entries which are indexes in the the file table
465 // from SrcGC and must be converted to file indexes from this GsymCreator.
466 LineTable &DstLT = DstFI.OptLineTable.value();
467 const size_t NumLines = DstLT.size();
468 for (size_t I=0; I<NumLines; ++I) {
469 LineEntry &LE = DstLT.get(i: I);
470 LE.File = copyFile(SrcGC, FileIdx: LE.File);
471 }
472 }
473 // Copy the inline information if needed.
474 if (SrcFI.Inline) {
475 // Make a copy of the source inline information.
476 DstFI.Inline = SrcFI.Inline.value();
477 // Fixup all strings and files in the copied inline information.
478 fixupInlineInfo(SrcGC, II&: *DstFI.Inline);
479 }
480 std::lock_guard<std::mutex> Guard(Mutex);
481 Funcs.emplace_back(args&: DstFI);
482 return Funcs.back().cacheEncoding();
483}
484
485llvm::Error GsymCreator::saveSegments(StringRef Path,
486 llvm::endianness ByteOrder,
487 uint64_t SegmentSize) const {
488 if (SegmentSize == 0)
489 return createStringError(EC: std::errc::invalid_argument,
490 Fmt: "invalid segment size zero");
491
492 size_t FuncIdx = 0;
493 const size_t NumFuncs = Funcs.size();
494 while (FuncIdx < NumFuncs) {
495 llvm::Expected<std::unique_ptr<GsymCreator>> ExpectedGC =
496 createSegment(SegmentSize, FuncIdx);
497 if (ExpectedGC) {
498 GsymCreator *GC = ExpectedGC->get();
499 if (GC == NULL)
500 break; // We had not more functions to encode.
501 // Don't collect any messages at all
502 OutputAggregator Out(nullptr);
503 llvm::Error Err = GC->finalize(Out);
504 if (Err)
505 return Err;
506 std::string SegmentedGsymPath;
507 raw_string_ostream SGP(SegmentedGsymPath);
508 std::optional<uint64_t> FirstFuncAddr = GC->getFirstFunctionAddress();
509 if (FirstFuncAddr) {
510 SGP << Path << "-" << llvm::format_hex(N: *FirstFuncAddr, Width: 1);
511 SGP.flush();
512 Err = GC->save(Path: SegmentedGsymPath, ByteOrder, SegmentSize: std::nullopt);
513 if (Err)
514 return Err;
515 }
516 } else {
517 return ExpectedGC.takeError();
518 }
519 }
520 return Error::success();
521}
522
523llvm::Expected<std::unique_ptr<GsymCreator>>
524GsymCreator::createSegment(uint64_t SegmentSize, size_t &FuncIdx) const {
525 // No function entries, return empty unique pointer
526 if (FuncIdx >= Funcs.size())
527 return std::unique_ptr<GsymCreator>();
528
529 std::unique_ptr<GsymCreator> GC(new GsymCreator(/*Quiet=*/true));
530
531 // Tell the creator that this is a segment.
532 GC->setIsSegment();
533
534 // Set the base address if there is one.
535 if (BaseAddress)
536 GC->setBaseAddress(*BaseAddress);
537 // Copy the UUID value from this object into the new creator.
538 GC->setUUID(UUID);
539 const size_t NumFuncs = Funcs.size();
540 // Track how big the function infos are for the current segment so we can
541 // emit segments that are close to the requested size. It is quick math to
542 // determine the current header and tables sizes, so we can do that each loop.
543 uint64_t SegmentFuncInfosSize = 0;
544 for (; FuncIdx < NumFuncs; ++FuncIdx) {
545 const uint64_t HeaderAndTableSize = GC->calculateHeaderAndTableSize();
546 if (HeaderAndTableSize + SegmentFuncInfosSize >= SegmentSize) {
547 if (SegmentFuncInfosSize == 0)
548 return createStringError(EC: std::errc::invalid_argument,
549 Fmt: "a segment size of %" PRIu64 " is to small to "
550 "fit any function infos, specify a larger value",
551 Vals: SegmentSize);
552
553 break;
554 }
555 SegmentFuncInfosSize += alignTo(Value: GC->copyFunctionInfo(SrcGC: *this, FuncIdx), Align: 4);
556 }
557 return std::move(GC);
558}
559