1//===----------------------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file implements OnDiskGraphDB, an on-disk CAS nodes database,
11/// independent of a particular hashing algorithm. It only needs to be
12/// configured for the hash size and controls the schema of the storage.
13///
14/// OnDiskGraphDB defines:
15///
16/// - How the data is stored inside database, either as a standalone file, or
17/// allocated inside a datapool.
18/// - How references to other objects inside the same database is stored. They
19/// are stored as internal references, instead of full hash value to save
20/// space.
21/// - How to chain databases together and import objects from upstream
22/// databases.
23///
24/// Here's a top-level description of the current layout:
25///
26/// - db/index.<version>: a file for the "index" table, named by \a
27/// IndexTableName and managed by \a TrieRawHashMap. The contents are 8B
28/// that are accessed atomically, describing the object kind and where/how
29/// it's stored (including an optional file offset). See \a TrieRecord for
30/// more details.
31/// - db/data.<version>: a file for the "data" table, named by \a
32/// DataPoolTableName and managed by \a DataStore. New objects within
33/// TrieRecord::MaxEmbeddedSize are inserted here as \a
34/// TrieRecord::StorageKind::DataPool.
35/// - db/obj.<offset>.<version>: a file storing an object outside the main
36/// "data" table, named by its offset into the "index" table, with the
37/// format of \a TrieRecord::StorageKind::Standalone.
38/// - db/leaf.<offset>.<version>: a file storing a leaf node outside the
39/// main "data" table, named by its offset into the "index" table, with
40/// the format of \a TrieRecord::StorageKind::StandaloneLeaf.
41/// - db/leaf+0.<offset>.<version>: a file storing a null-terminated leaf object
42/// outside the main "data" table, named by its offset into the "index" table,
43/// with the format of \a TrieRecord::StorageKind::StandaloneLeaf0.
44//
45//===----------------------------------------------------------------------===//
46
47#include "llvm/CAS/OnDiskGraphDB.h"
48#include "OnDiskCommon.h"
49#include "llvm/ADT/DenseMap.h"
50#include "llvm/ADT/ScopeExit.h"
51#include "llvm/ADT/StringExtras.h"
52#include "llvm/CAS/OnDiskCASLogger.h"
53#include "llvm/CAS/OnDiskDataAllocator.h"
54#include "llvm/CAS/OnDiskTrieRawHashMap.h"
55#include "llvm/Support/Alignment.h"
56#include "llvm/Support/Compiler.h"
57#include "llvm/Support/Errc.h"
58#include "llvm/Support/Error.h"
59#include "llvm/Support/ErrorHandling.h"
60#include "llvm/Support/FileSystem.h"
61#include "llvm/Support/IOSandbox.h"
62#include "llvm/Support/MemoryBuffer.h"
63#include "llvm/Support/Path.h"
64#include "llvm/Support/Process.h"
65#include <atomic>
66#include <mutex>
67#include <optional>
68
69#define DEBUG_TYPE "on-disk-cas"
70
71using namespace llvm;
72using namespace llvm::cas;
73using namespace llvm::cas::ondisk;
74
// Table names embedded inside the on-disk index / data-pool files.
static constexpr StringLiteral IndexTableName = "llvm.cas.index";
static constexpr StringLiteral DataPoolTableName = "llvm.cas.data";

// File-name prefixes for the main index and data-pool files
// ("index.<version>", "data.<version>"; see the file header comment).
static constexpr StringLiteral IndexFilePrefix = "index.";
static constexpr StringLiteral DataPoolFilePrefix = "data.";

// File-name prefixes for standalone objects stored outside the data pool;
// which prefix is used depends on TrieRecord::StorageKind.
static constexpr StringLiteral FilePrefixObject = "obj.";
static constexpr StringLiteral FilePrefixLeaf = "leaf.";
static constexpr StringLiteral FilePrefixLeaf0 = "leaf+0.";
84
85static Error createCorruptObjectError(Expected<ArrayRef<uint8_t>> ID) {
86 if (!ID)
87 return ID.takeError();
88
89 return createStringError(EC: llvm::errc::invalid_argument,
90 S: "corrupt object '" + toHex(Input: *ID) + "'");
91}
92
93namespace {
94
95/// Trie record data: 8 bytes, atomic<uint64_t>
96/// - 1-byte: StorageKind
97/// - 7-bytes: DataStoreOffset (offset into referenced file)
98class TrieRecord {
99public:
100 enum class StorageKind : uint8_t {
101 /// Unknown object.
102 Unknown = 0,
103
104 /// data.vX: main pool, full DataStore record.
105 DataPool = 1,
106
107 /// obj.<TrieRecordOffset>.vX: standalone, with a full DataStore record.
108 Standalone = 10,
109
110 /// leaf.<TrieRecordOffset>.vX: standalone, just the data. File contents
111 /// exactly the data content and file size matches the data size. No refs.
112 StandaloneLeaf = 11,
113
114 /// leaf+0.<TrieRecordOffset>.vX: standalone, just the data plus an
115 /// extra null character ('\0'). File size is 1 bigger than the data size.
116 /// No refs.
117 StandaloneLeaf0 = 12,
118 };
119
120 static StringRef getStandaloneFilePrefix(StorageKind SK) {
121 switch (SK) {
122 default:
123 llvm_unreachable("Expected standalone storage kind");
124 case TrieRecord::StorageKind::Standalone:
125 return FilePrefixObject;
126 case TrieRecord::StorageKind::StandaloneLeaf:
127 return FilePrefixLeaf;
128 case TrieRecord::StorageKind::StandaloneLeaf0:
129 return FilePrefixLeaf0;
130 }
131 }
132
133 enum Limits : int64_t {
134 /// Saves files bigger than 64KB standalone instead of embedding them.
135 MaxEmbeddedSize = 64LL * 1024LL - 1,
136 };
137
138 struct Data {
139 StorageKind SK = StorageKind::Unknown;
140 FileOffset Offset;
141 };
142
143 /// Pack StorageKind and Offset from Data into 8 byte TrieRecord.
144 static uint64_t pack(Data D) {
145 assert(D.Offset.get() < (int64_t)(1ULL << 56));
146 uint64_t Packed = uint64_t(D.SK) << 56 | D.Offset.get();
147 assert(D.SK != StorageKind::Unknown || Packed == 0);
148#ifndef NDEBUG
149 Data RoundTrip = unpack(Packed);
150 assert(D.SK == RoundTrip.SK);
151 assert(D.Offset.get() == RoundTrip.Offset.get());
152#endif
153 return Packed;
154 }
155
156 // Unpack TrieRecord into Data.
157 static Data unpack(uint64_t Packed) {
158 Data D;
159 if (!Packed)
160 return D;
161 D.SK = (StorageKind)(Packed >> 56);
162 D.Offset = FileOffset(Packed & (UINT64_MAX >> 8));
163 return D;
164 }
165
166 TrieRecord() : Storage(0) {}
167
168 Data load() const { return unpack(Packed: Storage); }
169 bool compare_exchange_strong(Data &Existing, Data New);
170
171private:
172 std::atomic<uint64_t> Storage;
173};
174
175/// DataStore record data: 4B + size? + refs? + data + 0
176/// - 4-bytes: Header
177/// - {0,4,8}-bytes: DataSize (may be packed in Header)
178/// - {0,4,8}-bytes: NumRefs (may be packed in Header)
179/// - NumRefs*{4,8}-bytes: Refs[] (end-ptr is 8-byte aligned)
180/// - <data>
181/// - 1-byte: 0-term
182struct DataRecordHandle {
183 /// NumRefs storage: 4B, 2B, 1B, or 0B (no refs). Or, 8B, for alignment
184 /// convenience to avoid computing padding later.
185 enum class NumRefsFlags : uint8_t {
186 Uses0B = 0U,
187 Uses1B = 1U,
188 Uses2B = 2U,
189 Uses4B = 3U,
190 Uses8B = 4U,
191 Max = Uses8B,
192 };
193
194 /// DataSize storage: 8B, 4B, 2B, or 1B.
195 enum class DataSizeFlags {
196 Uses1B = 0U,
197 Uses2B = 1U,
198 Uses4B = 2U,
199 Uses8B = 3U,
200 Max = Uses8B,
201 };
202
203 /// Kind of ref stored in Refs[]: InternalRef or InternalRef4B.
204 enum class RefKindFlags {
205 InternalRef = 0U,
206 InternalRef4B = 1U,
207 Max = InternalRef4B,
208 };
209
210 enum Counts : int {
211 NumRefsShift = 0,
212 NumRefsBits = 3,
213 DataSizeShift = NumRefsShift + NumRefsBits,
214 DataSizeBits = 2,
215 RefKindShift = DataSizeShift + DataSizeBits,
216 RefKindBits = 1,
217 };
218 static_assert(((UINT32_MAX << NumRefsBits) & (uint32_t)NumRefsFlags::Max) ==
219 0,
220 "Not enough bits");
221 static_assert(((UINT32_MAX << DataSizeBits) & (uint32_t)DataSizeFlags::Max) ==
222 0,
223 "Not enough bits");
224 static_assert(((UINT32_MAX << RefKindBits) & (uint32_t)RefKindFlags::Max) ==
225 0,
226 "Not enough bits");
227
228 /// Layout of the DataRecordHandle and how to decode it.
229 struct LayoutFlags {
230 NumRefsFlags NumRefs;
231 DataSizeFlags DataSize;
232 RefKindFlags RefKind;
233
234 static uint64_t pack(LayoutFlags LF) {
235 unsigned Packed = ((unsigned)LF.NumRefs << NumRefsShift) |
236 ((unsigned)LF.DataSize << DataSizeShift) |
237 ((unsigned)LF.RefKind << RefKindShift);
238#ifndef NDEBUG
239 LayoutFlags RoundTrip = unpack(Packed);
240 assert(LF.NumRefs == RoundTrip.NumRefs);
241 assert(LF.DataSize == RoundTrip.DataSize);
242 assert(LF.RefKind == RoundTrip.RefKind);
243#endif
244 return Packed;
245 }
246 static LayoutFlags unpack(uint64_t Storage) {
247 assert(Storage <= UINT8_MAX && "Expect storage to fit in a byte");
248 LayoutFlags LF;
249 LF.NumRefs =
250 (NumRefsFlags)((Storage >> NumRefsShift) & ((1U << NumRefsBits) - 1));
251 LF.DataSize = (DataSizeFlags)((Storage >> DataSizeShift) &
252 ((1U << DataSizeBits) - 1));
253 LF.RefKind =
254 (RefKindFlags)((Storage >> RefKindShift) & ((1U << RefKindBits) - 1));
255 return LF;
256 }
257 };
258
259 /// Header layout:
260 /// - 1-byte: LayoutFlags
261 /// - 1-byte: 1B size field
262 /// - {0,2}-bytes: 2B size field
263 struct Header {
264 using PackTy = uint32_t;
265 PackTy Packed;
266
267 static constexpr unsigned LayoutFlagsShift =
268 (sizeof(PackTy) - 1) * CHAR_BIT;
269 };
270
271 struct Input {
272 InternalRefArrayRef Refs;
273 ArrayRef<char> Data;
274 };
275
276 LayoutFlags getLayoutFlags() const {
277 return LayoutFlags::unpack(Storage: H->Packed >> Header::LayoutFlagsShift);
278 }
279
280 uint64_t getDataSize() const;
281 void skipDataSize(LayoutFlags LF, int64_t &RelOffset) const;
282 uint32_t getNumRefs() const;
283 void skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const;
284 int64_t getRefsRelOffset() const;
285 int64_t getDataRelOffset() const;
286
287 static uint64_t getTotalSize(uint64_t DataRelOffset, uint64_t DataSize) {
288 return DataRelOffset + DataSize + 1;
289 }
290 uint64_t getTotalSize() const {
291 return getDataRelOffset() + getDataSize() + 1;
292 }
293
294 /// Describe the layout of data stored and how to decode from
295 /// DataRecordHandle.
296 struct Layout {
297 explicit Layout(const Input &I);
298
299 LayoutFlags Flags;
300 uint64_t DataSize = 0;
301 uint32_t NumRefs = 0;
302 int64_t RefsRelOffset = 0;
303 int64_t DataRelOffset = 0;
304 uint64_t getTotalSize() const {
305 return DataRecordHandle::getTotalSize(DataRelOffset, DataSize);
306 }
307 };
308
309 InternalRefArrayRef getRefs() const {
310 assert(H && "Expected valid handle");
311 auto *BeginByte = reinterpret_cast<const char *>(H) + getRefsRelOffset();
312 size_t Size = getNumRefs();
313 if (!Size)
314 return InternalRefArrayRef();
315 if (getLayoutFlags().RefKind == RefKindFlags::InternalRef4B)
316 return ArrayRef(reinterpret_cast<const InternalRef4B *>(BeginByte), Size);
317 return ArrayRef(reinterpret_cast<const InternalRef *>(BeginByte), Size);
318 }
319
320 ArrayRef<char> getData() const {
321 assert(H && "Expected valid handle");
322 return ArrayRef(reinterpret_cast<const char *>(H) + getDataRelOffset(),
323 getDataSize());
324 }
325
326 static DataRecordHandle create(function_ref<char *(size_t Size)> Alloc,
327 const Input &I);
328 static Expected<DataRecordHandle>
329 createWithError(function_ref<Expected<char *>(size_t Size)> Alloc,
330 const Input &I);
331 static DataRecordHandle construct(char *Mem, const Input &I);
332
333 static DataRecordHandle get(const char *Mem) {
334 return DataRecordHandle(
335 *reinterpret_cast<const DataRecordHandle::Header *>(Mem));
336 }
337 static Expected<DataRecordHandle>
338 getFromDataPool(const OnDiskDataAllocator &Pool, FileOffset Offset);
339
340 explicit operator bool() const { return H; }
341 const Header &getHeader() const { return *H; }
342
343 DataRecordHandle() = default;
344 explicit DataRecordHandle(const Header &H) : H(&H) {}
345
346private:
347 static DataRecordHandle constructImpl(char *Mem, const Input &I,
348 const Layout &L);
349 const Header *H = nullptr;
350};
351
/// Proxy for any on-disk object or raw data.
///
/// NOTE(review): looks like a record-or-raw-bytes union — \c Record for
/// structured records, \c Bytes for raw leaf content; confirm against the
/// getContent() implementations outside this chunk.
struct OnDiskContent {
  std::optional<DataRecordHandle> Record;
  std::optional<ArrayRef<char>> Bytes;
};
357
/// Data loaded inside the memory from standalone file.
///
/// Owns the memory mapping of a standalone object file and remembers which
/// standalone StorageKind it was stored as.
class StandaloneDataInMemory {
public:
  OnDiskContent getContent() const;

  /// \param Region the live mapping of the standalone file; ownership is
  ///        taken.
  /// \param SK must be one of the Standalone* storage kinds (checked in
  ///        debug builds below).
  StandaloneDataInMemory(std::unique_ptr<sys::fs::mapped_file_region> Region,
                         TrieRecord::StorageKind SK)
      : Region(std::move(Region)), SK(SK) {
#ifndef NDEBUG
    // Debug-only validation that SK is a standalone kind; DataPool/Unknown
    // objects must never be wrapped in this class.
    bool IsStandalone = false;
    switch (SK) {
    case TrieRecord::StorageKind::Standalone:
    case TrieRecord::StorageKind::StandaloneLeaf:
    case TrieRecord::StorageKind::StandaloneLeaf0:
      IsStandalone = true;
      break;
    default:
      break;
    }
    assert(IsStandalone);
#endif
  }

private:
  std::unique_ptr<sys::fs::mapped_file_region> Region;
  TrieRecord::StorageKind SK;
};
385
386/// Container to lookup loaded standalone objects.
387template <size_t NumShards> class StandaloneDataMap {
388 static_assert(isPowerOf2_64(Value: NumShards), "Expected power of 2");
389
390public:
391 uintptr_t insert(ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
392 std::unique_ptr<sys::fs::mapped_file_region> Region);
393
394 const StandaloneDataInMemory *lookup(ArrayRef<uint8_t> Hash) const;
395 bool count(ArrayRef<uint8_t> Hash) const { return bool(lookup(Hash)); }
396
397private:
398 struct Shard {
399 /// Needs to store a std::unique_ptr for a stable address identity.
400 DenseMap<const uint8_t *, std::unique_ptr<StandaloneDataInMemory>> Map;
401 mutable std::mutex Mutex;
402 };
403 Shard &getShard(ArrayRef<uint8_t> Hash) {
404 return const_cast<Shard &>(
405 const_cast<const StandaloneDataMap *>(this)->getShard(Hash));
406 }
407 const Shard &getShard(ArrayRef<uint8_t> Hash) const {
408 static_assert(NumShards <= 256, "Expected only 8 bits of shard");
409 return Shards[Hash[0] % NumShards];
410 }
411
412 Shard Shards[NumShards];
413};
414
415using StandaloneDataMapTy = StandaloneDataMap<16>;
416
417/// A vector of internal node references.
418class InternalRefVector {
419public:
420 void push_back(InternalRef Ref) {
421 if (NeedsFull)
422 return FullRefs.push_back(Elt: Ref);
423 if (std::optional<InternalRef4B> Small = InternalRef4B::tryToShrink(Ref))
424 return SmallRefs.push_back(Elt: *Small);
425 NeedsFull = true;
426 assert(FullRefs.empty());
427 FullRefs.reserve(N: SmallRefs.size() + 1);
428 for (InternalRef4B Small : SmallRefs)
429 FullRefs.push_back(Elt: Small);
430 FullRefs.push_back(Elt: Ref);
431 SmallRefs.clear();
432 }
433
434 operator InternalRefArrayRef() const {
435 assert(SmallRefs.empty() || FullRefs.empty());
436 return NeedsFull ? InternalRefArrayRef(FullRefs)
437 : InternalRefArrayRef(SmallRefs);
438 }
439
440private:
441 bool NeedsFull = false;
442 SmallVector<InternalRef4B> SmallRefs;
443 SmallVector<InternalRef> FullRefs;
444};
445
446} // namespace
447
448Expected<DataRecordHandle> DataRecordHandle::createWithError(
449 function_ref<Expected<char *>(size_t Size)> Alloc, const Input &I) {
450 Layout L(I);
451 if (Expected<char *> Mem = Alloc(L.getTotalSize()))
452 return constructImpl(Mem: *Mem, I, L);
453 else
454 return Mem.takeError();
455}
456
457ObjectHandle ObjectHandle::fromFileOffset(FileOffset Offset) {
458 // Store the file offset as it is.
459 assert(!(Offset.get() & 0x1));
460 return ObjectHandle(Offset.get());
461}
462
463ObjectHandle ObjectHandle::fromMemory(uintptr_t Ptr) {
464 // Store the pointer from memory with lowest bit set.
465 assert(!(Ptr & 0x1));
466 return ObjectHandle(Ptr | 1);
467}
468
/// Proxy for an on-disk index record.
struct OnDiskGraphDB::IndexProxy {
  // Offset of this record inside the index table.
  FileOffset Offset;
  // Full hash stored at this index entry.
  ArrayRef<uint8_t> Hash;
  // The mutable 8-byte trie record (storage kind + data offset).
  TrieRecord &Ref;
};
475
476template <size_t N>
477uintptr_t StandaloneDataMap<N>::insert(
478 ArrayRef<uint8_t> Hash, TrieRecord::StorageKind SK,
479 std::unique_ptr<sys::fs::mapped_file_region> Region) {
480 auto &S = getShard(Hash);
481 std::lock_guard<std::mutex> Lock(S.Mutex);
482 auto &V = S.Map[Hash.data()];
483 if (!V)
484 V = std::make_unique<StandaloneDataInMemory>(args: std::move(Region), args&: SK);
485 return reinterpret_cast<uintptr_t>(V.get());
486}
487
488template <size_t N>
489const StandaloneDataInMemory *
490StandaloneDataMap<N>::lookup(ArrayRef<uint8_t> Hash) const {
491 auto &S = getShard(Hash);
492 std::lock_guard<std::mutex> Lock(S.Mutex);
493 auto I = S.Map.find(Hash.data());
494 if (I == S.Map.end())
495 return nullptr;
496 return &*I->second;
497}
498
499namespace {
500
501/// Copy of \a sys::fs::TempFile that skips RemoveOnSignal, which is too
502/// expensive to register/unregister at this rate.
503///
504/// FIXME: Add a TempFileManager that maintains a thread-safe list of open temp
505/// files and has a signal handler registerd that removes them all.
506class TempFile {
507 bool Done = false;
508 TempFile(StringRef Name, int FD, OnDiskCASLogger *Logger)
509 : TmpName(std::string(Name)), FD(FD), Logger(Logger) {}
510
511public:
512 /// This creates a temporary file with createUniqueFile.
513 static Expected<TempFile> create(const Twine &Model, OnDiskCASLogger *Logger);
514 TempFile(TempFile &&Other) { *this = std::move(Other); }
515 TempFile &operator=(TempFile &&Other) {
516 TmpName = std::move(Other.TmpName);
517 FD = Other.FD;
518 Logger = Other.Logger;
519 Other.Done = true;
520 Other.FD = -1;
521 return *this;
522 }
523
524 // Name of the temporary file.
525 std::string TmpName;
526
527 // The open file descriptor.
528 int FD = -1;
529
530 OnDiskCASLogger *Logger = nullptr;
531
532 // Keep this with the given name.
533 Error keep(const Twine &Name);
534 Error discard();
535
536 // This checks that keep or delete was called.
537 ~TempFile() { consumeError(Err: discard()); }
538};
539
540class MappedTempFile {
541public:
542 char *data() const { return Map.data(); }
543 size_t size() const { return Map.size(); }
544
545 Error discard() {
546 assert(Map && "Map already destroyed");
547 Map.unmap();
548 return Temp.discard();
549 }
550
551 Error keep(const Twine &Name) {
552 assert(Map && "Map already destroyed");
553 Map.unmap();
554 return Temp.keep(Name);
555 }
556
557 MappedTempFile(TempFile Temp, sys::fs::mapped_file_region Map)
558 : Temp(std::move(Temp)), Map(std::move(Map)) {}
559
560private:
561 TempFile Temp;
562 sys::fs::mapped_file_region Map;
563};
564} // namespace
565
566Error TempFile::discard() {
567 Done = true;
568 if (FD != -1) {
569 sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
570 if (std::error_code EC = sys::fs::closeFile(F&: File))
571 return errorCodeToError(EC);
572 }
573 FD = -1;
574
575 // Always try to close and remove.
576 std::error_code RemoveEC;
577 if (!TmpName.empty()) {
578 std::error_code EC = sys::fs::remove(path: TmpName);
579 if (Logger)
580 Logger->logTempFileRemove(TmpName, EC);
581 if (EC)
582 return errorCodeToError(EC);
583 }
584 TmpName = "";
585
586 return Error::success();
587}
588
589Error TempFile::keep(const Twine &Name) {
590 assert(!Done);
591 Done = true;
592 // Always try to close and rename.
593 std::error_code RenameEC = sys::fs::rename(from: TmpName, to: Name);
594
595 if (Logger)
596 Logger->logTempFileKeep(TmpName, Name: Name.str(), EC: RenameEC);
597
598 if (!RenameEC)
599 TmpName = "";
600
601 sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
602 if (std::error_code EC = sys::fs::closeFile(F&: File))
603 return errorCodeToError(EC);
604 FD = -1;
605
606 return errorCodeToError(EC: RenameEC);
607}
608
609Expected<TempFile> TempFile::create(const Twine &Model,
610 OnDiskCASLogger *Logger) {
611 int FD;
612 SmallString<128> ResultPath;
613 if (std::error_code EC = sys::fs::createUniqueFile(Model, ResultFD&: FD, ResultPath))
614 return errorCodeToError(EC);
615
616 if (Logger)
617 Logger->logTempFileCreate(Name: ResultPath);
618
619 TempFile Ret(ResultPath, FD, Logger);
620 return std::move(Ret);
621}
622
623bool TrieRecord::compare_exchange_strong(Data &Existing, Data New) {
624 uint64_t ExistingPacked = pack(D: Existing);
625 uint64_t NewPacked = pack(D: New);
626 if (Storage.compare_exchange_strong(i1&: ExistingPacked, i2: NewPacked))
627 return true;
628 Existing = unpack(Packed: ExistingPacked);
629 return false;
630}
631
632Expected<DataRecordHandle>
633DataRecordHandle::getFromDataPool(const OnDiskDataAllocator &Pool,
634 FileOffset Offset) {
635 auto HeaderData = Pool.get(Offset, Size: sizeof(DataRecordHandle::Header));
636 if (!HeaderData)
637 return HeaderData.takeError();
638
639 auto Record = DataRecordHandle::get(Mem: HeaderData->data());
640 if (Record.getTotalSize() + Offset.get() > Pool.size())
641 return createStringError(
642 EC: make_error_code(e: std::errc::illegal_byte_sequence),
643 S: "data record span passed the end of the data pool");
644
645 return Record;
646}
647
648DataRecordHandle DataRecordHandle::constructImpl(char *Mem, const Input &I,
649 const Layout &L) {
650 char *Next = Mem + sizeof(Header);
651
652 // Fill in Packed and set other data, then come back to construct the header.
653 Header::PackTy Packed = 0;
654 Packed |= LayoutFlags::pack(LF: L.Flags) << Header::LayoutFlagsShift;
655
656 // Construct DataSize.
657 switch (L.Flags.DataSize) {
658 case DataSizeFlags::Uses1B:
659 assert(I.Data.size() <= UINT8_MAX);
660 Packed |= (Header::PackTy)I.Data.size()
661 << ((sizeof(Packed) - 2) * CHAR_BIT);
662 break;
663 case DataSizeFlags::Uses2B:
664 assert(I.Data.size() <= UINT16_MAX);
665 Packed |= (Header::PackTy)I.Data.size()
666 << ((sizeof(Packed) - 4) * CHAR_BIT);
667 break;
668 case DataSizeFlags::Uses4B:
669 support::endian::write32le(P: Next, V: I.Data.size());
670 Next += 4;
671 break;
672 case DataSizeFlags::Uses8B:
673 support::endian::write64le(P: Next, V: I.Data.size());
674 Next += 8;
675 break;
676 }
677
678 // Construct NumRefs.
679 //
680 // NOTE: May be writing NumRefs even if there are zero refs in order to fix
681 // alignment.
682 switch (L.Flags.NumRefs) {
683 case NumRefsFlags::Uses0B:
684 break;
685 case NumRefsFlags::Uses1B:
686 assert(I.Refs.size() <= UINT8_MAX);
687 Packed |= (Header::PackTy)I.Refs.size()
688 << ((sizeof(Packed) - 2) * CHAR_BIT);
689 break;
690 case NumRefsFlags::Uses2B:
691 assert(I.Refs.size() <= UINT16_MAX);
692 Packed |= (Header::PackTy)I.Refs.size()
693 << ((sizeof(Packed) - 4) * CHAR_BIT);
694 break;
695 case NumRefsFlags::Uses4B:
696 support::endian::write32le(P: Next, V: I.Refs.size());
697 Next += 4;
698 break;
699 case NumRefsFlags::Uses8B:
700 support::endian::write64le(P: Next, V: I.Refs.size());
701 Next += 8;
702 break;
703 }
704
705 // Construct Refs[].
706 if (!I.Refs.empty()) {
707 assert((L.Flags.RefKind == RefKindFlags::InternalRef4B) == I.Refs.is4B());
708 ArrayRef<uint8_t> RefsBuffer = I.Refs.getBuffer();
709 llvm::copy(Range&: RefsBuffer, Out: Next);
710 Next += RefsBuffer.size();
711 }
712
713 // Construct Data and the trailing null.
714 assert(isAddrAligned(Align(8), Next));
715 llvm::copy(Range: I.Data, Out: Next);
716 Next[I.Data.size()] = 0;
717
718 // Construct the header itself and return.
719 Header *H = new (Mem) Header{.Packed: Packed};
720 DataRecordHandle Record(*H);
721 assert(Record.getData() == I.Data);
722 assert(Record.getNumRefs() == I.Refs.size());
723 assert(Record.getRefs() == I.Refs);
724 assert(Record.getLayoutFlags().DataSize == L.Flags.DataSize);
725 assert(Record.getLayoutFlags().NumRefs == L.Flags.NumRefs);
726 assert(Record.getLayoutFlags().RefKind == L.Flags.RefKind);
727 return Record;
728}
729
730DataRecordHandle::Layout::Layout(const Input &I) {
731 // Start initial relative offsets right after the Header.
732 uint64_t RelOffset = sizeof(Header);
733
734 // Initialize the easy stuff.
735 DataSize = I.Data.size();
736 NumRefs = I.Refs.size();
737
738 // Check refs size.
739 Flags.RefKind =
740 I.Refs.is4B() ? RefKindFlags::InternalRef4B : RefKindFlags::InternalRef;
741
742 // Find the smallest slot available for DataSize.
743 bool Has1B = true;
744 bool Has2B = true;
745 if (DataSize <= UINT8_MAX && Has1B) {
746 Flags.DataSize = DataSizeFlags::Uses1B;
747 Has1B = false;
748 } else if (DataSize <= UINT16_MAX && Has2B) {
749 Flags.DataSize = DataSizeFlags::Uses2B;
750 Has2B = false;
751 } else if (DataSize <= UINT32_MAX) {
752 Flags.DataSize = DataSizeFlags::Uses4B;
753 RelOffset += 4;
754 } else {
755 Flags.DataSize = DataSizeFlags::Uses8B;
756 RelOffset += 8;
757 }
758
759 // Find the smallest slot available for NumRefs. Never sets NumRefs8B here.
760 if (!NumRefs) {
761 Flags.NumRefs = NumRefsFlags::Uses0B;
762 } else if (NumRefs <= UINT8_MAX && Has1B) {
763 Flags.NumRefs = NumRefsFlags::Uses1B;
764 Has1B = false;
765 } else if (NumRefs <= UINT16_MAX && Has2B) {
766 Flags.NumRefs = NumRefsFlags::Uses2B;
767 Has2B = false;
768 } else {
769 Flags.NumRefs = NumRefsFlags::Uses4B;
770 RelOffset += 4;
771 }
772
773 // Helper to "upgrade" either DataSize or NumRefs by 4B to avoid complicated
774 // padding rules when reading and writing. This also bumps RelOffset.
775 //
776 // The value for NumRefs is strictly limited to UINT32_MAX, but it can be
777 // stored as 8B. This means we can *always* find a size to grow.
778 //
779 // NOTE: Only call this once.
780 auto GrowSizeFieldsBy4B = [&]() {
781 assert(isAligned(Align(4), RelOffset));
782 RelOffset += 4;
783
784 assert(Flags.NumRefs != NumRefsFlags::Uses8B &&
785 "Expected to be able to grow NumRefs8B");
786
787 // First try to grow DataSize. NumRefs will not (yet) be 8B, and if
788 // DataSize is upgraded to 8B it'll already be aligned.
789 //
790 // Failing that, grow NumRefs.
791 if (Flags.DataSize < DataSizeFlags::Uses4B)
792 Flags.DataSize = DataSizeFlags::Uses4B; // DataSize: Packed => 4B.
793 else if (Flags.DataSize < DataSizeFlags::Uses8B)
794 Flags.DataSize = DataSizeFlags::Uses8B; // DataSize: 4B => 8B.
795 else if (Flags.NumRefs < NumRefsFlags::Uses4B)
796 Flags.NumRefs = NumRefsFlags::Uses4B; // NumRefs: Packed => 4B.
797 else
798 Flags.NumRefs = NumRefsFlags::Uses8B; // NumRefs: 4B => 8B.
799 };
800
801 assert(isAligned(Align(4), RelOffset));
802 if (Flags.RefKind == RefKindFlags::InternalRef) {
803 // List of 8B refs should be 8B-aligned. Grow one of the sizes to get this
804 // without padding.
805 if (!isAligned(Lhs: Align(8), SizeInBytes: RelOffset))
806 GrowSizeFieldsBy4B();
807
808 assert(isAligned(Align(8), RelOffset));
809 RefsRelOffset = RelOffset;
810 RelOffset += 8 * NumRefs;
811 } else {
812 // The array of 4B refs doesn't need 8B alignment, but the data will need
813 // to be 8B-aligned. Detect this now, and, if necessary, shift everything
814 // by 4B by growing one of the sizes.
815 // If we remove the need for 8B-alignment for data there is <1% savings in
816 // disk storage for a clang build using MCCAS but the 8B-alignment may be
817 // useful in the future so keep it for now.
818 uint64_t RefListSize = 4 * NumRefs;
819 if (!isAligned(Lhs: Align(8), SizeInBytes: RelOffset + RefListSize))
820 GrowSizeFieldsBy4B();
821 RefsRelOffset = RelOffset;
822 RelOffset += RefListSize;
823 }
824
825 assert(isAligned(Align(8), RelOffset));
826 DataRelOffset = RelOffset;
827}
828
829uint64_t DataRecordHandle::getDataSize() const {
830 int64_t RelOffset = sizeof(Header);
831 auto *DataSizePtr = reinterpret_cast<const char *>(H) + RelOffset;
832 switch (getLayoutFlags().DataSize) {
833 case DataSizeFlags::Uses1B:
834 return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
835 case DataSizeFlags::Uses2B:
836 return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
837 UINT16_MAX;
838 case DataSizeFlags::Uses4B:
839 return support::endian::read32le(P: DataSizePtr);
840 case DataSizeFlags::Uses8B:
841 return support::endian::read64le(P: DataSizePtr);
842 }
843 llvm_unreachable("Unknown DataSizeFlags enum");
844}
845
846void DataRecordHandle::skipDataSize(LayoutFlags LF, int64_t &RelOffset) const {
847 if (LF.DataSize >= DataSizeFlags::Uses4B)
848 RelOffset += 4;
849 if (LF.DataSize >= DataSizeFlags::Uses8B)
850 RelOffset += 4;
851}
852
853uint32_t DataRecordHandle::getNumRefs() const {
854 LayoutFlags LF = getLayoutFlags();
855 int64_t RelOffset = sizeof(Header);
856 skipDataSize(LF, RelOffset);
857 auto *NumRefsPtr = reinterpret_cast<const char *>(H) + RelOffset;
858 switch (LF.NumRefs) {
859 case NumRefsFlags::Uses0B:
860 return 0;
861 case NumRefsFlags::Uses1B:
862 return (H->Packed >> ((sizeof(Header::PackTy) - 2) * CHAR_BIT)) & UINT8_MAX;
863 case NumRefsFlags::Uses2B:
864 return (H->Packed >> ((sizeof(Header::PackTy) - 4) * CHAR_BIT)) &
865 UINT16_MAX;
866 case NumRefsFlags::Uses4B:
867 return support::endian::read32le(P: NumRefsPtr);
868 case NumRefsFlags::Uses8B:
869 return support::endian::read64le(P: NumRefsPtr);
870 }
871 llvm_unreachable("Unknown NumRefsFlags enum");
872}
873
874void DataRecordHandle::skipNumRefs(LayoutFlags LF, int64_t &RelOffset) const {
875 if (LF.NumRefs >= NumRefsFlags::Uses4B)
876 RelOffset += 4;
877 if (LF.NumRefs >= NumRefsFlags::Uses8B)
878 RelOffset += 4;
879}
880
881int64_t DataRecordHandle::getRefsRelOffset() const {
882 LayoutFlags LF = getLayoutFlags();
883 int64_t RelOffset = sizeof(Header);
884 skipDataSize(LF, RelOffset);
885 skipNumRefs(LF, RelOffset);
886 return RelOffset;
887}
888
889int64_t DataRecordHandle::getDataRelOffset() const {
890 LayoutFlags LF = getLayoutFlags();
891 int64_t RelOffset = sizeof(Header);
892 skipDataSize(LF, RelOffset);
893 skipNumRefs(LF, RelOffset);
894 uint32_t RefSize = LF.RefKind == RefKindFlags::InternalRef4B ? 4 : 8;
895 RelOffset += RefSize * getNumRefs();
896 return RelOffset;
897}
898
899Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const {
900 if (UpstreamDB) {
901 if (auto E = UpstreamDB->validate(Deep, Hasher))
902 return E;
903 }
904 return Index.validate(RecordVerifier: [&](FileOffset Offset,
905 OnDiskTrieRawHashMap::ConstValueProxy Record)
906 -> Error {
907 auto formatError = [&](Twine Msg) {
908 return createStringError(
909 EC: llvm::errc::illegal_byte_sequence,
910 S: "bad record at 0x" +
911 utohexstr(X: (unsigned)Offset.get(), /*LowerCase=*/true) + ": " +
912 Msg.str());
913 };
914
915 if (Record.Data.size() != sizeof(TrieRecord))
916 return formatError("wrong data record size");
917 if (!isAligned(Lhs: Align::Of<TrieRecord>(), SizeInBytes: Record.Data.size()))
918 return formatError("wrong data record alignment");
919
920 auto *R = reinterpret_cast<const TrieRecord *>(Record.Data.data());
921 TrieRecord::Data D = R->load();
922 std::unique_ptr<MemoryBuffer> FileBuffer;
923 if ((uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::Unknown &&
924 (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::DataPool &&
925 (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::Standalone &&
926 (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::StandaloneLeaf &&
927 (uint8_t)D.SK != (uint8_t)TrieRecord::StorageKind::StandaloneLeaf0)
928 return formatError("invalid record kind value");
929
930 auto Ref = InternalRef::getFromOffset(Offset);
931 auto I = getIndexProxyFromRef(Ref);
932 if (!I)
933 return I.takeError();
934
935 switch (D.SK) {
936 case TrieRecord::StorageKind::Unknown:
937 // This could be an abandoned entry due to a termination before updating
938 // the record. It can be reused by later insertion so just skip this entry
939 // for now.
940 return Error::success();
941 case TrieRecord::StorageKind::DataPool:
942 // Check offset is a postive value, and large enough to hold the
943 // header for the data record.
944 if (D.Offset.get() <= 0 ||
945 D.Offset.get() + sizeof(DataRecordHandle::Header) >= DataPool.size())
946 return formatError("datapool record out of bound");
947 break;
948 case TrieRecord::StorageKind::Standalone:
949 case TrieRecord::StorageKind::StandaloneLeaf:
950 case TrieRecord::StorageKind::StandaloneLeaf0:
951 SmallString<256> Path;
952 getStandalonePath(FileSuffix: TrieRecord::getStandaloneFilePrefix(SK: D.SK), I: *I, Path);
953 // If need to validate the content of the file later, just load the
954 // buffer here. Otherwise, just check the existance of the file.
955 if (Deep) {
956 auto File = MemoryBuffer::getFile(Filename: Path, /*IsText=*/false,
957 /*RequiresNullTerminator=*/false);
958 if (!File || !*File)
959 return formatError("record file \'" + Path + "\' does not exist");
960
961 FileBuffer = std::move(*File);
962 } else if (!llvm::sys::fs::exists(Path))
963 return formatError("record file \'" + Path + "\' does not exist");
964 }
965
966 if (!Deep)
967 return Error::success();
968
969 auto dataError = [&](Twine Msg) {
970 return createStringError(EC: llvm::errc::illegal_byte_sequence,
971 S: "bad data for digest \'" + toHex(Input: I->Hash) +
972 "\': " + Msg.str());
973 };
974 SmallVector<ArrayRef<uint8_t>> Refs;
975 ArrayRef<char> StoredData;
976
977 switch (D.SK) {
978 case TrieRecord::StorageKind::Unknown:
979 llvm_unreachable("already handled");
980 case TrieRecord::StorageKind::DataPool: {
981 auto DataRecord = DataRecordHandle::getFromDataPool(Pool: DataPool, Offset: D.Offset);
982 if (!DataRecord)
983 return dataError(toString(E: DataRecord.takeError()));
984
985 for (auto InternRef : DataRecord->getRefs()) {
986 auto Index = getIndexProxyFromRef(Ref: InternRef);
987 if (!Index)
988 return Index.takeError();
989 Refs.push_back(Elt: Index->Hash);
990 }
991 StoredData = DataRecord->getData();
992 break;
993 }
994 case TrieRecord::StorageKind::Standalone: {
995 if (FileBuffer->getBufferSize() < sizeof(DataRecordHandle::Header))
996 return dataError("data record is not big enough to read the header");
997 auto DataRecord = DataRecordHandle::get(Mem: FileBuffer->getBufferStart());
998 if (DataRecord.getTotalSize() < FileBuffer->getBufferSize())
999 return dataError(
1000 "data record span passed the end of the standalone file");
1001 for (auto InternRef : DataRecord.getRefs()) {
1002 auto Index = getIndexProxyFromRef(Ref: InternRef);
1003 if (!Index)
1004 return Index.takeError();
1005 Refs.push_back(Elt: Index->Hash);
1006 }
1007 StoredData = DataRecord.getData();
1008 break;
1009 }
1010 case TrieRecord::StorageKind::StandaloneLeaf:
1011 case TrieRecord::StorageKind::StandaloneLeaf0: {
1012 StoredData = arrayRefFromStringRef<char>(Input: FileBuffer->getBuffer());
1013 if (D.SK == TrieRecord::StorageKind::StandaloneLeaf0) {
1014 if (!FileBuffer->getBuffer().ends_with(Suffix: '\0'))
1015 return dataError("standalone file is not zero terminated");
1016 StoredData = StoredData.drop_back(N: 1);
1017 }
1018 break;
1019 }
1020 }
1021
1022 SmallVector<uint8_t> ComputedHash;
1023 Hasher(Refs, StoredData, ComputedHash);
1024 if (I->Hash != ArrayRef(ComputedHash))
1025 return dataError("hash mismatch, got \'" + toHex(Input: ComputedHash) +
1026 "\' instead");
1027
1028 return Error::success();
1029 });
1030}
1031
/// Validate that \p ExternalRef is well formed: non-zero, backed by a real
/// index entry, and consistent with what a fresh lookup of its digest would
/// produce.
Error OnDiskGraphDB::validateObjectID(ObjectID ExternalRef) const {
  // Prefix every failure with the offending ref for diagnosability.
  auto formatError = [&](Twine Msg) {
    return createStringError(
        EC: llvm::errc::illegal_byte_sequence,
        S: "bad ref=0x" +
            utohexstr(X: ExternalRef.getOpaqueData(), /*LowerCase=*/true) + ": " +
            Msg.str());
  };

  // Zero is reserved and never a valid reference.
  if (ExternalRef.getOpaqueData() == 0)
    return formatError("zero is not a valid ref");

  // The ref must decode to a valid index entry.
  InternalRef InternalRef = getInternalRef(Ref: ExternalRef);
  auto I = getIndexProxyFromRef(Ref: InternalRef);
  if (!I)
    return formatError(llvm::toString(E: I.takeError()));
  auto Hash = getDigest(I: *I);

  // Round-trip check: looking up the entry's stored hash must find the same
  // entry and reproduce the same external reference.
  OnDiskTrieRawHashMap::ConstOnDiskPtr P = Index.find(Hash);
  if (!P)
    return formatError("not found using hash " + toHex(Input: Hash));
  IndexProxy OtherI = getIndexProxyFromPointer(P);
  ObjectID OtherRef = getExternalReference(Ref: makeInternalRef(IndexOffset: OtherI.Offset));
  if (OtherRef != ExternalRef)
    return formatError("ref does not match indexed offset " +
                       utohexstr(X: OtherRef.getOpaqueData(), /*LowerCase=*/true) +
                       " for hash " + toHex(Input: Hash));
  return Error::success();
}
1061
/// Dump a human-readable summary of the database: the root path, every index
/// entry (storage kind and offset), and the layout of each datapool record,
/// sorted by offset.
void OnDiskGraphDB::print(raw_ostream &OS) const {
  OS << "on-disk-root-path: " << RootPath << "\n";

  // Collect datapool offsets while walking the index so the "pool:" section
  // can be printed afterwards in offset order.
  struct PoolInfo {
    uint64_t Offset;
  };
  SmallVector<PoolInfo> Pool;

  OS << "\n";
  OS << "index:\n";
  Index.print(OS, PrintRecordData: [&](ArrayRef<char> Data) {
    assert(Data.size() == sizeof(TrieRecord));
    assert(isAligned(Align::Of<TrieRecord>(), Data.size()));
    auto *R = reinterpret_cast<const TrieRecord *>(Data.data());
    TrieRecord::Data D = R->load();
    OS << " SK=";
    switch (D.SK) {
    case TrieRecord::StorageKind::Unknown:
      OS << "unknown ";
      break;
    case TrieRecord::StorageKind::DataPool:
      OS << "datapool ";
      Pool.push_back(Elt: {.Offset: D.Offset.get()});
      break;
    case TrieRecord::StorageKind::Standalone:
      OS << "standalone-data ";
      break;
    case TrieRecord::StorageKind::StandaloneLeaf:
      OS << "standalone-leaf ";
      break;
    case TrieRecord::StorageKind::StandaloneLeaf0:
      OS << "standalone-leaf+0";
      break;
    }
    OS << " Offset=" << (void *)D.Offset.get();
  });
  if (Pool.empty())
    return;

  OS << "\n";
  OS << "pool:\n";
  llvm::sort(
      C&: Pool, Comp: [](PoolInfo LHS, PoolInfo RHS) { return LHS.Offset < RHS.Offset; });
  for (PoolInfo PI : Pool) {
    OS << "- addr=" << (void *)PI.Offset << " ";
    auto D = DataRecordHandle::getFromDataPool(Pool: DataPool, Offset: FileOffset(PI.Offset));
    if (!D) {
      // Stop at the first unreadable record; later offsets are suspect too.
      OS << "error: " << toString(E: D.takeError());
      return;
    }

    OS << "record refs=" << D->getNumRefs() << " data=" << D->getDataSize()
       << " size=" << D->getTotalSize()
       << " end=" << (void *)(PI.Offset + D->getTotalSize()) << "\n";
  }
}
1118
/// Look up \p Hash in the index, lazily creating a default TrieRecord
/// (StorageKind::Unknown) for it if the hash has not been seen before.
Expected<OnDiskGraphDB::IndexProxy>
OnDiskGraphDB::indexHash(ArrayRef<uint8_t> Hash) {
  auto P = Index.insertLazy(
      Hash, OnConstruct: [](FileOffset TentativeOffset,
                OnDiskTrieRawHashMap::ValueProxy TentativeValue) {
        assert(TentativeValue.Data.size() == sizeof(TrieRecord));
        assert(
            isAddrAligned(Align::Of<TrieRecord>(), TentativeValue.Data.data()));
        // Construct a fresh record in-place in the trie's storage slot.
        new (TentativeValue.Data.data()) TrieRecord();
      });
  if (LLVM_UNLIKELY(!P))
    return P.takeError();

  // insertLazy returns a valid pointer whether the entry was new or already
  // present.
  assert(*P && "Expected insertion");
  return getIndexProxyFromPointer(P: *P);
}
1135
1136OnDiskGraphDB::IndexProxy OnDiskGraphDB::getIndexProxyFromPointer(
1137 OnDiskTrieRawHashMap::ConstOnDiskPtr P) const {
1138 assert(P);
1139 assert(P.getOffset());
1140 return IndexProxy{.Offset: P.getOffset(), .Hash: P->Hash,
1141 .Ref: *const_cast<TrieRecord *>(
1142 reinterpret_cast<const TrieRecord *>(P->Data.data()))};
1143}
1144
1145Expected<ObjectID> OnDiskGraphDB::getReference(ArrayRef<uint8_t> Hash) {
1146 auto I = indexHash(Hash);
1147 if (LLVM_UNLIKELY(!I))
1148 return I.takeError();
1149 return getExternalReference(I: *I);
1150}
1151
1152ObjectID OnDiskGraphDB::getExternalReference(const IndexProxy &I) {
1153 return getExternalReference(Ref: makeInternalRef(IndexOffset: I.Offset));
1154}
1155
/// Return an existing reference for \p Digest without creating one for
/// completely unknown content.
///
/// Succeeds when the primary database has materialized storage for the
/// digest, or (when \p CheckUpstream is set) the upstream database knows it —
/// in which case the digest is indexed locally so a primary-side reference
/// can be handed out.
std::optional<ObjectID>
OnDiskGraphDB::getExistingReference(ArrayRef<uint8_t> Digest,
                                    bool CheckUpstream) {
  auto tryUpstream =
      [&](std::optional<IndexProxy> I) -> std::optional<ObjectID> {
    if (!CheckUpstream || !UpstreamDB)
      return std::nullopt;
    std::optional<ObjectID> UpstreamID =
        UpstreamDB->getExistingReference(Digest);
    if (LLVM_UNLIKELY(!UpstreamID))
      return std::nullopt;
    // Upstream knows the digest: ensure it is indexed locally so the
    // returned reference is valid in this database.
    auto Ref = expectedToOptional(E: indexHash(Hash: Digest));
    if (!Ref)
      return std::nullopt;
    if (!I)
      I.emplace(args&: *Ref);
    return getExternalReference(I: *I);
  };

  OnDiskTrieRawHashMap::ConstOnDiskPtr P = Index.find(Hash: Digest);
  if (!P)
    return tryUpstream(std::nullopt);
  IndexProxy I = getIndexProxyFromPointer(P);
  TrieRecord::Data Obj = I.Ref.load();
  // An indexed entry with Unknown storage has no local data yet; fall back
  // to the upstream check (reusing the already-found proxy).
  if (Obj.SK == TrieRecord::StorageKind::Unknown)
    return tryUpstream(I);
  return getExternalReference(Ref: makeInternalRef(IndexOffset: I.Offset));
}
1184
1185Expected<OnDiskGraphDB::IndexProxy>
1186OnDiskGraphDB::getIndexProxyFromRef(InternalRef Ref) const {
1187 auto P = Index.recoverFromFileOffset(Offset: Ref.getFileOffset());
1188 if (LLVM_UNLIKELY(!P))
1189 return P.takeError();
1190 return getIndexProxyFromPointer(P: *P);
1191}
1192
1193Expected<ArrayRef<uint8_t>> OnDiskGraphDB::getDigest(InternalRef Ref) const {
1194 auto I = getIndexProxyFromRef(Ref);
1195 if (!I)
1196 return I.takeError();
1197 return I->Hash;
1198}
1199
1200ArrayRef<uint8_t> OnDiskGraphDB::getDigest(const IndexProxy &I) const {
1201 return I.Hash;
1202}
1203
/// Decode an ObjectHandle into the stored content it refers to.
///
/// The low bit of the opaque value is a tag: when set, the remaining bits
/// are a pointer to a StandaloneDataInMemory; when clear, the value is a
/// file offset into the datapool.
static OnDiskContent getContentFromHandle(const OnDiskDataAllocator &DataPool,
                                          ObjectHandle OH) {
  // Decode ObjectHandle to locate the stored content.
  uint64_t Data = OH.getOpaqueData();
  if (Data & 1) {
    // Pointer-tagged standalone object: mask off the tag bit to recover the
    // pointer.
    const auto *SDIM =
        reinterpret_cast<const StandaloneDataInMemory *>(Data & (-1ULL << 1));
    return SDIM->getContent();
  }

  // Datapool-backed record; the offset came from a valid handle, so the
  // lookup is not expected to fail.
  auto DataHandle =
      cantFail(ValOrErr: DataRecordHandle::getFromDataPool(Pool: DataPool, Offset: FileOffset(Data)));
  assert(DataHandle.getData().end()[0] == 0 && "Null termination");
  return OnDiskContent{.Record: DataHandle, .Bytes: std::nullopt};
}
1219
1220ArrayRef<char> OnDiskGraphDB::getObjectData(ObjectHandle Node) const {
1221 OnDiskContent Content = getContentFromHandle(DataPool, OH: Node);
1222 if (Content.Bytes)
1223 return *Content.Bytes;
1224 assert(Content.Record && "Expected record or bytes");
1225 return Content.Record->getData();
1226}
1227
1228InternalRefArrayRef OnDiskGraphDB::getInternalRefs(ObjectHandle Node) const {
1229 if (std::optional<DataRecordHandle> Record =
1230 getContentFromHandle(DataPool, OH: Node).Record)
1231 return Record->getRefs();
1232 return std::nullopt;
1233}
1234
/// Load the object for \p ExternalRef, faulting it in from the upstream
/// database when it is not materialized locally.
///
/// Returns std::nullopt when the object does not exist anywhere.
Expected<std::optional<ObjectHandle>>
OnDiskGraphDB::load(ObjectID ExternalRef) {
  InternalRef Ref = getInternalRef(Ref: ExternalRef);
  auto I = getIndexProxyFromRef(Ref);
  if (!I)
    return I.takeError();
  TrieRecord::Data Object = I->Ref.load();

  // Not materialized locally: try the upstream chain.
  if (Object.SK == TrieRecord::StorageKind::Unknown)
    return faultInFromUpstream(PrimaryID: ExternalRef);

  // Pooled objects are addressed directly by their datapool offset.
  if (Object.SK == TrieRecord::StorageKind::DataPool)
    return ObjectHandle::fromFileOffset(Offset: Object.Offset);

  // Only TrieRecord::StorageKind::Standalone (and variants) need to be
  // explicitly loaded.
  //
  // There's corruption if standalone objects have offsets, or if we get here
  // for something that isn't standalone.
  if (Object.Offset)
    return createCorruptObjectError(ID: getDigest(I: *I));
  switch (Object.SK) {
  case TrieRecord::StorageKind::Unknown:
  case TrieRecord::StorageKind::DataPool:
    llvm_unreachable("unexpected storage kind");
  case TrieRecord::StorageKind::Standalone:
  case TrieRecord::StorageKind::StandaloneLeaf0:
  case TrieRecord::StorageKind::StandaloneLeaf:
    break;
  }

  // Load it from disk.
  //
  // Note: Creation logic guarantees that data that needs null-termination is
  // suitably 0-padded. Requiring null-termination here would be too expensive
  // for extremely large objects that happen to be page-aligned.
  SmallString<256> Path;
  getStandalonePath(FileSuffix: TrieRecord::getStandaloneFilePrefix(SK: Object.SK), I: *I, Path);

  auto BypassSandbox = sys::sandbox::scopedDisable();

  auto File = sys::fs::openNativeFileForRead(Name: Path);
  if (!File)
    return createFileError(F: Path, E: File.takeError());

  // The descriptor is only needed to establish the mapping; close it on all
  // paths out of this function.
  llvm::scope_exit CloseFile([&]() { sys::fs::closeFile(F&: *File); });

  sys::fs::file_status Status;
  if (std::error_code EC = sys::fs::status(FD: *File, Result&: Status))
    return createCorruptObjectError(ID: getDigest(I: *I));

  std::error_code EC;
  auto Region = std::make_unique<sys::fs::mapped_file_region>(
      args&: *File, args: sys::fs::mapped_file_region::readonly, args: Status.getSize(), args: 0, args&: EC);
  if (EC)
    return createCorruptObjectError(ID: getDigest(I: *I));

  // Register the mapping in the in-memory standalone map and return a
  // pointer-tagged handle to it.
  return ObjectHandle::fromMemory(
      Ptr: static_cast<StandaloneDataMapTy *>(StandaloneData)
          ->insert(Hash: I->Hash, SK: Object.SK, Region: std::move(Region)));
}
1296
1297Expected<bool> OnDiskGraphDB::isMaterialized(ObjectID Ref) {
1298 auto Presence = getObjectPresence(Ref, /*CheckUpstream=*/true);
1299 if (!Presence)
1300 return Presence.takeError();
1301
1302 switch (*Presence) {
1303 case ObjectPresence::Missing:
1304 return false;
1305 case ObjectPresence::InPrimaryDB:
1306 return true;
1307 case ObjectPresence::OnlyInUpstreamDB:
1308 if (auto FaultInResult = faultInFromUpstream(PrimaryID: Ref); !FaultInResult)
1309 return FaultInResult.takeError();
1310 return true;
1311 }
1312 llvm_unreachable("Unknown ObjectPresence enum");
1313}
1314
1315Expected<OnDiskGraphDB::ObjectPresence>
1316OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef,
1317 bool CheckUpstream) const {
1318 InternalRef Ref = getInternalRef(Ref: ExternalRef);
1319 auto I = getIndexProxyFromRef(Ref);
1320 if (!I)
1321 return I.takeError();
1322
1323 TrieRecord::Data Object = I->Ref.load();
1324 if (Object.SK != TrieRecord::StorageKind::Unknown)
1325 return ObjectPresence::InPrimaryDB;
1326
1327 if (!CheckUpstream || !UpstreamDB)
1328 return ObjectPresence::Missing;
1329
1330 std::optional<ObjectID> UpstreamID =
1331 UpstreamDB->getExistingReference(Digest: getDigest(I: *I));
1332 return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB
1333 : ObjectPresence::Missing;
1334}
1335
1336InternalRef OnDiskGraphDB::makeInternalRef(FileOffset IndexOffset) {
1337 return InternalRef::getFromOffset(Offset: IndexOffset);
1338}
1339
1340void OnDiskGraphDB::getStandalonePath(StringRef Prefix, const IndexProxy &I,
1341 SmallVectorImpl<char> &Path) const {
1342 Path.assign(in_start: RootPath.begin(), in_end: RootPath.end());
1343 sys::path::append(path&: Path,
1344 a: Prefix + Twine(I.Offset.get()) + "." + CASFormatVersion);
1345}
1346
/// Build the content view for a mapped standalone file.
///
/// Leaves expose the mapped bytes directly (dropping the trailing guard zero
/// for Leaf0); standalone objects are parsed as data records.
OnDiskContent StandaloneDataInMemory::getContent() const {
  bool Leaf0 = false;
  bool Leaf = false;
  switch (SK) {
  default:
    llvm_unreachable("Storage kind must be standalone");
  case TrieRecord::StorageKind::Standalone:
    break;
  case TrieRecord::StorageKind::StandaloneLeaf0:
    Leaf = Leaf0 = true;
    break;
  case TrieRecord::StorageKind::StandaloneLeaf:
    Leaf = true;
    break;
  }

  if (Leaf) {
    StringRef Data(Region->data(), Region->size());
    assert(Data.drop_back(Leaf0).end()[0] == 0 &&
           "Standalone node data missing null termination");
    // Leaf0 files carry an explicit trailing zero byte on disk; strip it
    // from the returned view.
    return OnDiskContent{.Record: std::nullopt,
                         .Bytes: arrayRefFromStringRef<char>(Input: Data.drop_back(N: Leaf0))};
  }

  DataRecordHandle Record = DataRecordHandle::get(Mem: Region->data());
  assert(Record.getData().end()[0] == 0 &&
         "Standalone object record missing null termination for data");
  return OnDiskContent{.Record: Record, .Bytes: std::nullopt};
}
1376
/// Create a temp file of \p Size bytes next to \p FinalPath and map it
/// read-write, ready to be filled and then renamed into place via keep().
static Expected<MappedTempFile>
createTempFile(StringRef FinalPath, uint64_t Size, OnDiskCASLogger *Logger) {
  auto BypassSandbox = sys::sandbox::scopedDisable();

  assert(Size && "Unexpected request for an empty temp file");
  // Unique name in the same directory so the final rename is atomic.
  Expected<TempFile> File = TempFile::create(Model: FinalPath + ".%%%%%%", Logger);
  if (!File)
    return File.takeError();

  // Reserve the blocks up front so writes through the mapping don't fail
  // mid-way on a full disk.
  if (Error E = preallocateFileTail(FD: File->FD, CurrentSize: 0, NewSize: Size).takeError())
    return createFileError(F: File->TmpName, E: std::move(E));

  if (auto EC = sys::fs::resize_file_before_mapping_readwrite(FD: File->FD, Size))
    return createFileError(F: File->TmpName, EC);

  std::error_code EC;
  sys::fs::mapped_file_region Map(sys::fs::convertFDToNativeFile(FD: File->FD),
                                  sys::fs::mapped_file_region::readwrite, Size,
                                  0, EC);
  if (EC)
    return createFileError(F: File->TmpName, EC);
  return MappedTempFile(std::move(*File), std::move(Map));
}
1400
1401static size_t getPageSize() {
1402 static int PageSize = sys::Process::getPageSizeEstimate();
1403 return PageSize;
1404}
1405
/// Write \p Data as a standalone leaf file for index entry \p I.
///
/// Page-aligned payloads are stored as StandaloneLeaf0 with an explicit
/// trailing zero byte so consumers can rely on null termination without the
/// file mapping providing a free zero page past the end.
Error OnDiskGraphDB::createStandaloneLeaf(IndexProxy &I, ArrayRef<char> Data) {
  assert(Data.size() > TrieRecord::MaxEmbeddedSize &&
         "Expected a bigger file for external content...");

  bool Leaf0 = isAligned(Lhs: Align(getPageSize()), SizeInBytes: Data.size());
  TrieRecord::StorageKind SK = Leaf0 ? TrieRecord::StorageKind::StandaloneLeaf0
                                     : TrieRecord::StorageKind::StandaloneLeaf;

  SmallString<256> Path;
  // Leaf0 adds one byte for the explicit null terminator.
  int64_t FileSize = Data.size() + Leaf0;
  getStandalonePath(Prefix: TrieRecord::getStandaloneFilePrefix(SK), I, Path);

  auto BypassSandbox = sys::sandbox::scopedDisable();

  // Write the file. Don't reuse this mapped_file_region, which is read/write.
  // Let load() pull up one that's read-only.
  Expected<MappedTempFile> File = createTempFile(FinalPath: Path, Size: FileSize, Logger: Logger.get());
  if (!File)
    return File.takeError();
  assert(File->size() == (uint64_t)FileSize);
  llvm::copy(Range&: Data, Out: File->data());
  if (Leaf0)
    File->data()[Data.size()] = 0;
  assert(File->data()[Data.size()] == 0);
  // Atomically rename the temp file to its final path.
  if (Error E = File->keep(Name: Path))
    return E;

  // Store the object reference.
  TrieRecord::Data Existing;
  {
    TrieRecord::Data Leaf{.SK: SK, .Offset: FileOffset()};
    if (I.Ref.compare_exchange_strong(Existing, New: Leaf)) {
      recordStandaloneSizeIncrease(SizeIncrease: FileSize);
      return Error::success();
    }
  }

  // If there was a race, confirm that the new value has valid storage.
  if (Existing.SK == TrieRecord::StorageKind::Unknown)
    return createCorruptObjectError(ID: getDigest(I));

  return Error::success();
}
1449
/// Store an object (data + references) for \p ID.
///
/// Small records go into the datapool; large leaves and oversized records go
/// into standalone files. Storing is idempotent: a concurrent or earlier
/// store of the same ID wins and this call returns success.
Error OnDiskGraphDB::store(ObjectID ID, ArrayRef<ObjectID> Refs,
                           ArrayRef<char> Data) {
  auto I = getIndexProxyFromRef(Ref: getInternalRef(Ref: ID));
  if (LLVM_UNLIKELY(!I))
    return I.takeError();

  // Early return in case the node exists.
  {
    TrieRecord::Data Existing = I->Ref.load();
    if (Existing.SK != TrieRecord::StorageKind::Unknown)
      return Error::success();
  }

  // Big leaf nodes.
  if (Refs.empty() && Data.size() > TrieRecord::MaxEmbeddedSize)
    return createStandaloneLeaf(I&: *I, Data);

  // TODO: Check whether it's worth checking the index for an already existing
  // object (like storeTreeImpl() does) before building up the
  // InternalRefVector.
  InternalRefVector InternalRefs;
  for (ObjectID Ref : Refs)
    InternalRefs.push_back(Ref: getInternalRef(Ref));

  // Create the object.

  DataRecordHandle::Input Input{.Refs: InternalRefs, .Data: Data};

  // Compute the storage kind, allocate it, and create the record.
  TrieRecord::StorageKind SK = TrieRecord::StorageKind::Unknown;
  FileOffset PoolOffset;
  SmallString<256> Path;
  std::optional<MappedTempFile> File;
  std::optional<uint64_t> FileSize;
  // Allocator used when the record must live in its own file.
  auto AllocStandaloneFile = [&](size_t Size) -> Expected<char *> {
    getStandalonePath(Prefix: TrieRecord::getStandaloneFilePrefix(
                          SK: TrieRecord::StorageKind::Standalone),
                      I: *I, Path);
    if (Error E = createTempFile(FinalPath: Path, Size, Logger: Logger.get()).moveInto(Value&: File))
      return std::move(E);
    assert(File->size() == Size);
    FileSize = Size;
    SK = TrieRecord::StorageKind::Standalone;
    return File->data();
  };
  // Primary allocator: prefer the datapool for small records, falling back
  // to a standalone file when the pool reports it is out of space.
  auto Alloc = [&](size_t Size) -> Expected<char *> {
    if (Size <= TrieRecord::MaxEmbeddedSize) {
      SK = TrieRecord::StorageKind::DataPool;
      auto P = DataPool.allocate(Size);
      if (LLVM_UNLIKELY(!P)) {
        char *NewAlloc = nullptr;
        auto NewE = handleErrors(
            E: P.takeError(), Hs: [&](std::unique_ptr<StringError> E) -> Error {
              if (E->convertToErrorCode() == std::errc::not_enough_memory)
                return AllocStandaloneFile(Size).moveInto(Value&: NewAlloc);
              return Error(std::move(E));
            });
        if (!NewE)
          return NewAlloc;
        return std::move(NewE);
      }
      PoolOffset = P->getOffset();
      LLVM_DEBUG({
        dbgs() << "pool-alloc addr=" << (void *)PoolOffset.get()
               << " size=" << Size
               << " end=" << (void *)(PoolOffset.get() + Size) << "\n";
      });
      return (*P)->data();
    }
    return AllocStandaloneFile(Size);
  };

  DataRecordHandle Record;
  if (Error E =
          DataRecordHandle::createWithError(Alloc, I: Input).moveInto(Value&: Record))
    return E;
  assert(Record.getData().end()[0] == 0 && "Expected null-termination");
  assert(Record.getData() == Input.Data && "Expected initialization");
  assert(SK != TrieRecord::StorageKind::Unknown);
  assert(bool(File) != bool(PoolOffset) &&
         "Expected either a mapped file or a pooled offset");

  // Check for a race before calling MappedTempFile::keep().
  //
  // Then decide what to do with the file. Better to discard than overwrite if
  // another thread/process has already added this.
  TrieRecord::Data Existing = I->Ref.load();
  {
    TrieRecord::Data NewObject{.SK: SK, .Offset: PoolOffset};
    if (File) {
      if (Existing.SK == TrieRecord::StorageKind::Unknown) {
        // Keep the file!
        if (Error E = File->keep(Name: Path))
          return E;
      } else {
        File.reset();
      }
    }

    // If we didn't already see a racing/existing write, then try storing the
    // new object. If that races, confirm that the new value has valid storage.
    //
    // TODO: Find a way to reuse the storage from the new-but-abandoned record
    // handle.
    if (Existing.SK == TrieRecord::StorageKind::Unknown) {
      if (I->Ref.compare_exchange_strong(Existing, New: NewObject)) {
        if (FileSize)
          recordStandaloneSizeIncrease(SizeIncrease: *FileSize);
        return Error::success();
      }
    }
  }

  // The CAS failed: another writer won. Its record must have valid storage.
  if (Existing.SK == TrieRecord::StorageKind::Unknown)
    return createCorruptObjectError(ID: getDigest(I: *I));

  // Load existing object.
  return Error::success();
}
1569
1570void OnDiskGraphDB::recordStandaloneSizeIncrease(size_t SizeIncrease) {
1571 standaloneStorageSize().fetch_add(i: SizeIncrease, m: std::memory_order_relaxed);
1572}
1573
1574std::atomic<uint64_t> &OnDiskGraphDB::standaloneStorageSize() const {
1575 MutableArrayRef<uint8_t> UserHeader = DataPool.getUserHeader();
1576 assert(UserHeader.size() == sizeof(std::atomic<uint64_t>));
1577 assert(isAddrAligned(Align(8), UserHeader.data()));
1578 return *reinterpret_cast<std::atomic<uint64_t> *>(UserHeader.data());
1579}
1580
1581uint64_t OnDiskGraphDB::getStandaloneStorageSize() const {
1582 return standaloneStorageSize().load(m: std::memory_order_relaxed);
1583}
1584
1585size_t OnDiskGraphDB::getStorageSize() const {
1586 return Index.size() + DataPool.size() + getStandaloneStorageSize();
1587}
1588
1589unsigned OnDiskGraphDB::getHardStorageLimitUtilization() const {
1590 unsigned IndexPercent = Index.size() * 100ULL / Index.capacity();
1591 unsigned DataPercent = DataPool.size() * 100ULL / DataPool.capacity();
1592 return std::max(a: IndexPercent, b: DataPercent);
1593}
1594
/// Open (creating if necessary) an on-disk graph database at \p AbsPath.
///
/// Sets up the index trie and the datapool allocator, sizing their mappings
/// by platform defaults unless overridden via environment configuration.
/// \p UpstreamDB, if given, is chained for fault-in according to \p Policy.
Expected<std::unique_ptr<OnDiskGraphDB>>
OnDiskGraphDB::open(StringRef AbsPath, StringRef HashName,
                    unsigned HashByteSize, OnDiskGraphDB *UpstreamDB,
                    std::shared_ptr<OnDiskCASLogger> Logger,
                    FaultInPolicy Policy) {
  if (std::error_code EC = sys::fs::create_directories(path: AbsPath))
    return createFileError(F: AbsPath, EC);

  constexpr uint64_t MB = 1024ull * 1024ull;
  constexpr uint64_t GB = 1024ull * 1024ull * 1024ull;

  uint64_t MaxIndexSize = 12 * GB;
  uint64_t MaxDataPoolSize = 24 * GB;

  // Shrink the mappings on platforms/filesystems where large reservations
  // are problematic.
  if (useSmallMappingSize(Path: AbsPath)) {
    MaxIndexSize = 1 * GB;
    MaxDataPoolSize = 2 * GB;
  }

  // An explicit override (when set and non-zero) wins over the defaults.
  auto CustomSize = getOverriddenMaxMappingSize();
  if (!CustomSize)
    return CustomSize.takeError();
  if (*CustomSize)
    MaxIndexSize = MaxDataPoolSize = **CustomSize;

  SmallString<256> IndexPath(AbsPath);
  sys::path::append(path&: IndexPath, a: IndexFilePrefix + CASFormatVersion);
  std::optional<OnDiskTrieRawHashMap> Index;
  if (Error E = OnDiskTrieRawHashMap::create(
                    Path: IndexPath, TrieName: IndexTableName + "[" + HashName + "]",
                    NumHashBits: HashByteSize * CHAR_BIT,
                    /*DataSize=*/sizeof(TrieRecord), MaxFileSize: MaxIndexSize,
                    /*MinFileSize=*/NewFileInitialSize: MB, Logger)
                    .moveInto(Value&: Index))
    return std::move(E);

  // Reserve space in the datapool header for the standalone-storage counter
  // (see standaloneStorageSize()).
  uint32_t UserHeaderSize = sizeof(std::atomic<uint64_t>);

  SmallString<256> DataPoolPath(AbsPath);
  sys::path::append(path&: DataPoolPath, a: DataPoolFilePrefix + CASFormatVersion);
  std::optional<OnDiskDataAllocator> DataPool;
  // The fault-in policy is baked into the table name so databases created
  // with different policies are not mixed.
  StringRef PolicyName =
      Policy == FaultInPolicy::SingleNode ? "single" : "full";
  if (Error E = OnDiskDataAllocator::create(
                    Path: DataPoolPath,
                    TableName: DataPoolTableName + "[" + HashName + "]" + PolicyName,
                    MaxFileSize: MaxDataPoolSize, /*MinFileSize=*/NewFileInitialSize: MB, UserHeaderSize, Logger,
                    UserHeaderInit: [](void *UserHeaderPtr) {
                      new (UserHeaderPtr) std::atomic<uint64_t>(0);
                    })
                    .moveInto(Value&: DataPool))
    return std::move(E);
  if (DataPool->getUserHeader().size() != UserHeaderSize)
    return createStringError(EC: llvm::errc::argument_out_of_domain,
                             S: "unexpected user header in '" + DataPoolPath +
                                 "'");

  return std::unique_ptr<OnDiskGraphDB>(
      new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool),
                        UpstreamDB, Policy, std::move(Logger)));
}
1656
/// Construct the database from already-opened index and datapool tables; use
/// open() to create instances.
OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
                             OnDiskDataAllocator DataPool,
                             OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy,
                             std::shared_ptr<OnDiskCASLogger> Logger)
    : Index(std::move(Index)), DataPool(std::move(DataPool)),
      RootPath(RootPath.str()), UpstreamDB(UpstreamDB), FIPolicy(Policy),
      Logger(std::move(Logger)) {
  /// Lifetime for "big" objects not in DataPool.
  ///
  /// NOTE: Could use ThreadSafeTrieRawHashMap here. For now, doing something
  /// simpler on the assumption there won't be much contention since most data
  /// is not big. If there is contention, and we've already fixed ObjectProxy
  /// object handles to be cheap enough to use consistently, the fix might be
  /// to use better use of them rather than optimizing this map.
  ///
  /// FIXME: Figure out the right number of shards, if any.
  // Stored type-erased as void*; the destructor casts it back to delete it.
  StandaloneData = new StandaloneDataMapTy();
}
1675
1676OnDiskGraphDB::~OnDiskGraphDB() {
1677 delete static_cast<StandaloneDataMapTy *>(StandaloneData);
1678}
1679
/// Import the whole subgraph rooted at \p UpstreamNode into the primary
/// database, bottom-up, using an explicit stack instead of recursion.
Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID,
                                    ObjectHandle UpstreamNode) {
  // Copies the full CAS tree from upstream. Uses depth-first copying to protect
  // against the process dying during importing and leaving the database with an
  // incomplete tree. Note that if the upstream has missing nodes then the tree
  // will be copied with missing nodes as well, it won't be considered an error.
  struct UpstreamCursor {
    ObjectHandle Node;
    size_t RefsCount;
    object_refs_iterator RefI;
    object_refs_iterator RefE;
  };
  /// Keeps track of the state of visitation for current node and all of its
  /// parents.
  SmallVector<UpstreamCursor, 16> CursorStack;
  /// Keeps track of the currently visited nodes as they are imported into
  /// primary database, from current node and its parents. When a node is
  /// entered for visitation it appends its own ID, then appends referenced IDs
  /// as they get imported. When a node is fully imported it removes the
  /// referenced IDs from the bottom of the stack which leaves its own ID at the
  /// bottom, adding to the list of referenced IDs for the parent node.
  SmallVector<ObjectID, 128> PrimaryNodesStack;

  // Push a node for visitation. A nullopt Node means the object is already
  // present locally: only its ID is recorded, no cursor is pushed.
  auto enqueueNode = [&](ObjectID PrimaryID, std::optional<ObjectHandle> Node) {
    PrimaryNodesStack.push_back(Elt: PrimaryID);
    if (!Node)
      return;
    auto Refs = UpstreamDB->getObjectRefs(Node: *Node);
    CursorStack.push_back(
        Elt: {.Node: *Node, .RefsCount: (size_t)llvm::size(Range&: Refs), .RefI: Refs.begin(), .RefE: Refs.end()});
  };

  enqueueNode(PrimaryID, UpstreamNode);

  while (!CursorStack.empty()) {
    UpstreamCursor &Cur = CursorStack.back();
    if (Cur.RefI == Cur.RefE) {
      // All references imported: store this node.
      // Copy the node data into the primary store.
      // FIXME: Use hard-link or cloning if the file-system supports it and data
      // is stored into a separate file.

      // The bottom of \p PrimaryNodesStack contains the primary ID for the
      // current node plus the list of imported referenced IDs.
      assert(PrimaryNodesStack.size() >= Cur.RefsCount + 1);
      ObjectID PrimaryID = *(PrimaryNodesStack.end() - Cur.RefsCount - 1);
      auto PrimaryRefs = ArrayRef(PrimaryNodesStack)
                             .slice(N: PrimaryNodesStack.size() - Cur.RefsCount);
      auto Data = UpstreamDB->getObjectData(Node: Cur.Node);
      if (Error E = store(ID: PrimaryID, Refs: PrimaryRefs, Data))
        return E;
      // Remove the current node and its IDs from the stack.
      PrimaryNodesStack.truncate(N: PrimaryNodesStack.size() - Cur.RefsCount);
      CursorStack.pop_back();
      continue;
    }

    // Visit the next reference of the current node.
    ObjectID UpstreamID = *(Cur.RefI++);
    auto PrimaryID = getReference(Hash: UpstreamDB->getDigest(Ref: UpstreamID));
    if (LLVM_UNLIKELY(!PrimaryID))
      return PrimaryID.takeError();
    if (containsObject(Ref: *PrimaryID, /*CheckUpstream=*/false)) {
      // This \p ObjectID already exists in the primary. Either it was imported
      // via \p importFullTree or the client created it, in which case the
      // client takes responsibility for how it was formed.
      enqueueNode(*PrimaryID, std::nullopt);
      continue;
    }
    Expected<std::optional<ObjectHandle>> UpstreamNode =
        UpstreamDB->load(ExternalRef: UpstreamID);
    if (!UpstreamNode)
      return UpstreamNode.takeError();
    enqueueNode(*PrimaryID, *UpstreamNode);
  }

  // Only the root's ID should remain, and it must match the requested node.
  assert(PrimaryNodesStack.size() == 1);
  assert(PrimaryNodesStack.front() == PrimaryID);
  return Error::success();
}
1758
1759Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID,
1760 ObjectHandle UpstreamNode) {
1761 // Copies only a single node, it doesn't copy the referenced nodes.
1762
1763 // Copy the node data into the primary store.
1764 // FIXME: Use hard-link or cloning if the file-system supports it and data is
1765 // stored into a separate file.
1766 auto Data = UpstreamDB->getObjectData(Node: UpstreamNode);
1767 auto UpstreamRefs = UpstreamDB->getObjectRefs(Node: UpstreamNode);
1768 SmallVector<ObjectID, 64> Refs;
1769 Refs.reserve(N: llvm::size(Range&: UpstreamRefs));
1770 for (ObjectID UpstreamRef : UpstreamRefs) {
1771 auto Ref = getReference(Hash: UpstreamDB->getDigest(Ref: UpstreamRef));
1772 if (LLVM_UNLIKELY(!Ref))
1773 return Ref.takeError();
1774 Refs.push_back(Elt: *Ref);
1775 }
1776
1777 return store(ID: PrimaryID, Refs, Data);
1778}
1779
1780Expected<std::optional<ObjectHandle>>
1781OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) {
1782 if (!UpstreamDB)
1783 return std::nullopt;
1784
1785 auto UpstreamID = UpstreamDB->getReference(Hash: getDigest(Ref: PrimaryID));
1786 if (LLVM_UNLIKELY(!UpstreamID))
1787 return UpstreamID.takeError();
1788
1789 Expected<std::optional<ObjectHandle>> UpstreamNode =
1790 UpstreamDB->load(ExternalRef: *UpstreamID);
1791 if (!UpstreamNode)
1792 return UpstreamNode.takeError();
1793 if (!*UpstreamNode)
1794 return std::nullopt;
1795
1796 if (Error E = FIPolicy == FaultInPolicy::SingleNode
1797 ? importSingleNode(PrimaryID, UpstreamNode: **UpstreamNode)
1798 : importFullTree(PrimaryID, UpstreamNode: **UpstreamNode))
1799 return std::move(E);
1800 return load(ExternalRef: PrimaryID);
1801}
1802