1//===- llvm/CAS/ObjectStore.h -----------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains the declaration of the ObjectStore class.
11///
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CAS_OBJECTSTORE_H
15#define LLVM_CAS_OBJECTSTORE_H
16
17#include "llvm/ADT/StringRef.h"
18#include "llvm/CAS/CASID.h"
19#include "llvm/CAS/CASReference.h"
20#include "llvm/Support/Error.h"
21#include "llvm/Support/FileSystem.h"
22#include <cstddef>
23
24namespace llvm {
25
26class MemoryBuffer;
27template <typename T> class unique_function;
28
29namespace cas {
30
31class ObjectStore;
32class ObjectProxy;
33
34/// Content-addressable storage for objects.
35///
36/// Conceptually, objects are stored in a "unique set".
37///
38/// - Objects are immutable ("value objects") that are defined by their
39/// content. They are implicitly deduplicated by content.
40/// - Each object has a unique identifier (UID) that's derived from its content,
41/// called a \a CASID.
42/// - This UID is a fixed-size (strong) hash of the transitive content of a
43/// CAS object.
44/// - It's comparable between any two CAS instances that have the same \a
45/// CASIDContext::getHashSchemaIdentifier().
46/// - The UID can be printed (e.g., \a CASID::toString()) and it can parsed
47/// by the same or a different CAS instance with \a
48/// ObjectStore::parseID().
49/// - An object can be looked up by content or by UID.
50/// - \a store() is "get-or-create" methods, writing an object if it
51/// doesn't exist yet, and return a ref to it in any case.
52/// - \a loadObject(const CASID&) looks up an object by its UID.
53/// - Objects can reference other objects, forming an arbitrary DAG.
54///
55/// The \a ObjectStore interface has a few ways of referencing objects:
56///
57/// - \a ObjectRef encapsulates a reference to something in the CAS. It is an
58/// opaque type that references an object inside a specific CAS. It is
59/// implementation defined if the underlying object exists or not for an
60/// ObjectRef, and it can used to speed up CAS lookup as an implementation
61/// detail. However, you don't know anything about the underlying objects.
62/// "Loading" the object is a separate step that may not have happened
63/// yet, and which can fail (e.g. due to filesystem corruption) or introduce
64/// latency (if downloading from a remote store).
65/// - \a ObjectHandle encapulates a *loaded* object in the CAS. You need one of
66/// these to inspect the content of an object: to look at its stored
67/// data and references. This is internal to CAS implementation and not
68/// availble from CAS public APIs.
69/// - \a CASID: the UID for an object in the CAS, obtained through \a
70/// ObjectStore::getID() or \a ObjectStore::parseID(). This is a valid CAS
71/// identifier, but may reference an object that is unknown to this CAS
72/// instance.
73/// - \a ObjectProxy pairs an ObjectHandle (subclass) with a ObjectStore, and
74/// wraps access APIs to avoid having to pass extra parameters. It is the
75/// object used for accessing underlying data and refs by CAS users.
76///
77/// Both ObjectRef and ObjectHandle are lightweight, wrapping a `uint64_t` and
78/// are only valid with the associated ObjectStore instance.
79///
80/// There are a few options for accessing content of objects, with different
81/// lifetime tradeoffs:
82///
83/// - \a getData() accesses data without exposing lifetime at all.
84/// - \a getMemoryBuffer() returns a \a MemoryBuffer whose lifetime
85/// is independent of the CAS (it can live longer).
86/// - \a getDataString() return StringRef with lifetime is guaranteed to last as
87/// long as \a ObjectStore.
88/// - \a readRef() and \a forEachRef() iterate through the references in an
89/// object. There is no lifetime assumption.
90class ObjectStore {
91 friend class ObjectProxy;
92 void anchor();
93
94public:
95 /// Get a \p CASID from a \p ID, which should have been generated by \a
96 /// CASID::print(). This succeeds as long as \a validateID() would pass. The
97 /// object may be unknown to this CAS instance.
98 ///
99 /// TODO: Remove, and update callers to use \a validateID() or \a
100 /// extractHashFromID().
101 virtual Expected<CASID> parseID(StringRef ID) = 0;
102
103 /// Store object into ObjectStore.
104 virtual Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs,
105 ArrayRef<char> Data) = 0;
106 /// Get an ID for \p Ref.
107 virtual CASID getID(ObjectRef Ref) const = 0;
108
109 /// Stores the data of a file into ObjectStore.
110 ///
111 /// An underlying implementation could perform optimizations that reduce I/O
112 /// and disk space consumption.
113 ///
114 /// If there are any concurrent modifications to the file, the contents in the
115 /// CAS may be corrupt.
116 ///
117 /// \param FilePath the path of the file data.
118 virtual Expected<ObjectRef> storeFromFile(StringRef Path);
119
120 /// Exports the data of an object to a file path. It does not include any
121 /// references of the object.
122 ///
123 /// An underlying implementation could perform optimizations that reduce I/O
124 /// and disk space consumption.
125 ///
126 /// \param Node the object to read data from.
127 /// \param FilePath the path of the file data.
128 virtual Error exportDataToFile(ObjectHandle Node, StringRef Path) const;
129
130 /// Get an existing reference to the object called \p ID.
131 ///
132 /// Returns \c None if the object is not stored in this CAS.
133 virtual std::optional<ObjectRef> getReference(const CASID &ID) const = 0;
134
135 /// \returns true if the object is directly available from the local CAS, for
136 /// implementations that have this kind of distinction.
137 virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0;
138
139 /// Validate the underlying object referred by CASID.
140 virtual Error validateObject(const CASID &ID) = 0;
141
142 /// Validate the entire ObjectStore.
143 virtual Error validate(bool CheckHash) const = 0;
144
145protected:
146 /// Load the object referenced by \p Ref.
147 ///
148 /// Errors if the object cannot be loaded.
149 /// \returns \c std::nullopt if the object is missing from the CAS.
150 virtual Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) = 0;
151
152 /// Like \c loadIfExists but returns an error if the object is missing.
153 Expected<ObjectHandle> load(ObjectRef Ref);
154
155 /// Get the size of some data.
156 virtual uint64_t getDataSize(ObjectHandle Node) const = 0;
157
158 /// Methods for handling objects. CAS implementations need to override to
159 /// provide functions to access stored CAS objects and references.
160 virtual Error forEachRef(ObjectHandle Node,
161 function_ref<Error(ObjectRef)> Callback) const = 0;
162 virtual ObjectRef readRef(ObjectHandle Node, size_t I) const = 0;
163 virtual size_t getNumRefs(ObjectHandle Node) const = 0;
164 virtual ArrayRef<char> getData(ObjectHandle Node,
165 bool RequiresNullTerminator = false) const = 0;
166
167 /// Get ObjectRef from open file.
168 virtual Expected<ObjectRef>
169 storeFromOpenFileImpl(sys::fs::file_t FD,
170 std::optional<sys::fs::file_status> Status);
171
172 /// Get a lifetime-extended StringRef pointing at \p Data.
173 ///
174 /// Depending on the CAS implementation, this may involve in-memory storage
175 /// overhead.
176 StringRef getDataString(ObjectHandle Node) {
177 return toStringRef(Input: getData(Node));
178 }
179
180 /// Get a lifetime-extended MemoryBuffer pointing at \p Data.
181 ///
182 /// Depending on the CAS implementation, this may involve in-memory storage
183 /// overhead.
184 std::unique_ptr<MemoryBuffer>
185 getMemoryBuffer(ObjectHandle Node, StringRef Name = "",
186 bool RequiresNullTerminator = true);
187
188 /// Read all the refs from object in a SmallVector.
189 virtual void readRefs(ObjectHandle Node,
190 SmallVectorImpl<ObjectRef> &Refs) const;
191
192 /// Allow ObjectStore implementations to create internal handles.
193#define MAKE_CAS_HANDLE_CONSTRUCTOR(HandleKind) \
194 HandleKind make##HandleKind(uint64_t InternalRef) const { \
195 return HandleKind(*this, InternalRef); \
196 }
197 MAKE_CAS_HANDLE_CONSTRUCTOR(ObjectHandle)
198 MAKE_CAS_HANDLE_CONSTRUCTOR(ObjectRef)
199#undef MAKE_CAS_HANDLE_CONSTRUCTOR
200
201public:
202 /// Helper functions to store object and returns a ObjectProxy.
203 LLVM_ABI_FOR_TEST Expected<ObjectProxy> createProxy(ArrayRef<ObjectRef> Refs,
204 StringRef Data);
205
206 /// Store object from StringRef.
207 Expected<ObjectRef> storeFromString(ArrayRef<ObjectRef> Refs,
208 StringRef String) {
209 return store(Refs, Data: arrayRefFromStringRef<char>(Input: String));
210 }
211
212 /// Default implementation reads \p FD and calls \a storeNode(). Does not
213 /// take ownership of \p FD; the caller is responsible for closing it.
214 ///
215 /// If \p Status is sent in it is to be treated as a hint. Implementations
216 /// must protect against the file size potentially growing after the status
217 /// was taken (i.e., they cannot assume that an mmap will be null-terminated
218 /// where \p Status implies).
219 ///
220 /// Returns the \a CASID and the size of the file.
221 Expected<ObjectRef>
222 storeFromOpenFile(sys::fs::file_t FD,
223 std::optional<sys::fs::file_status> Status = std::nullopt) {
224 return storeFromOpenFileImpl(FD, Status);
225 }
226
227 static Error createUnknownObjectError(const CASID &ID);
228
229 /// Create ObjectProxy from CASID. If the object doesn't exist, get an error.
230 LLVM_ABI Expected<ObjectProxy> getProxy(const CASID &ID);
231 /// Create ObjectProxy from ObjectRef. If the object can't be loaded, get an
232 /// error.
233 LLVM_ABI Expected<ObjectProxy> getProxy(ObjectRef Ref);
234
235 /// \returns \c std::nullopt if the object is missing from the CAS.
236 Expected<std::optional<ObjectProxy>> getProxyIfExists(ObjectRef Ref);
237
238 /// Read the data from \p Data into \p OS.
239 uint64_t readData(ObjectHandle Node, raw_ostream &OS, uint64_t Offset = 0,
240 uint64_t MaxBytes = -1ULL) const {
241 ArrayRef<char> Data = getData(Node);
242 assert(Offset < Data.size() && "Expected valid offset");
243 Data = Data.drop_front(N: Offset).take_front(N: MaxBytes);
244 OS << toStringRef(Input: Data);
245 return Data.size();
246 }
247
248 /// Set the size for limiting growth of on-disk storage. This has an effect
249 /// for when the instance is closed.
250 ///
251 /// Implementations may leave this unimplemented.
252 virtual Error setSizeLimit(std::optional<uint64_t> SizeLimit) {
253 return Error::success();
254 }
255
256 /// \returns the storage size of the on-disk CAS data.
257 ///
258 /// Implementations that don't have an implementation for this should return
259 /// \p std::nullopt.
260 virtual Expected<std::optional<uint64_t>> getStorageSize() const {
261 return std::nullopt;
262 }
263
264 /// Prune local storage to reduce its size according to the desired size
265 /// limit. Pruning can happen concurrently with other operations.
266 ///
267 /// Implementations may leave this unimplemented.
268 virtual Error pruneStorageData() { return Error::success(); }
269
270 /// Validate the whole node tree.
271 Error validateTree(ObjectRef Ref);
272
273 /// Import object from another CAS. This will import the full tree from the
274 /// other CAS.
275 LLVM_ABI Expected<ObjectRef> importObject(ObjectStore &Upstream,
276 ObjectRef Other);
277
278 /// Print the ObjectStore internals for debugging purpose.
279 virtual void print(raw_ostream &) const {}
280 void dump() const;
281
282 /// Get CASContext
283 const CASContext &getContext() const { return Context; }
284
285 virtual ~ObjectStore() = default;
286
287protected:
288 ObjectStore(const CASContext &Context) : Context(Context) {}
289
290private:
291 const CASContext &Context;
292};
293
294/// Reference to an abstract hierarchical node, with data and references.
295/// Reference is passed by value and is expected to be valid as long as the \a
296/// ObjectStore is.
297class ObjectProxy {
298public:
299 ObjectStore &getCAS() const { return *CAS; }
300 CASID getID() const { return CAS->getID(Ref); }
301 ObjectRef getRef() const { return Ref; }
302 size_t getNumReferences() const { return CAS->getNumRefs(Node: H); }
303 ObjectRef getReference(size_t I) const { return CAS->readRef(Node: H, I); }
304
305 operator CASID() const { return getID(); }
306 CASID getReferenceID(size_t I) const {
307 std::optional<CASID> ID = getCAS().getID(Ref: getReference(I));
308 assert(ID && "Expected reference to be first-class object");
309 return *ID;
310 }
311
312 /// Visit each reference in order, returning an error from \p Callback to
313 /// stop early.
314 Error forEachReference(function_ref<Error(ObjectRef)> Callback) const {
315 return CAS->forEachRef(Node: H, Callback);
316 }
317
318 std::unique_ptr<MemoryBuffer>
319 getMemoryBuffer(StringRef Name = "",
320 bool RequiresNullTerminator = true) const;
321
322 /// Get the content of the node. Valid as long as the CAS is valid.
323 StringRef getData() const { return CAS->getDataString(Node: H); }
324
325 /// Exports the data of an object to a file path.
326 Error exportDataToFile(StringRef Path) const {
327 return CAS->exportDataToFile(Node: H, Path);
328 }
329
330 friend bool operator==(const ObjectProxy &Proxy, ObjectRef Ref) {
331 return Proxy.getRef() == Ref;
332 }
333 friend bool operator==(ObjectRef Ref, const ObjectProxy &Proxy) {
334 return Proxy.getRef() == Ref;
335 }
336 friend bool operator!=(const ObjectProxy &Proxy, ObjectRef Ref) {
337 return !(Proxy.getRef() == Ref);
338 }
339 friend bool operator!=(ObjectRef Ref, const ObjectProxy &Proxy) {
340 return !(Proxy.getRef() == Ref);
341 }
342
343public:
344 ObjectProxy() = delete;
345
346 static ObjectProxy load(ObjectStore &CAS, ObjectRef Ref, ObjectHandle Node) {
347 return ObjectProxy(CAS, Ref, Node);
348 }
349
350private:
351 ObjectProxy(ObjectStore &CAS, ObjectRef Ref, ObjectHandle H)
352 : CAS(&CAS), Ref(Ref), H(H) {}
353
354 ObjectStore *CAS;
355 ObjectRef Ref;
356 ObjectHandle H;
357};
358
359/// Create an in memory CAS.
360LLVM_ABI std::unique_ptr<ObjectStore> createInMemoryCAS();
361
362/// \returns true if \c LLVM_ENABLE_ONDISK_CAS configuration was enabled.
363bool isOnDiskCASEnabled();
364
365/// Create a persistent on-disk path at \p Path.
366LLVM_ABI Expected<std::unique_ptr<ObjectStore>>
367createOnDiskCAS(const Twine &Path);
368
369} // namespace cas
370} // namespace llvm
371
372#endif // LLVM_CAS_OBJECTSTORE_H
373