| 1 | //===- llvm/CAS/ObjectStore.h -----------------------------------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file contains the declaration of the ObjectStore class. |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #ifndef LLVM_CAS_OBJECTSTORE_H |
| 15 | #define LLVM_CAS_OBJECTSTORE_H |
| 16 | |
| 17 | #include "llvm/ADT/StringRef.h" |
| 18 | #include "llvm/CAS/CASID.h" |
| 19 | #include "llvm/CAS/CASReference.h" |
| 20 | #include "llvm/Support/Error.h" |
| 21 | #include "llvm/Support/FileSystem.h" |
| 22 | #include <cstddef> |
| 23 | |
| 24 | namespace llvm { |
| 25 | |
| 26 | class MemoryBuffer; |
| 27 | template <typename T> class unique_function; |
| 28 | |
| 29 | namespace cas { |
| 30 | |
| 31 | class ObjectStore; |
| 32 | class ObjectProxy; |
| 33 | |
| 34 | /// Content-addressable storage for objects. |
| 35 | /// |
| 36 | /// Conceptually, objects are stored in a "unique set". |
| 37 | /// |
| 38 | /// - Objects are immutable ("value objects") that are defined by their |
| 39 | /// content. They are implicitly deduplicated by content. |
| 40 | /// - Each object has a unique identifier (UID) that's derived from its content, |
| 41 | /// called a \a CASID. |
| 42 | /// - This UID is a fixed-size (strong) hash of the transitive content of a |
| 43 | /// CAS object. |
| 44 | /// - It's comparable between any two CAS instances that have the same \a |
| 45 | /// CASIDContext::getHashSchemaIdentifier(). |
| 46 | /// - The UID can be printed (e.g., \a CASID::toString()) and it can parsed |
| 47 | /// by the same or a different CAS instance with \a |
| 48 | /// ObjectStore::parseID(). |
| 49 | /// - An object can be looked up by content or by UID. |
| 50 | /// - \a store() is "get-or-create" methods, writing an object if it |
| 51 | /// doesn't exist yet, and return a ref to it in any case. |
| 52 | /// - \a loadObject(const CASID&) looks up an object by its UID. |
| 53 | /// - Objects can reference other objects, forming an arbitrary DAG. |
| 54 | /// |
| 55 | /// The \a ObjectStore interface has a few ways of referencing objects: |
| 56 | /// |
| 57 | /// - \a ObjectRef encapsulates a reference to something in the CAS. It is an |
| 58 | /// opaque type that references an object inside a specific CAS. It is |
| 59 | /// implementation defined if the underlying object exists or not for an |
| 60 | /// ObjectRef, and it can used to speed up CAS lookup as an implementation |
| 61 | /// detail. However, you don't know anything about the underlying objects. |
| 62 | /// "Loading" the object is a separate step that may not have happened |
| 63 | /// yet, and which can fail (e.g. due to filesystem corruption) or introduce |
| 64 | /// latency (if downloading from a remote store). |
| 65 | /// - \a ObjectHandle encapulates a *loaded* object in the CAS. You need one of |
| 66 | /// these to inspect the content of an object: to look at its stored |
| 67 | /// data and references. This is internal to CAS implementation and not |
| 68 | /// availble from CAS public APIs. |
| 69 | /// - \a CASID: the UID for an object in the CAS, obtained through \a |
| 70 | /// ObjectStore::getID() or \a ObjectStore::parseID(). This is a valid CAS |
| 71 | /// identifier, but may reference an object that is unknown to this CAS |
| 72 | /// instance. |
| 73 | /// - \a ObjectProxy pairs an ObjectHandle (subclass) with a ObjectStore, and |
| 74 | /// wraps access APIs to avoid having to pass extra parameters. It is the |
| 75 | /// object used for accessing underlying data and refs by CAS users. |
| 76 | /// |
| 77 | /// Both ObjectRef and ObjectHandle are lightweight, wrapping a `uint64_t` and |
| 78 | /// are only valid with the associated ObjectStore instance. |
| 79 | /// |
| 80 | /// There are a few options for accessing content of objects, with different |
| 81 | /// lifetime tradeoffs: |
| 82 | /// |
| 83 | /// - \a getData() accesses data without exposing lifetime at all. |
| 84 | /// - \a getMemoryBuffer() returns a \a MemoryBuffer whose lifetime |
| 85 | /// is independent of the CAS (it can live longer). |
| 86 | /// - \a getDataString() return StringRef with lifetime is guaranteed to last as |
| 87 | /// long as \a ObjectStore. |
| 88 | /// - \a readRef() and \a forEachRef() iterate through the references in an |
| 89 | /// object. There is no lifetime assumption. |
| 90 | class ObjectStore { |
| 91 | friend class ObjectProxy; |
| 92 | void anchor(); |
| 93 | |
| 94 | public: |
| 95 | /// Get a \p CASID from a \p ID, which should have been generated by \a |
| 96 | /// CASID::print(). This succeeds as long as \a validateID() would pass. The |
| 97 | /// object may be unknown to this CAS instance. |
| 98 | /// |
| 99 | /// TODO: Remove, and update callers to use \a validateID() or \a |
| 100 | /// extractHashFromID(). |
| 101 | virtual Expected<CASID> parseID(StringRef ID) = 0; |
| 102 | |
| 103 | /// Store object into ObjectStore. |
| 104 | virtual Expected<ObjectRef> store(ArrayRef<ObjectRef> Refs, |
| 105 | ArrayRef<char> Data) = 0; |
| 106 | /// Get an ID for \p Ref. |
| 107 | virtual CASID getID(ObjectRef Ref) const = 0; |
| 108 | |
| 109 | /// Get an existing reference to the object called \p ID. |
| 110 | /// |
| 111 | /// Returns \c None if the object is not stored in this CAS. |
| 112 | virtual std::optional<ObjectRef> getReference(const CASID &ID) const = 0; |
| 113 | |
| 114 | /// \returns true if the object is directly available from the local CAS, for |
| 115 | /// implementations that have this kind of distinction. |
| 116 | virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0; |
| 117 | |
| 118 | /// Validate the underlying object referred by CASID. |
| 119 | virtual Error validateObject(const CASID &ID) = 0; |
| 120 | |
| 121 | /// Validate the entire ObjectStore. |
| 122 | virtual Error validate(bool CheckHash) const = 0; |
| 123 | |
| 124 | protected: |
| 125 | /// Load the object referenced by \p Ref. |
| 126 | /// |
| 127 | /// Errors if the object cannot be loaded. |
| 128 | /// \returns \c std::nullopt if the object is missing from the CAS. |
| 129 | virtual Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) = 0; |
| 130 | |
| 131 | /// Like \c loadIfExists but returns an error if the object is missing. |
| 132 | Expected<ObjectHandle> load(ObjectRef Ref); |
| 133 | |
| 134 | /// Get the size of some data. |
| 135 | virtual uint64_t getDataSize(ObjectHandle Node) const = 0; |
| 136 | |
| 137 | /// Methods for handling objects. CAS implementations need to override to |
| 138 | /// provide functions to access stored CAS objects and references. |
| 139 | virtual Error forEachRef(ObjectHandle Node, |
| 140 | function_ref<Error(ObjectRef)> Callback) const = 0; |
| 141 | virtual ObjectRef readRef(ObjectHandle Node, size_t I) const = 0; |
| 142 | virtual size_t getNumRefs(ObjectHandle Node) const = 0; |
| 143 | virtual ArrayRef<char> getData(ObjectHandle Node, |
| 144 | bool RequiresNullTerminator = false) const = 0; |
| 145 | |
| 146 | /// Get ObjectRef from open file. |
| 147 | virtual Expected<ObjectRef> |
| 148 | storeFromOpenFileImpl(sys::fs::file_t FD, |
| 149 | std::optional<sys::fs::file_status> Status); |
| 150 | |
| 151 | /// Get a lifetime-extended StringRef pointing at \p Data. |
| 152 | /// |
| 153 | /// Depending on the CAS implementation, this may involve in-memory storage |
| 154 | /// overhead. |
| 155 | StringRef getDataString(ObjectHandle Node) { |
| 156 | return toStringRef(Input: getData(Node)); |
| 157 | } |
| 158 | |
| 159 | /// Get a lifetime-extended MemoryBuffer pointing at \p Data. |
| 160 | /// |
| 161 | /// Depending on the CAS implementation, this may involve in-memory storage |
| 162 | /// overhead. |
| 163 | std::unique_ptr<MemoryBuffer> |
| 164 | getMemoryBuffer(ObjectHandle Node, StringRef Name = "" , |
| 165 | bool RequiresNullTerminator = true); |
| 166 | |
| 167 | /// Read all the refs from object in a SmallVector. |
| 168 | virtual void readRefs(ObjectHandle Node, |
| 169 | SmallVectorImpl<ObjectRef> &Refs) const; |
| 170 | |
| 171 | /// Allow ObjectStore implementations to create internal handles. |
| 172 | #define MAKE_CAS_HANDLE_CONSTRUCTOR(HandleKind) \ |
| 173 | HandleKind make##HandleKind(uint64_t InternalRef) const { \ |
| 174 | return HandleKind(*this, InternalRef); \ |
| 175 | } |
| 176 | MAKE_CAS_HANDLE_CONSTRUCTOR(ObjectHandle) |
| 177 | MAKE_CAS_HANDLE_CONSTRUCTOR(ObjectRef) |
| 178 | #undef MAKE_CAS_HANDLE_CONSTRUCTOR |
| 179 | |
| 180 | public: |
| 181 | /// Helper functions to store object and returns a ObjectProxy. |
| 182 | LLVM_ABI_FOR_TEST Expected<ObjectProxy> createProxy(ArrayRef<ObjectRef> Refs, |
| 183 | StringRef Data); |
| 184 | |
| 185 | /// Store object from StringRef. |
| 186 | Expected<ObjectRef> storeFromString(ArrayRef<ObjectRef> Refs, |
| 187 | StringRef String) { |
| 188 | return store(Refs, Data: arrayRefFromStringRef<char>(Input: String)); |
| 189 | } |
| 190 | |
| 191 | /// Default implementation reads \p FD and calls \a storeNode(). Does not |
| 192 | /// take ownership of \p FD; the caller is responsible for closing it. |
| 193 | /// |
| 194 | /// If \p Status is sent in it is to be treated as a hint. Implementations |
| 195 | /// must protect against the file size potentially growing after the status |
| 196 | /// was taken (i.e., they cannot assume that an mmap will be null-terminated |
| 197 | /// where \p Status implies). |
| 198 | /// |
| 199 | /// Returns the \a CASID and the size of the file. |
| 200 | Expected<ObjectRef> |
| 201 | storeFromOpenFile(sys::fs::file_t FD, |
| 202 | std::optional<sys::fs::file_status> Status = std::nullopt) { |
| 203 | return storeFromOpenFileImpl(FD, Status); |
| 204 | } |
| 205 | |
| 206 | static Error createUnknownObjectError(const CASID &ID); |
| 207 | |
| 208 | /// Create ObjectProxy from CASID. If the object doesn't exist, get an error. |
| 209 | LLVM_ABI Expected<ObjectProxy> getProxy(const CASID &ID); |
| 210 | /// Create ObjectProxy from ObjectRef. If the object can't be loaded, get an |
| 211 | /// error. |
| 212 | LLVM_ABI Expected<ObjectProxy> getProxy(ObjectRef Ref); |
| 213 | |
| 214 | /// \returns \c std::nullopt if the object is missing from the CAS. |
| 215 | Expected<std::optional<ObjectProxy>> getProxyIfExists(ObjectRef Ref); |
| 216 | |
| 217 | /// Read the data from \p Data into \p OS. |
| 218 | uint64_t readData(ObjectHandle Node, raw_ostream &OS, uint64_t Offset = 0, |
| 219 | uint64_t MaxBytes = -1ULL) const { |
| 220 | ArrayRef<char> Data = getData(Node); |
| 221 | assert(Offset < Data.size() && "Expected valid offset" ); |
| 222 | Data = Data.drop_front(N: Offset).take_front(N: MaxBytes); |
| 223 | OS << toStringRef(Input: Data); |
| 224 | return Data.size(); |
| 225 | } |
| 226 | |
| 227 | /// Set the size for limiting growth of on-disk storage. This has an effect |
| 228 | /// for when the instance is closed. |
| 229 | /// |
| 230 | /// Implementations may leave this unimplemented. |
| 231 | virtual Error setSizeLimit(std::optional<uint64_t> SizeLimit) { |
| 232 | return Error::success(); |
| 233 | } |
| 234 | |
| 235 | /// \returns the storage size of the on-disk CAS data. |
| 236 | /// |
| 237 | /// Implementations that don't have an implementation for this should return |
| 238 | /// \p std::nullopt. |
| 239 | virtual Expected<std::optional<uint64_t>> getStorageSize() const { |
| 240 | return std::nullopt; |
| 241 | } |
| 242 | |
| 243 | /// Prune local storage to reduce its size according to the desired size |
| 244 | /// limit. Pruning can happen concurrently with other operations. |
| 245 | /// |
| 246 | /// Implementations may leave this unimplemented. |
| 247 | virtual Error pruneStorageData() { return Error::success(); } |
| 248 | |
| 249 | /// Validate the whole node tree. |
| 250 | Error validateTree(ObjectRef Ref); |
| 251 | |
| 252 | /// Import object from another CAS. This will import the full tree from the |
| 253 | /// other CAS. |
| 254 | LLVM_ABI Expected<ObjectRef> importObject(ObjectStore &Upstream, |
| 255 | ObjectRef Other); |
| 256 | |
| 257 | /// Print the ObjectStore internals for debugging purpose. |
| 258 | virtual void print(raw_ostream &) const {} |
| 259 | void dump() const; |
| 260 | |
| 261 | /// Get CASContext |
| 262 | const CASContext &getContext() const { return Context; } |
| 263 | |
| 264 | virtual ~ObjectStore() = default; |
| 265 | |
| 266 | protected: |
| 267 | ObjectStore(const CASContext &Context) : Context(Context) {} |
| 268 | |
| 269 | private: |
| 270 | const CASContext &Context; |
| 271 | }; |
| 272 | |
| 273 | /// Reference to an abstract hierarchical node, with data and references. |
| 274 | /// Reference is passed by value and is expected to be valid as long as the \a |
| 275 | /// ObjectStore is. |
| 276 | class ObjectProxy { |
| 277 | public: |
| 278 | ObjectStore &getCAS() const { return *CAS; } |
| 279 | CASID getID() const { return CAS->getID(Ref); } |
| 280 | ObjectRef getRef() const { return Ref; } |
| 281 | size_t getNumReferences() const { return CAS->getNumRefs(Node: H); } |
| 282 | ObjectRef getReference(size_t I) const { return CAS->readRef(Node: H, I); } |
| 283 | |
| 284 | operator CASID() const { return getID(); } |
| 285 | CASID getReferenceID(size_t I) const { |
| 286 | std::optional<CASID> ID = getCAS().getID(Ref: getReference(I)); |
| 287 | assert(ID && "Expected reference to be first-class object" ); |
| 288 | return *ID; |
| 289 | } |
| 290 | |
| 291 | /// Visit each reference in order, returning an error from \p Callback to |
| 292 | /// stop early. |
| 293 | Error forEachReference(function_ref<Error(ObjectRef)> Callback) const { |
| 294 | return CAS->forEachRef(Node: H, Callback); |
| 295 | } |
| 296 | |
| 297 | std::unique_ptr<MemoryBuffer> |
| 298 | getMemoryBuffer(StringRef Name = "" , |
| 299 | bool RequiresNullTerminator = true) const; |
| 300 | |
| 301 | /// Get the content of the node. Valid as long as the CAS is valid. |
| 302 | StringRef getData() const { return CAS->getDataString(Node: H); } |
| 303 | |
| 304 | friend bool operator==(const ObjectProxy &Proxy, ObjectRef Ref) { |
| 305 | return Proxy.getRef() == Ref; |
| 306 | } |
| 307 | friend bool operator==(ObjectRef Ref, const ObjectProxy &Proxy) { |
| 308 | return Proxy.getRef() == Ref; |
| 309 | } |
| 310 | friend bool operator!=(const ObjectProxy &Proxy, ObjectRef Ref) { |
| 311 | return !(Proxy.getRef() == Ref); |
| 312 | } |
| 313 | friend bool operator!=(ObjectRef Ref, const ObjectProxy &Proxy) { |
| 314 | return !(Proxy.getRef() == Ref); |
| 315 | } |
| 316 | |
| 317 | public: |
| 318 | ObjectProxy() = delete; |
| 319 | |
| 320 | static ObjectProxy load(ObjectStore &CAS, ObjectRef Ref, ObjectHandle Node) { |
| 321 | return ObjectProxy(CAS, Ref, Node); |
| 322 | } |
| 323 | |
| 324 | private: |
| 325 | ObjectProxy(ObjectStore &CAS, ObjectRef Ref, ObjectHandle H) |
| 326 | : CAS(&CAS), Ref(Ref), H(H) {} |
| 327 | |
| 328 | ObjectStore *CAS; |
| 329 | ObjectRef Ref; |
| 330 | ObjectHandle H; |
| 331 | }; |
| 332 | |
| 333 | /// Create an in memory CAS. |
| 334 | LLVM_ABI std::unique_ptr<ObjectStore> createInMemoryCAS(); |
| 335 | |
| 336 | /// \returns true if \c LLVM_ENABLE_ONDISK_CAS configuration was enabled. |
| 337 | bool isOnDiskCASEnabled(); |
| 338 | |
| 339 | /// Create a persistent on-disk path at \p Path. |
| 340 | LLVM_ABI Expected<std::unique_ptr<ObjectStore>> |
| 341 | createOnDiskCAS(const Twine &Path); |
| 342 | |
| 343 | } // namespace cas |
| 344 | } // namespace llvm |
| 345 | |
| 346 | #endif // LLVM_CAS_OBJECTSTORE_H |
| 347 | |